Skip to content

Commit

Permalink
Merge pull request #23 from mskilab-org/sb_dev
Browse files Browse the repository at this point in the history
fixed rebinning problem + added masking
  • Loading branch information
sebastian-brylka authored Nov 21, 2023
2 parents 16dec5b + 6cd8f9b commit ff1e15f
Show file tree
Hide file tree
Showing 8 changed files with 104 additions and 113 deletions.
149 changes: 92 additions & 57 deletions R/dryclean.R
Original file line number Diff line number Diff line change
Expand Up @@ -59,41 +59,31 @@ dryclean <- R6::R6Class("dryclean",
#'
#' @param mc.cores interger (default == 1) number of cores to use for parallelization
#'
#' @param whole_genome boolean (default = TRUE) for this function always set this parameter to TRUE
#'
#' @param use.blacklist boolean (default = FALSE) whether to exclude off-target markers in case of Exomes or targeted sequencing. If set to TRUE, needs a GRange marking if each marker is set to be excluded or not
#' @param use.blacklist boolean (default = FALSE) whether to exclude off-target markers in case of Exomes or targeted sequencing. If set to TRUE, needs a GRange marking if each marker is set to be excluded or not or will use a default mask
#'
#' @param blacklist_path character (default = NA) if use.blacklist == TRUE, path a GRanges object marking if each marker is set to be excluded or not
#'
#' @param germline.filter boolean (default == FALSE) if germline markers need to be removed from decomposition
#'
#' @param germline.file character (default == NA) path to file with germline markers
#'
#' @param verbose boolean (default == TRUE) outputs progress
#'
#' @param is.human boolean (default == TRUE) organism type
#'
#' @param field character (default == "reads.corrected") field to use for processing
#'
#' @param all.chr list (default = c(as.character(1:22), "X")) list of chromosomes
#'
#' @param testing boolean (default = FALSE) DO NOT CHANGE
#'
#' @return Drycleaned coverage or drycleaned coverage after CBS in GRanges format
#' @return Drycleaned coverage in GRanges format

clean = function(cov, centered = FALSE, cbs = FALSE, cnsignif = 1e-5, mc.cores = 1, verbose = TRUE, whole_genome = TRUE, use.blacklist = FALSE, blacklist_path = NA, germline.filter = FALSE, germline.file = NA, field = "reads.corrected", is.human = TRUE, all.chr = c(as.character(1:22), "X"), testing = FALSE){
clean = function(cov, centered = FALSE, cbs = FALSE, cnsignif = 1e-5, mc.cores = 1, verbose = TRUE, use.blacklist = FALSE, blacklist_path = NA, germline.filter = FALSE, field = "reads.corrected", testing = FALSE){

message("Loading coverage")
private$history <- rbindlist(list(private$history, data.table(action = paste("Loaded coverage from", cov), date = as.character(Sys.time()))))
cov = readRDS(cov)

#detergent.pon.path = private$pon.path
cov <- cov %>% gr2dt() %>% filter(seqnames != 'Y') %>% dt2gr()

if(verbose == TRUE){
message("Loading PON a.k.a detergent")
}

#rpca.1 = readRDS(detergent.pon.path)

private$history <- rbindlist(list(private$history, data.table(action = "Loaded PON", date = as.character(Sys.time()))))

Expand All @@ -103,36 +93,35 @@ dryclean <- R6::R6Class("dryclean",
if (tumor.binsize != pon.binsize & testing == FALSE){
message(paste0("WARNING: Input tumor bin size = ", tumor.binsize,"bp. PON bin size = ", pon.binsize,"bp. Rebinning tumor to bin size of PON..."))
private$history <- rbindlist(list(private$history, data.table(action = paste("Rebinning tumor to", pon.binsize, "bp bin size"), date = as.character(Sys.time()))))
cov = collapse_cov(cov, bin.size = pon.binsize, this.field = field)
suppressWarnings({
cov = gr.val(query = private$pon$get_template(), cov, val = field)
})
}

tumor.length <- cov %>%
gr2dt() %>%
dplyr::filter(seqnames != "Y") %>%
dt2gr() %>%
length()

pon.length <- private$pon$get_template() %>%
gr2dt() %>%
dplyr::filter(seqnames != "Y") %>%
dt2gr() %>%
length()

if (tumor.length != pon.length ) {
dt1 = data.table(chr = names(seqlengths(cov)), length_coverage = seqlengths(cov))
dt2 = data.table(chr = names(private$pon$get_seqlengths()), length_pon = private$pon$get_seqlengths())
dt_mismatch = merge(dt1, dt2, by = "chr")
dt_mismatch <- dt_mismatch[order(match(dt_mismatch$chr, c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
"11", "12", "13", "14", "15", "16", "17", "18", "19", "20",
"21", "22", "X", "Y")))]
dt_mismatch <- dt_mismatch %>%
filter(length_coverage != length_pon)
private$dt_mismatch = dt_mismatch
if(testing == FALSE){
stop("ERROR: seqlengths of coverage and PON do not match. Use get_mismatch() function to see mismatching seqlengths")
}

if (length(cov) != pon.length & testing == FALSE) {
dt_mismatch = data.table(chr = c(), coverage = c(), pon = c())
for(chr in c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "\
X")){
dt_mismatch <- rbind(dt_mismatch,
data.table(
chr = chr,
coverage = cov %>% gr2dt() %>% filter(seqnames == chr) %>% nrow(),
pon = private$pon$get_template() %>% gr2dt() %>% filter(seqnames == chr) %>% nrow()
))
}
dt_mismatch <- dt_mismatch %>% filter(coverage != pon)
private$dt_mismatch = dt_mismatch
if(testing == FALSE){
stop("ERROR: Number of bins of coverage and PON does not match. Use get_mismatch() function to see mismatched chromosomes")
}

}

if(verbose == TRUE){
message(paste0("Let's begin, this is whole exome/genome"))
}
Expand All @@ -143,38 +132,64 @@ dryclean <- R6::R6Class("dryclean",
stop("If germiline.filter is set to TRUE, pon must have a inf_germ element, see prepare_detergent for details")
}

#all.chr = c(as.character(1:22), "X")
all.chr = c(as.character(1:22), "X")

is.chr = FALSE

if (is.human){
if(any(grepl("chr", as.character(seqnames(cov))))){
cov = gr.sub(cov)
is.chr = TRUE
}
#cov = cov %Q% (seqnames %in% all.chr)
if(any(grepl("chr", as.character(seqnames(cov))))){
cov = gr.sub(cov)
is.chr = TRUE
}

local.all.chr = all.chr
cov = cov %Q% (seqnames %in% local.all.chr)
cov = cov[, field] %>% gr2dt() %>% setnames(., field, "signal")

cov <- cov %>%
dplyr::mutate(signal = ifelse(is.na(signal), 0, signal)) %>%
dplyr::mutate(signal = ifelse(is.infinite(signal), NA, signal))

if(centered == FALSE){
message("Centering the sample")
private$history <- rbindlist(list(private$history, data.table(action = paste("Mean-normalization of coverage"), date = as.character(Sys.time()))))
cov <- cov %>%
dplyr::mutate(signal = ifelse(is.na(signal), 0, signal)) %>%
dplyr::mutate(signal = ifelse(is.infinite(signal), NA, signal)) %>%
dplyr::mutate(signal = signal / mean(signal))
dplyr::mutate(signal = signal / median(signal))
}

cov = cov %>% dt2gr()

cov = sortSeqlevels(cov)
cov = sort(cov)

m.vec = prep_cov(cov, blacklist = use.blacklist)
if(use.blacklist == TRUE ){
if((is.na(blacklist_path) | blacklist_path == "NA")){
blacklist_path = system.file("extdata", "blacklist_A.rds", package = 'dryclean')
message(paste0("Applying the default mask to the coverage"))
private$history <- rbindlist(list(private$history, data.table(action = paste("Applying the defualt mask to coverage"), date = as.character(Sys.time()))))
}else{
blacklist_path = blacklist_path
message(paste0("Applying the provided mask to the coverage"))
private$history <- rbindlist(list(private$history, data.table(action = paste("Applying the provided mask to coverage"), date = as.character(Sys.time()))))
}

suppressWarnings({
blacklist_pon <- gUtils::gr.val(query = private$pon$get_template(),
target = readRDS(blacklist_path),
val = "blacklisted",
na.rm = TRUE)$blacklisted
blacklist_pon <- ifelse(blacklist_pon == 1, TRUE, FALSE)


blacklist_cov <- gUtils::gr.val(query = cov,
target = readRDS(blacklist_path),
val = "blacklisted",
na.rm = TRUE)$blacklisted
blacklist_cov <- ifelse(blacklist_cov == 1, TRUE, FALSE)
})

}

m.vec = prep_cov(cov, use.blacklist = use.blacklist, blacklist = blacklist_cov)

m.vec = as.matrix(m.vec$signal)
L.burnin = private$pon$get_L()
Expand All @@ -188,6 +203,13 @@ dryclean <- R6::R6Class("dryclean",
message("Initializing wash cycle")
}

if (use.blacklist){
blacklisted <- !blacklist_pon
L.burnin = L.burnin[blacklisted,]
S.burnin = S.burnin[blacklisted,]
U.hat = U.hat[blacklisted, ]
}

decomposed = wash_cycle(m.vec = m.vec, L.burnin = L.burnin,
S.burnin = S.burnin, r = r, U.hat = U.hat,
V.hat = V.hat, sigma.hat = sigma.hat)
Expand All @@ -200,14 +222,13 @@ dryclean <- R6::R6Class("dryclean",
cov[, median.chr := median(.SD$signal, na.rm = T), by = seqnames]

if (use.blacklist){
cov_template <- cov
cov[is.na(signal), signal := median.chr]
cov[is.infinite(signal), signal := median.chr]
blacklist.pon = readRDS(blacklist_path)
cov$blacklisted = blacklist.pon$blacklisted
cov[blacklisted == TRUE, signal := NA]
cov = na.omit(cov)
cov = cov[!blacklist_cov,]
}


setnames(cov, "signal", "input.read.counts")
cov = cbind(decomposed[[2]], cov)
colnames(cov)[1] = 'foreground.log'
Expand All @@ -224,6 +245,7 @@ dryclean <- R6::R6Class("dryclean",
cov[, log.reads := log(input.read.counts)]
cov[is.infinite(log.reads), log.reads := NA]


if (germline.filter){
germ.file = private$pon$get_inf_germ()
cov$germline.status = germ.file$germline.status
Expand All @@ -233,16 +255,30 @@ dryclean <- R6::R6Class("dryclean",
}

cov = dt2gr(cov)

if(use.blacklist){
cov_template = dt2gr(cov_template)
suppressWarnings({
cov <- gUtils::gr.val(query = cov_template,
target = cov,
val = c("background.log",
"foreground.log",
"input.read.counts",
"median.chr",
"foreground",
"background",
"log.reads"),
na.rm = TRUE)
})

}

if (is.chr){
cov = gr.chr(cov)
}

#dt = data.table(cov.dc = cov, cov.dc.cbs = NULL)

private$history <- rbindlist(list(private$history, data.table(action = paste("Finished drycleaning the coverage file"), date = as.character(Sys.time()))))


if(cbs == TRUE){

message("Starting CBS on the drycleaned sample")
Expand Down Expand Up @@ -276,7 +312,6 @@ dryclean <- R6::R6Class("dryclean",
out = gUtils::gr.fix(out, new.sl, drop = T)
cat(length(out), ' segments produced\n')
names(out) = NULL
#dt[, cov.dc.cbs := out]

gc()

Expand All @@ -287,7 +322,7 @@ dryclean <- R6::R6Class("dryclean",
private$history <- rbindlist(list(private$history, data.table(action = paste("Saved CBS output in current directory as cbs_output.rds"), date = as.character(Sys.time()))))

}

return(cov)
},

Expand Down
15 changes: 3 additions & 12 deletions R/helper_functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -238,7 +238,7 @@ wash_cycle <- function(m.vec, L.burnin, S.burnin , r, N, U.hat, V.hat, sigma.hat
#'
#' @author Aditya Deshpande

prep_cov <- function(m.vec = m.vec, blacklist = FALSE, burnin.samples.path = NA){
prep_cov <- function(m.vec = m.vec, use.blacklist = FALSE, blacklist = NA){
m.vec = gr2dt(m.vec)
m.vec = m.vec[, .(seqnames, signal)]
m.vec[, median.chr := median(.SD$signal, na.rm = T), by = seqnames]
Expand All @@ -248,9 +248,8 @@ prep_cov <- function(m.vec = m.vec, blacklist = FALSE, burnin.samples.path = NA)
m.vec[signal == 0, signal := min.cov]
m.vec[signal < 0, signal := min.cov]

if (blacklist){
blacklist.pon = readRDS(paste0(burnin.samples.path, "/blacklist.rds"))
m.vec$blacklisted = blacklist.pon$blacklisted
if (use.blacklist){
m.vec$blacklisted = blacklist
m.vec[blacklisted == TRUE, signal := NA]
m.vec = na.omit(m.vec)
}
Expand Down Expand Up @@ -283,14 +282,6 @@ collapse_cov <- function(cov.gr, bin.size = target_resolution, this.field = fiel
cov.gr = cov.gr[!is.infinite(signal), .(signal = median(signal, na.rm = TRUE)),
by = .(seqnames, start = floor(start/BINSIZE.ROUGH)*BINSIZE.ROUGH+1)]
cov.gr[, end := (start + BINSIZE.ROUGH) - 1]
s <- cov_og %>% group_by(seqnames) %>% summarize(end = max(end)) %>% data.table() %>% filter(end %% BINSIZE.ROUGH == 0) %>% select(seqnames)
for (chr in s$seqnames){
cov_fixed <- cov.gr %>% filter(seqnames == chr) %>% select(seqnames, start, end, signal)
cov_fixed <- cov_fixed[nrow(cov_fixed)]
cov_fixed <- cov_fixed %>%
mutate(start = end + 1, end = end + BINSIZE.ROUGH)
cov.gr <- rbind(cov.gr,cov_fixed)
}
setnames(cov.gr, "signal", this.field)
cov.gr = dt2gr(cov.gr)
return(cov.gr)
Expand Down
Loading

0 comments on commit ff1e15f

Please sign in to comment.