diff --git a/R/dryclean.R b/R/dryclean.R index f481288..24e1a72 100644 --- a/R/dryclean.R +++ b/R/dryclean.R @@ -51,7 +51,7 @@ dryclean <- R6::R6Class("dryclean", #' #' @param cov character path to the granges coverage file to be drycleaned #' - #' @param centered boolean (default == FALSE) whether a coverage has already been centered by by dividing each bin by global mean signal of the sample + #' @param center boolean (default == TRUE) whether to center the coverage before drycleaning #' #' @param cbs boolean (default == FALSE) whether to perform cbs on the drycleaned coverage #' @@ -73,7 +73,7 @@ dryclean <- R6::R6Class("dryclean", #' #' @return Drycleaned coverage in GRanges format - clean = function(cov, centered = FALSE, cbs = FALSE, cnsignif = 1e-5, mc.cores = 1, verbose = TRUE, use.blacklist = FALSE, blacklist_path = NA, germline.filter = FALSE, field = "reads.corrected", testing = FALSE){ + clean = function(cov, center = TRUE, cbs = FALSE, cnsignif = 1e-5, mc.cores = 1, verbose = TRUE, use.blacklist = FALSE, blacklist_path = NA, germline.filter = FALSE, field = "reads.corrected", testing = FALSE){ message("Loading coverage") private$history <- rbindlist(list(private$history, data.table(action = paste("Loaded coverage from", cov), date = as.character(Sys.time())))) @@ -145,18 +145,17 @@ X")){ local.all.chr = all.chr cov = cov %Q% (seqnames %in% local.all.chr) cov = cov[, field] %>% gr2dt() %>% setnames(., field, "signal") + cov = cov %>% dt2gr() - - if(centered == FALSE){ - message("Centering the sample") + if(center == TRUE){ + + message("Median-centering the sample") private$history <- rbindlist(list(private$history, data.table(action = paste("Median-normalization of coverage"), date = as.character(Sys.time())))) - cov = cov %>% dt2gr() mcols(cov)[which(is.na(mcols(cov)[, "signal"])), "signal"] = 0 mcols(cov)[which(is.infinite(mcols(cov)[, "signal"])), "signal"] = NA values(cov)[, "signal"] = values(cov)[, "signal"] / mean(values(cov)[, "signal"], na.rm = TRUE) - }else{ - cov = cov %>% dt2gr() - } + } + cov = sortSeqlevels(cov) cov = sort(cov) @@ -263,7 +262,7 @@ X")){ suppressWarnings({ cov <- gUtils::gr.val(query = cov_template, target = cov, - val = c("background.log", + val =c("background.log", "foreground.log", "input.read.counts", "median.chr", @@ -526,12 +525,14 @@ pon <- R6::R6Class("pon", samp.final <- samp.final %>% mutate(file.available = file.exists(normal_cov)) + message("Checking for existence of files") samp.final = samp.final[file.available == TRUE] message(paste0(nrow(samp.final), " files present")) + mat.n = pbmcapply::pbmclapply(samp.final[, sample], function(nm, all.chr){ this.cov = tryCatch(readRDS(samp.final[sample == nm, normal_cov]), error = function(e) NULL) if (!is.null(this.cov)){ @@ -575,8 +576,8 @@ pon <- R6::R6Class("pon", mat.bind.t = matrix(unlist(mat.n), ncol = length(mat.n)) - print(nrow(mat.bind.t)) - print(ncol(mat.bind.t)) + #print(nrow(mat.bind.t)) + #print(ncol(mat.bind.t)) rm(mat.n) gc() diff --git a/README.html b/README.html index 5f839a0..3adb404 100644 --- a/README.html +++ b/README.html @@ -347,7 +347,11 @@

dryclean tutorial

Build Status codecov.io

+
+

Dockerized installation

+

To make our life easier, we have created a Docker container with the latest stable release of Dryclean and its dependencies. This can be found here. The latest updated version is 0.0.2, so make sure to select the correct tag.

+

Robust PCA based method to de-noise genomic coverage data.

@@ -363,7 +367,7 @@

Installations

Install mskilab R dependencies (gUtils)

devtools::install_github('mskilab/gUtils')

Install dryclean

-
devtools::install_github('mskilab/dryclean')
+
devtools::install_github('mskilab-org/dryclean')

(after installing R package) Add dryclean directory to PATH and test the executable

$ export PATH=${PATH}:$(Rscript -e 'cat(paste0(installed.packages()["dryclean", "LibPath"], "/dryclean/extdata/"))')
 $ drcln -h ## to see the help message
@@ -817,7 +821,7 @@

2. Normalizing the coverage aka drycleaning

    -
  1. log.reads: log of the mean-normalized count +
  2. log.reads: log of the median-normalized count @@ -867,13 +871,13 @@

    2. Normalizing the coverage aka drycleaning

    -centered +center -FALSE +TRUE -Whether a coverage has already been mean-normalized +Whether to center a coverage. If you used the Fragcounter to correct the coverage, set to FALSE as it has already been centered. @@ -884,7 +888,7 @@

    2. Normalizing the coverage aka drycleaning

    -Whether to perform cbs on the drycleaned coverage +Whether to perform cbs on the drycleaned coverage; If TRUE, saves CBS coverage as cbs_output.rds in output directory @@ -917,7 +921,7 @@

    2. Normalizing the coverage aka drycleaning

    -Whether to exclude off-target markers in case of Exomes or targeted sequencing; If set to TRUE, needs a GRange marking if each marker is set to be excluded or not +Whether to exclude off-target markers in case of Exomes or targeted sequencing; If set to TRUE, it will use a defualt mask or needs a path to GRanges marking if each marker is set to be excluded or not as blacklist_path @@ -944,17 +948,6 @@

    2. Normalizing the coverage aka drycleaning

    -germline.file - - -NA - - -Path to file with germline markers - - - - verbose @@ -964,32 +957,19 @@

    2. Normalizing the coverage aka drycleaning

    - - -is.human - - -TRUE - - -Organism type - - - - -all.chr - - -c(as.character(1:22), “X”) - - -List of chromosomes - -


    -

    For ‘dryclean’ to work correctly, the lengths of each sequence on each chromosome in the coverage and PON (Panel of Normal) data must match. If you attempt to normalize the coverage with PON data of different sequence lengths, you will encounter an error. In the event of such an error, you can utilize the get_mismatch() method to obtain a data table of all chromosomes with mismatched lengths. Additionally, you can use the get_history() method to review all actions performed on the object with timestamps.

    +Prerequisites for ‘dryclean’ to work correctly: + +

    Additionally, you can use the get_history() method to review all actions performed on the object with timestamps.

    3. Running dryclean on tumor sample from command line

    @@ -1081,8 +1061,8 @@

    3. Running dryclean on tumor sample from c -i INPUT, --input=INPUT path to the coverage file in GRanges format saved as .rds - -t CENTERED, --centered=CENTERED - are the samples centered + -t CENTER, --center=CENTER + whether to center the coverage before drycleaning -s CBS, --cbs=CBS whether to perform cbs on the drycleaned coverage @@ -1093,21 +1073,15 @@

    3. Running dryclean on tumor sample from c -c CORES, --cores=CORES number of cores to use - -w WHOLEGENOME, --wholeGenome=WHOLEGENOME - whether whole genome is being used - -b BLACKLIST, --blacklist=BLACKLIST whether there are blacklisted makers -l BLACKLIST_PATH, --blacklist_path=BLACKLIST_PATH - if --blacklist == TRUE, path to a GRanges object marking if each marker is set to be excluded or not + if --blacklist == TRUE, path to a GRanges object marking if each marker is set to be excluded or it willuse a default mask -g GERMLINE.FILTER, --germline.filter=GERMLINE.FILTER if PON based germline filter is to be used for removing some common germline events, if set to TRUE, give path to germline annotated file - -f GERMLINE.FILE, --germline.file=GERMLINE.FILE - path to file annotated with germline calls, if germline.filter == TRUE - -m HUMAN, --human=HUMAN whther the samples under consideration are human diff --git a/README.md b/README.md index c449603..f62b12c 100644 --- a/README.md +++ b/README.md @@ -320,7 +320,7 @@ The output has following metadata fields: 4. median.chr: median chromosome signal 5. foreground: Foreground signal, that forms SCNAs (S vector) in read count/ratio space 6. background: This is the L low ranked vector after decomposition and represent the background noise separated by dryclean in read count/ratio space -7. log.reads: log of the mean-normalized count +7. log.reads: log of the median-normalized count @@ -344,9 +344,9 @@ The parameters that can be used in clean() function: Field name in GRanges metadata of coverage to use for drycleaning - centered - FALSE - Whether a coverage has already been mean-normalized + center + TRUE + Whether to center a coverage. If you used the Fragcounter to correct the coverage, set to FALSE as it has already been centered. cbs @@ -388,7 +388,13 @@ The parameters that can be used in clean() function:
    -For 'dryclean' to work correctly, the number of bins on each chromosome in the coverage and PON (Panel of Normal) data must match. If you attempt to normalize the coverage with PON data of different number of bins, you will encounter an error. In the event of such an error, you can utilize the get_mismatch() method to obtain a data table of all chromosomes with mismatched lengths. Additionally, you can use the get_history() method to review all actions performed on the object with timestamps. +Prerequisites for 'dryclean' to work correctly: +
      +
    • The number of bins on each chromosome in the coverage and PON (Panel of Normal) data must match. If you attempt to normalize the coverage with PON data of different number of bins, you will encounter an error. In the event of such an error, you can utilize the get_mismatch() method to obtain a data table of all chromosomes with mismatched lengths.
    • +
    • The coverage data has to be centered. If the coverage has not been centered, set center=TRUE. WARNING: If you used Fragcounter to correct the coverage, it has already been centered, therefore set center=FALSE
    • +
    + +Additionally, you can use the get_history() method to review all actions performed on the object with timestamps. @@ -505,8 +511,8 @@ Options: -i INPUT, --input=INPUT path to the coverage file in GRanges format saved as .rds - -t CENTERED, --centered=CENTERED - are the samples centered + -t CENTER, --center=CENTER + whether to center the coverage before drycleaning -s CBS, --cbs=CBS whether to perform cbs on the drycleaned coverage diff --git a/drcln b/drcln index 3ee39b4..90513db 100755 --- a/drcln +++ b/drcln @@ -26,7 +26,7 @@ if (!exists('opt')) make_option(c("--mode"), type = "character", default = "coverage", help = "Mode of operation: 'pon' or 'coverage'. Set to 'pon' for PON generation and 'coverage' for normalizing a sample using existing PON"), make_option(c("-p", "--pon"), type = "character", default = NULL, help = "path to the existing Panel Of Normal (PON) saved as .rds"), make_option(c("-i", "--input"), type = "character", help = "path to the coverage file in GRanges format saved as .rds"), - make_option(c("-t", "--centered"), type = "logical", default = FALSE, help = "are the samples centered"), + make_option(c("-C", "--center"), type = "logical", default = TRUE, help = "whether to center the coverage before drycleaning"), make_option(c("-s", "--cbs"), type = "logical", default = FALSE, help = "whether to perform cbs on the drycleaned coverage"), make_option(c("-n", "--cnsignif"), type = "integer", default = 1e-5, help = "the significance levels for the test to accept change-points in cbs"), make_option(c("-c", "--cores"), type = "integer", default = 10, help = "number of cores to use"), @@ -104,7 +104,7 @@ suppressWarnings(suppressPackageStartupMessages(library(dryclean))) a <- dryclean_object$clean( cov = opt$input, - centered = opt$centered, + center = opt$center, cbs = opt$cbs, cnsignif = opt$cnsignif, mc.cores = opt$cores, diff --git a/inst/extdata/detergent_test.rds b/inst/extdata/detergent_test.rds index 68a871e..45b6ab7 100644 Binary files a/inst/extdata/detergent_test.rds and b/inst/extdata/detergent_test.rds differ diff --git a/inst/extdata/drcln b/inst/extdata/drcln index 3ee39b4..90513db 100755 --- a/inst/extdata/drcln +++ b/inst/extdata/drcln @@ -26,7 +26,7 @@ if (!exists('opt')) make_option(c("--mode"), type = "character", default = "coverage", help = "Mode of operation: 'pon' or 'coverage'. Set to 'pon' for PON generation and 'coverage' for normalizing a sample using existing PON"), make_option(c("-p", "--pon"), type = "character", default = NULL, help = "path to the existing Panel Of Normal (PON) saved as .rds"), make_option(c("-i", "--input"), type = "character", help = "path to the coverage file in GRanges format saved as .rds"), - make_option(c("-t", "--centered"), type = "logical", default = FALSE, help = "are the samples centered"), + make_option(c("-C", "--center"), type = "logical", default = TRUE, help = "whether to center the coverage before drycleaning"), make_option(c("-s", "--cbs"), type = "logical", default = FALSE, help = "whether to perform cbs on the drycleaned coverage"), make_option(c("-n", "--cnsignif"), type = "integer", default = 1e-5, help = "the significance levels for the test to accept change-points in cbs"), make_option(c("-c", "--cores"), type = "integer", default = 10, help = "number of cores to use"), @@ -104,7 +104,7 @@ suppressWarnings(suppressPackageStartupMessages(library(dryclean))) a <- dryclean_object$clean( cov = opt$input, - centered = opt$centered, + center = opt$center, cbs = opt$cbs, cnsignif = opt$cnsignif, mc.cores = opt$cores, diff --git a/man/dryclean.Rd b/man/dryclean.Rd index 110ada3..8a55355 100644 --- a/man/dryclean.Rd +++ b/man/dryclean.Rd @@ -47,7 +47,7 @@ Function begins the online rPCA process. Use this function if you performed batc \subsection{Usage}{ \if{html}{\out{
    }}\preformatted{dryclean$clean( cov, - centered = FALSE, + center = TRUE, cbs = FALSE, cnsignif = 1e-05, mc.cores = 1, @@ -65,7 +65,7 @@ Function begins the online rPCA process. Use this function if you performed batc \describe{ \item{\code{cov}}{character path to the granges coverage file to be drycleaned} -\item{\code{centered}}{boolean (default == FALSE) whether a coverage has already been centered by by dividing each bin by global mean signal of the sample} +\item{\code{center}}{boolean (default == TRUE) whether to center the coverage before drycleaning} \item{\code{cbs}}{boolean (default == FALSE) whether to perform cbs on the drycleaned coverage} diff --git a/tests/testthat/cbs_output.rds b/tests/testthat/cbs_output.rds new file mode 100644 index 0000000..577daeb Binary files /dev/null and b/tests/testthat/cbs_output.rds differ diff --git a/tests/testthat/test_dryclean.R b/tests/testthat/test_dryclean.R index d1ad13f..91df302 100644 --- a/tests/testthat/test_dryclean.R +++ b/tests/testthat/test_dryclean.R @@ -170,18 +170,6 @@ test_that("cbs", { }) - -test_that("centering", { - pon_object = pon$new(pon_path = detergent.path) - - dryclean_object <- dryclean$new(pon = pon_object) - - a <- dryclean_object$clean(cov = sample.1.path, testing = TRUE, centered = FALSE) - - expect_equal(a$background.log[1], 0.0508, tolerance = 0.001) - -}) - test_that("clustering", { expect_error( pon_object = pon$new(