Skip to content

Commit

Permalink
Merge pull request #30 from mskilab-org/sb_dev
Browse files Browse the repository at this point in the history
more robust approach to mismatched bin number
  • Loading branch information
sebastian-brylka authored Jan 19, 2024
2 parents 9c4eb5f + 2edf2c3 commit 3fefd0a
Show file tree
Hide file tree
Showing 6 changed files with 56 additions and 92 deletions.
21 changes: 12 additions & 9 deletions R/dryclean.R
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,13 @@ dryclean <- R6::R6Class("dryclean",
tumor.binsize = median(gr2dt(cov)$width)
pon.binsize = median(gr2dt(private$pon$get_template())$width)

is.chr = FALSE

if(any(grepl("chr", as.character(seqnames(cov))))){
cov = gr.sub(cov)
is.chr = TRUE
}

if (tumor.binsize != pon.binsize & testing == FALSE){
message(paste0("WARNING: Input tumor bin size = ", tumor.binsize,"bp. PON bin size = ", pon.binsize,"bp. Rebinning tumor to bin size of PON..."))
private$history <- rbindlist(list(private$history, data.table(action = paste("Rebinning tumor to", pon.binsize, "bp bin size"), date = as.character(Sys.time()))))
Expand Down Expand Up @@ -119,7 +126,10 @@ X")){
dt_mismatch <- dt_mismatch %>% filter(coverage != pon)
private$dt_mismatch = dt_mismatch
if(testing == FALSE){
stop("ERROR: Number of bins of coverage and PON does not match. Use get_mismatch() function to see mismatched chromosomes")
message("WARNING: Number of bins of coverage and PON does not match. Use get_mismatch() function to see mismatched chromosomes\nAligning coverage to the PON")
suppressWarnings({
cov = gr.val(query = private$pon$get_template(), cov, val = field)
})
}
}

Expand All @@ -135,13 +145,6 @@ X")){

all.chr = c(as.character(1:22), "X")

is.chr = FALSE

if(any(grepl("chr", as.character(seqnames(cov))))){
cov = gr.sub(cov)
is.chr = TRUE
}

local.all.chr = all.chr
cov = cov %Q% (seqnames %in% local.all.chr)
cov = cov[, field] %>% gr2dt() %>% setnames(., field, "signal")
Expand All @@ -153,7 +156,7 @@ X")){
private$history <- rbindlist(list(private$history, data.table(action = paste("Median-normalization of coverage"), date = as.character(Sys.time()))))
mcols(cov)[which(is.na(mcols(cov)[, "signal"])), "signal"] = 0
mcols(cov)[which(is.infinite(mcols(cov)[, "signal"])), "signal"] = NA
values(cov)[, "signal"] = values(cov)[, "signal"] / mean(values(cov)[, "signal"], na.rm = TRUE)
values(cov)[, "signal"] = values(cov)[, "signal"] / median(values(cov)[, "signal"], na.rm = TRUE)
}


Expand Down
118 changes: 38 additions & 80 deletions README.html
Original file line number Diff line number Diff line change
Expand Up @@ -395,6 +395,7 @@ <h3><font color="black"> 1. Creating Panel of Normal aka detergent </font></h3>
create_new_pon = TRUE,
normal_vector = normal_vector_example
)</code></pre>
<p><b>NOTE:</b> We recommend using raw reads from normal samples in PON generation for the most optimal performance.</p>
<p>The parameters that could be used in PON generation:</p>
<table style="border: 1px solid black; border-collapse: collapse;">
<tbody>
Expand Down Expand Up @@ -877,86 +878,43 @@ <h3><font color="black"> 2. Normalizing the coverage aka drycleaning </font></h3
TRUE
</td>
<td style="border: 1px solid black; padding: 5px;">
Whether to center a coverage. If you used the Fragcounter to correct the coverage, set to <code>FALSE</code> as it has already been centered.
</td>
</tr>
<tr>
<td style="border: 1px solid black; padding: 5px;">
cbs
</td>
<td style="border: 1px solid black; padding: 5px;">
FALSE
</td>
<td style="border: 1px solid black; padding: 5px;">
Whether to perform cbs on the drycleaned coverage; If TRUE, saves CBS coverage as cbs_output.rds in output directory
</td>
</tr>
<tr>
<td style="border: 1px solid black; padding: 5px;">
cnsignif
</td>
<td style="border: 1px solid black; padding: 5px;">
1e-5
</td>
<td style="border: 1px solid black; padding: 5px;">
The significance levels for the tests in cbs to accept change-points
</td>
</tr>
<tr>
<td style="border: 1px solid black; padding: 5px;">
mc.cores
</td>
<td style="border: 1px solid black; padding: 5px;">
1
</td>
<td style="border: 1px solid black; padding: 5px;">
Number of cores to use for parallelization
</td>
</tr>
<tr>
<td style="border: 1px solid black; padding: 5px;">
use.blacklist
</td>
<td style="border: 1px solid black; padding: 5px;">
FALSE
</td>
<td style="border: 1px solid black; padding: 5px;">
Whether to exclude off-target markers in case of Exomes or targeted sequencing; If set to TRUE, it will use a defualt mask or needs a path to GRanges marking if each marker is set to be excluded or not as <code>blacklist_path</code>
</td>
</tr>
<tr>
<td style="border: 1px solid black; padding: 5px;">
blacklist_path
</td>
<td style="border: 1px solid black; padding: 5px;">
NA
</td>
<td style="border: 1px solid black; padding: 5px;">
If use.blacklist == TRUE, path a GRanges object marking if each marker is set to be excluded or not
</td>
</tr>
<tr>
<td style="border: 1px solid black; padding: 5px;">
germline.filter
</td>
<td style="border: 1px solid black; padding: 5px;">
FALSE
</td>
<td style="border: 1px solid black; padding: 5px;">
Whether germline markers need to be removed from decomposition
</td>
</tr>
<tr>
<td style="border: 1px solid black; padding: 5px;">
verbose
</td>
<td style="border: 1px solid black; padding: 5px;">
TRUE
</td>
<td style="border: 1px solid black; padding: 5px;">
Outputs progress
</td>
Whether to center a coverage
</tr>
<pre><code>&lt;tr&gt;
&lt;td style=&quot;border: 1px solid black; padding: 5px;&quot;&gt;cbs&lt;/td&gt;
&lt;td style=&quot;border: 1px solid black; padding: 5px;&quot;&gt;FALSE&lt;/td&gt;
&lt;td style=&quot;border: 1px solid black; padding: 5px;&quot;&gt;Whether to perform cbs on the drycleaned coverage; If TRUE, saves CBS coverage as cbs_output.rds in output directory&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td style=&quot;border: 1px solid black; padding: 5px;&quot;&gt;cnsignif&lt;/td&gt;
&lt;td style=&quot;border: 1px solid black; padding: 5px;&quot;&gt;1e-5&lt;/td&gt;
&lt;td style=&quot;border: 1px solid black; padding: 5px;&quot;&gt;The significance levels for the tests in cbs to accept change-points&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td style=&quot;border: 1px solid black; padding: 5px;&quot;&gt;mc.cores&lt;/td&gt;
&lt;td style=&quot;border: 1px solid black; padding: 5px;&quot;&gt;1&lt;/td&gt;
&lt;td style=&quot;border: 1px solid black; padding: 5px;&quot;&gt;Number of cores to use for parallelization&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td style=&quot;border: 1px solid black; padding: 5px;&quot;&gt;use.blacklist&lt;/td&gt;
&lt;td style=&quot;border: 1px solid black; padding: 5px;&quot;&gt;FALSE&lt;/td&gt;
&lt;td style=&quot;border: 1px solid black; padding: 5px;&quot;&gt;Whether to exclude off-target markers in case of Exomes or targeted sequencing; If set to TRUE, it will use a defualt mask or needs a path to GRanges marking if each marker is set to be excluded or not as &lt;code&gt;blacklist_path&lt;/code&gt; &lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td style=&quot;border: 1px solid black; padding: 5px;&quot;&gt;blacklist_path&lt;/td&gt;
&lt;td style=&quot;border: 1px solid black; padding: 5px;&quot;&gt;NA&lt;/td&gt;
&lt;td style=&quot;border: 1px solid black; padding: 5px;&quot;&gt;If use.blacklist == TRUE, path a GRanges object marking if each marker is set to be excluded or not&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td style=&quot;border: 1px solid black; padding: 5px;&quot;&gt;germline.filter&lt;/td&gt;
&lt;td style=&quot;border: 1px solid black; padding: 5px;&quot;&gt;FALSE&lt;/td&gt;
&lt;td style=&quot;border: 1px solid black; padding: 5px;&quot;&gt;Whether germline markers need to be removed from decomposition&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td style=&quot;border: 1px solid black; padding: 5px;&quot;&gt;verbose&lt;/td&gt;
&lt;td style=&quot;border: 1px solid black; padding: 5px;&quot;&gt;TRUE&lt;/td&gt;
&lt;td style=&quot;border: 1px solid black; padding: 5px;&quot;&gt;Outputs progress&lt;/td&gt;
&lt;/tr&gt;</code></pre>
</tbody>
</table>
<p><br></p>
Expand All @@ -966,7 +924,7 @@ <h3><font color="black"> 2. Normalizing the coverage aka drycleaning </font></h3
The number of bins on each chromosome in the coverage and PON (Panel of Normal) data must match. If you attempt to normalize the coverage with PON data of different number of bins, you will encounter an error. In the event of such an error, you can utilize the <code>get_mismatch()</code> method to obtain a data table of all chromosomes with mismatched lengths.
</li>
<li>
The coverage data has to be centered. If the coverage has not been centered, set <code>center=TRUE</code>. WARNING: If you used Fragcounter to correct the coverage, it has already been centered, therefore set <code>center=FALSE</code>
The coverage data has to be centered. If the coverage has not been centered, set <code>center=TRUE</code>. NOTE: If you used Fragcounter to correct the coverage, it has already been centered.
</li>
</ul>
<p>Additionally, you can use the <code>get_history()</code> method to review all actions performed on the object with timestamps.</p>
Expand Down
7 changes: 5 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ Option 2: Create a new PON from normal samples.

To create a new PON, the vector with paths to the normal samples is needed.


Following is an example of such a vector


Expand All @@ -98,6 +99,8 @@ pon_object = pon$new(
)
```

<b>NOTE:</b> We recommend using raw reads from normal samples in PON generation for the most optimal performance.

The parameters that could be used in PON generation:

<table style="border: 1px solid black; border-collapse: collapse;">
Expand Down Expand Up @@ -346,7 +349,7 @@ The parameters that can be used in clean() function:
<tr>
<td style="border: 1px solid black; padding: 5px;">center</td>
<td style="border: 1px solid black; padding: 5px;">TRUE</td>
<td style="border: 1px solid black; padding: 5px;">Whether to center a coverage. If you used the Fragcounter to correct the coverage, set to <code>FALSE</code> as it has already been centered.</td>
<td style="border: 1px solid black; padding: 5px;">Whether to center a coverage
</tr>
<tr>
<td style="border: 1px solid black; padding: 5px;">cbs</td>
Expand Down Expand Up @@ -391,7 +394,7 @@ The parameters that can be used in clean() function:
Prerequisites for 'dryclean' to work correctly:
<ul>
<li>The number of bins on each chromosome in the coverage and PON (Panel of Normal) data must match. If you attempt to normalize the coverage with PON data of different number of bins, you will encounter an error. In the event of such an error, you can utilize the <code>get_mismatch()</code> method to obtain a data table of all chromosomes with mismatched lengths.</li>
<li>The coverage data has to be centered. If the coverage has not been centered, set <code>center=TRUE</code>. WARNING: If you used Fragcounter to correct the coverage, it has already been centered, therefore set <code>center=FALSE</code></li>
<li>The coverage data has to be centered. If the coverage has not been centered, set <code>center=TRUE</code>. NOTE: If you used Fragcounter to correct the coverage, it has already been centered (set <code>center=FALSE</code>).</li>
</ul>

Additionally, you can use the <code>get_history()</code> method to review all actions performed on the object with timestamps.
Expand Down
Binary file modified inst/extdata/detergent_test.rds
Binary file not shown.
Binary file modified tests/testthat/cbs_output.rds
Binary file not shown.
2 changes: 1 addition & 1 deletion tests/testthat/test_dryclean.R
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ test_that("clean", {

expect_true(identical(colnames(values(a)), c("background.log", "foreground.log","input.read.counts", "median.chr", "foreground", "background", "log.reads")))

expect_equal(a$background.log[1], 0.0508, tolerance = 0.001)
expect_equal(a$background.log[1], 0.0306, tolerance = 0.001)
})


Expand Down

0 comments on commit 3fefd0a

Please sign in to comment.