From 2edf2c334b93666b8f4911c1357e66d5c7759d88 Mon Sep 17 00:00:00 2001 From: sebastian-brylka Date: Fri, 19 Jan 2024 11:35:51 -0500 Subject: [PATCH] more robust approach to mismatched bin number --- R/dryclean.R | 21 +++--- README.html | 118 ++++++++++---------------------- README.md | 7 +- inst/extdata/detergent_test.rds | Bin 3417 -> 3424 bytes tests/testthat/cbs_output.rds | Bin 503 -> 500 bytes tests/testthat/test_dryclean.R | 2 +- 6 files changed, 56 insertions(+), 92 deletions(-) diff --git a/R/dryclean.R b/R/dryclean.R index 24e1a72..7953fe5 100644 --- a/R/dryclean.R +++ b/R/dryclean.R @@ -91,6 +91,13 @@ dryclean <- R6::R6Class("dryclean", tumor.binsize = median(gr2dt(cov)$width) pon.binsize = median(gr2dt(private$pon$get_template())$width) + is.chr = FALSE + + if(any(grepl("chr", as.character(seqnames(cov))))){ + cov = gr.sub(cov) + is.chr = TRUE + } + if (tumor.binsize != pon.binsize & testing == FALSE){ message(paste0("WARNING: Input tumor bin size = ", tumor.binsize,"bp. PON bin size = ", pon.binsize,"bp. Rebinning tumor to bin size of PON...")) private$history <- rbindlist(list(private$history, data.table(action = paste("Rebinning tumor to", pon.binsize, "bp bin size"), date = as.character(Sys.time())))) @@ -119,7 +126,10 @@ X")){ dt_mismatch <- dt_mismatch %>% filter(coverage != pon) private$dt_mismatch = dt_mismatch if(testing == FALSE){ - stop("ERROR: Number of bins of coverage and PON does not match. Use get_mismatch() function to see mismatched chromosomes") + message("WARNING: Number of bins of coverage and PON does not match. Use get_mismatch() function to see mismatched chromosomes\nAligning coverage to the PON") + suppressWarnings({ + cov = gr.val(query = private$pon$get_template(), cov, val = field) + }) } } @@ -135,13 +145,6 @@ X")){ all.chr = c(as.character(1:22), "X") - is.chr = FALSE - - if(any(grepl("chr", as.character(seqnames(cov))))){ - cov = gr.sub(cov) - is.chr = TRUE - } - local.all.chr = all.chr cov = cov %Q% (seqnames %in% local.all.chr) cov = cov[, field] %>% gr2dt() %>% setnames(., field, "signal") @@ -153,7 +156,7 @@ X")){ private$history <- rbindlist(list(private$history, data.table(action = paste("Median-normalization of coverage"), date = as.character(Sys.time())))) mcols(cov)[which(is.na(mcols(cov)[, "signal"])), "signal"] = 0 mcols(cov)[which(is.infinite(mcols(cov)[, "signal"])), "signal"] = NA - values(cov)[, "signal"] = values(cov)[, "signal"] / mean(values(cov)[, "signal"], na.rm = TRUE) + values(cov)[, "signal"] = values(cov)[, "signal"] / median(values(cov)[, "signal"], na.rm = TRUE) } diff --git a/README.html b/README.html index 3adb404..983f200 100644 --- a/README.html +++ b/README.html @@ -395,6 +395,7 @@

1. Creating Panel of Normal aka detergent

create_new_pon = TRUE, normal_vector = normal_vector_example ) +

NOTE: We recommend using raw reads from normal samples in PON generation for the most optimal performance.

The parameters that could be used in PON generation:

@@ -877,86 +878,43 @@

2. Normalizing the coverage aka drycleaning

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Whether to center a coverage +
<tr>
+  <td style="border: 1px solid black; padding: 5px;">cbs</td>
+  <td style="border: 1px solid black; padding: 5px;">FALSE</td>
+  <td style="border: 1px solid black; padding: 5px;">Whether to perform cbs on the drycleaned coverage; If TRUE, saves CBS coverage as cbs_output.rds in output directory</td>
+</tr>
+<tr>
+  <td style="border: 1px solid black; padding: 5px;">cnsignif</td>
+  <td style="border: 1px solid black; padding: 5px;">1e-5</td>
+  <td style="border: 1px solid black; padding: 5px;">The significance levels for the tests in cbs to accept change-points</td>
+</tr>
+<tr>
+  <td style="border: 1px solid black; padding: 5px;">mc.cores</td>
+  <td style="border: 1px solid black; padding: 5px;">1</td>
+  <td style="border: 1px solid black; padding: 5px;">Number of cores to use for parallelization</td>
+</tr>
+<tr>
+  <td style="border: 1px solid black; padding: 5px;">use.blacklist</td>
+  <td style="border: 1px solid black; padding: 5px;">FALSE</td>
+  <td style="border: 1px solid black; padding: 5px;">Whether to exclude off-target markers in case of Exomes or targeted sequencing; If set to TRUE, it will use a defualt mask or needs a path to GRanges marking if each marker is set to be excluded or not as <code>blacklist_path</code> </td>
+</tr>
+<tr>
+  <td style="border: 1px solid black; padding: 5px;">blacklist_path</td>
+  <td style="border: 1px solid black; padding: 5px;">NA</td>
+  <td style="border: 1px solid black; padding: 5px;">If use.blacklist == TRUE, path a GRanges object marking if each marker is set to be excluded or not</td>
+</tr>
+<tr>
+  <td style="border: 1px solid black; padding: 5px;">germline.filter</td>
+  <td style="border: 1px solid black; padding: 5px;">FALSE</td>
+  <td style="border: 1px solid black; padding: 5px;">Whether germline markers need to be removed from decomposition</td>
+</tr>
+<tr>
+  <td style="border: 1px solid black; padding: 5px;">verbose</td>
+  <td style="border: 1px solid black; padding: 5px;">TRUE</td>
+  <td style="border: 1px solid black; padding: 5px;">Outputs progress</td>
+</tr>
-Whether to center a coverage. If you used the Fragcounter to correct the coverage, set to FALSE as it has already been centered. -
-cbs - -FALSE - -Whether to perform cbs on the drycleaned coverage; If TRUE, saves CBS coverage as cbs_output.rds in output directory -
-cnsignif - -1e-5 - -The significance levels for the tests in cbs to accept change-points -
-mc.cores - -1 - -Number of cores to use for parallelization -
-use.blacklist - -FALSE - -Whether to exclude off-target markers in case of Exomes or targeted sequencing; If set to TRUE, it will use a defualt mask or needs a path to GRanges marking if each marker is set to be excluded or not as blacklist_path -
-blacklist_path - -NA - -If use.blacklist == TRUE, path a GRanges object marking if each marker is set to be excluded or not -
-germline.filter - -FALSE - -Whether germline markers need to be removed from decomposition -
-verbose - -TRUE - -Outputs progress -


@@ -966,7 +924,7 @@

2. Normalizing the coverage aka drycleaning

get_mismatch() method to obtain a data table of all chromosomes with mismatched lengths.
  • -The coverage data has to be centered. If the coverage has not been centered, set center=TRUE. WARNING: If you used Fragcounter to correct the coverage, it has already been centered, therefore set center=FALSE +The coverage data has to be centered. If the coverage has not been centered, set center=TRUE. NOTE: If you used Fragcounter to correct the coverage, it has already been centered.
  • Additionally, you can use the get_history() method to review all actions performed on the object with timestamps.

    diff --git a/README.md b/README.md index f62b12c..1e0984b 100644 --- a/README.md +++ b/README.md @@ -77,6 +77,7 @@ Option 2: Create a new PON from normal samples. To create a new PON, the vector with paths to the normal samples is needed. + Following is an example of such a vector @@ -98,6 +99,8 @@ pon_object = pon$new( ) ``` +NOTE: We recommend using raw reads from normal samples in PON generation for the most optimal performance. + The parameters that could be used in PON generation: @@ -346,7 +349,7 @@ The parameters that can be used in clean() function: - + @@ -391,7 +394,7 @@ The parameters that can be used in clean() function: Prerequisites for 'dryclean' to work correctly: Additionally, you can use the get_history() method to review all actions performed on the object with timestamps. diff --git a/inst/extdata/detergent_test.rds b/inst/extdata/detergent_test.rds index 45b6ab78e6a86e6ec805d2998d6bc73be27649ce..eec52af1ad8f7b7c65b69345bd410a0aeebf2253 100644 GIT binary patch literal 3424 zcmV-m4WIHKiwFP!000001Kn78Je2$QA7d9G;ilzw%Z-YHMao)PO4$Ze$yjHm zv?#eLl_F`gb=};`*k;BW>9NlsOWh<}i=}H9hLG}mIwkG}DXGWh%+xaAX2zRKJDzAlDg<+E@1H^#T{85Ie-#4jWY*aH4=&ib zBw$5TzBG`lF}pKR5|+F(;bj1lt{S~6v@jJ(X|I|;EL8_jHRr1uo#r5sm7PzP+mA!) zwZ!t<>ZQo<=Tz#%yMiH;bFa8?K_oo#aJ-`t!NL5e@MMT*%o;G$#nwzHIUV>?Gi@@Um&<@#orM ze4(sCLssC+Ii8+rT&`PlIUN#ci609)sF1Gu_FUfLC`jpHQU+~nF*^n?PE?~trKe9G z(8-+h*|H>D+$J)F z1ve-7wJ8JB{Fce4ErAGK^RC}Bb$&@%hu*1Ou^B&oU9vAa>+|OcA+H_F7xJW-h&!oMzs%1KjlZFTJ!fYGYcU7rjcjw zkQihLI(2Vtbin+(^5m6#ho>by^4R(auw)eGk6mX#$$`VVA}!TWLRC3cN75a!6@Sh2gRrSVA@VvWp=fk(gr6RP$DeD5@r9BNhE^)|M4p}=vWl~-4F_@k ztHSCOMj&?(yW; z#Po+ybdeCMda(^VXAMPh8~?Bk)y3>K0gL*)uibMql4|u)zQ@TPVQ$kh^cw#OO5SY} zuvtb!!bztL9!RDEvr*;2*adSW+Q@pmdfx{~Z=O?S8}bM;%%dunE>pz(+wtU8CBKcU zH;G_tTVN6VuXyGGlrTaye%+=FrF^>rly!%I9aEy6v}QLvl9<0aYAkT3AB4@0qX-Lh zL$SV)c*Vme9)GSK#urM0Mne48X7KcMHd(Q1GZ|=x7SE_lR3UZm`hLL?0bm-W@~1D} zhS}Nh;zT%+|I9rf2}!DVith5s^YZ3O=16R>+1UXs-Ub{_0FTZsK6J2&P7F9_7dKQ$ zpqNmreVutY+!yY6Lq0hfC%3!3fHyjV!im3BAgzak!}$ZqFtl1f#($nMQd7Sq9?78r zrzX_aBqjle{h-G+tr1C(hPLvtYjaPQ^YjCZ1U(p`ZE!+_yI&^_QR7m?_hs zC$&9$&hNKict@7evEIkf<%%;-_WTSZ`SP~T!e*PJ!lDGmLi4HbXXv_M;T-jiL6?n3v{_u_8yXJAnGWXhZOsW6mh zs~DQUAO2QLD)4Z1f$k#vRk(&;BqrFkY)IY#x~U4+8;_(wH_5*v$JZL#j!UU;;OIcB zfr_%G*GlLcb=c`T?DuDYa7h7i6HD5jJY10R6NMJzs%Ez!3g?BtGvI z^yznPzT>6h%<1i>N-eP1@ImWBYpg>ZWb)y{g-U?;Rv($p2 z12W9>mv_P7h+3AFUj=lk$?06(z77V~l~1tQ!q=K-yN`tNoCn1MO7%?0&=$|Dz`>4Wz@3$`otNfv9TlB}RV z{F3RmitwrT#u=(`HRM258(z=&?odaiAC=sr&%SEa} z?3a+wDl6HZS6 zj^PUL+vm>bN-jI&TSj^e!(#1~Ybp%k<GTMt@MW&p(5?sr77OWRM5kR}*vRAYUPX-aO9BG#3s-%NTda$GN~D#Vu6_YS zgDx@=Tk?@Ur1DyioHO#a-%dA_B@M;?DVb{*@$MPY{3e#>o?eS|+zxi$D>HS^+GKA! zT3$_r_vKGJ`|3XdXSeU?s@T`O{SNCr7g$b$(04nh?qE9}Xv=z1UXCn<0eVW(k!y4) z&^{Zu&CCayB7EGme^Y|Kd`)8k!$KHjrq*`0FGXIPc|Op_--lnd8da1;aL`w$w^|^( z3I_V$>d{$jq`%~Gu=|pIQ}JA_wY_N7NSoOFYF�XgZs4s=L=71{%yC>X+5Qz$-hs zX53Nu`!`}M^}acLsCr&0z2zVbZuRY6%Etr_yQeMWF$cQ39KN`6(7at%s)#nMfT1f! zra6`Kka5beTiJNSG`(sy6@}kbB1`g zv0=drxlp2d@e*r58WI)v#C=#$2@J!z_P(+?aJ{Gbz?sf;+PS_@Ts>6Rk8tgK2>?kiaO+u zLGoU|~)6?gka9(CA0dY$By+_Ocf)r}ynu9|$VE8JM#l>4NJ8@o| z@byD-WuJW@;f-3s1$BE~-dxF4B1HCDC}QynAZBD;JKf3({FtuvMsgR4WvJ8#VgUqe?tJHIb zkT0wF&ThUZWN(#qx9@1Z!Ucp%+v@>mtMYLOxYq}v}ZKpSK^y58b2q~nLZaD&`7~~4sl4nhUoa|jU&jj=%^PG=F7Pf{I4i^9wYQZp2aq`VX7nY zd@XEvO}h@cD^9#9-?I|3gA|WO`&U4AOHEjHW7j5Hv^M z$aub}6DfXm)p7j>dw7@EvZq(zGPIlU36d6Az`HM->^|GPN1Af#kCc|&nZ+mmANTk3 zlNF5Z`wO&x8(jIMrE>v}kAK08&l(Xj(aW9SMkLR&Cv=oRqM|!Eeu=NzulUx2L_A6) zp%yr)XvR>bqND@gzH=;Ym zgh(Mc6DWk~e)&u-jg4ofD&jU}DKj~|CiVr<0^+A<#lMF{oSBX%!O4|yh&UsEQFC=G z;sgUSR`Z!Ogvb!+cavxSr<@)ax6ymnSw2}#< zI+5|oXVLNhHG8p{!TvK{lh)jLtO|_k$i&-H4y* z;h$ql3_Ubs^!V`-oTBPNws&&zcA}C96IV~{J7z+Mi0&S4Q^lB|jQ+mI)*Q=scDCXM zU!CnlE%Y3}rHmGJma*>bacpv||MOJ|x}%%_Hd{qVE@X<{*HMX|_J*c8nDqQLL)=s* zQZuvuFWMYk z%B`X0n!^MNcVRVcAxd^RltU9EayCOc zXr-t`N_224TU$AgF(_KE^Nbuj$U3D+>#Si2OTJg$8Je{Fef#G(*Y&%e_qm_nb3ga* ze(&e@UiORNa5!EZ9}bV>q<2&=7#6RGa`gqu`lne)bX8aG3Y!U_T}`gI zsrD4Pb55yVv^x|YvF?@>EsBQ6ZijDeh+<*>(>dc6_c#?W*2d;kC_R1Qm3qcfDCb)e zY@sX&*itE%F)l}VS1sg zaf1}!r*j-T)u2MV_EIJ!B_w|=>hyw47mHvoa_he(xU~=?p1iv?V1eQIJ-X{wA-51mU5r zNq+5$KsCK#xMk}Fgra`i@0nsA6l*KmW|&)I`|P3k(DnOgcS&NrAfPL(v}jc@L1^>y zPs**$L#X_tZ&=w8P#WZ+ac}WnBvP{ZO#XvfpvH6zOO*2>F>M9=sgDXF^SZu!-;fAo z@!R(N+;j-@@5C8H3J#T*_R3)Mqd=FGT`+bn4N4C>XbZiofzpJh)zDc!4_KNJ}EZ6T)i>PHQgI1M1Z#cWJXt2*oXTV{+z0 zD85JtQ@+@a?X!U5_|3msg=u5FEkO5r&g^j4LTKh6WqNIG5bCx~dL9$sL+RTse3o(v zNTl;A-3MYBKy6ZbFm~P)iPg85sM-GkGF#_VTU~h!S*9_S%jM)T|JI!GvVz~{HCu$R zxfRd}{vq!90HtYR8#ZoJgr_{a0~ED~fDu=!k)mP(kLNDXiW$4W?FV5n;>m)1Jy4<} zAX@pbg`=O%!}LOF@W>VawOJfHg+Y?9)*?ZIp4l_6Wy(O?_hUc*2p>>&X}p z&N)%Gq~G!aqaj84R`G2f8BW~nkvej(&-ly$cHU1o93LL-TRdoE6P|2vc8)ISFc-z7 zS@mwMhaxC?1qD&ElNa&$IW=!pY@JRfySrME??d4P)S0D z->fT$4OW9cC2G&;?~;Ii;l|2eYu`b8Pk!$1CAXnx`Fl|pnO|T~`(%3adm0SoTgitN z9DqMnQwrUj9HFP!W;L#{4~YwPDjSkH1U+7|*P4!|Lyxn6=kLB2(0*K8?I)HdwCO4- zs(Y*g=IG%q2dZL$b#rrn&S5{GP#%e|Qe1}&^rVH4>dQiZLg#j0z9o<*IDRxS{|{j5 zbZgymk%bN)1fToPkdp_ID838A;yuuYf0~dMVgnxv9u~gyvxF`!nq2P>Oc=atUfAdQ z8v5OL%sIWa2>J}DN9c7P&>!>T&~1ZoC^XorZ&)?P$;%*LO1DLmH1ypFOPjxU6AT@c zqy}Ev4TB@9+2($g(4{J^d3ncr7*HuSX^mBaY|q3Fr#~-1b%g(EJr6A(Jnxz@oPa4-qCs;uhyKVQ zqivOu)9;NnROzJ0g6ilXf=};>ulx*19qv^7Xa}uv6<*n82y(~s^5zz$JkEOSsD60H zvjAu}ei_9q^Z;f?o5eHnY-sDF@?@-M0@Fn_;U`rQU^#EPn|h4}XI3f@uC32Qs;}5A zBL&JLuXf%y8hxY)%jk-w4?o*J!r2}6U9 zl2Ka=5GGP}wO85>dDCyL9Y&Xc694o^YnO2D8PeK(FTpjl4(YrZYPU~v`ku8*T|d0{ zax%QHDDPr6`~@r%-|^~uuQ}_T7JJUqZTW$D^Y?oD4m>2t^`^fXSq=k~^pvAlDNv|! z=E62(A83j4anU%d0L%h)13tYX7^Kqbx;mC4uZ`UwXyEU|#!XF13PLzw)^A_Kms1S` z{cpBY=nSO4^hv1evi;NNIaz3UB-9}7A`5D?%?hFAOwy^IJ{uTlG<~R3Ru2PzSWCC! zyy4HY#5S+{rtqQqd6mRgdl>xLw`VyI6b>q9kAu5=iXDg@rKKDL5NOTz+ z=0BecrOFqB=mX-AEWaoI!=frk)0=1GE0qh^dRq@R=QKiE(L(9QkHttTNxf#-Vs9vl zd?I$KH51z>0!5G2)P<607_S6MJQ}I2>`)}#D|{&Gm;sXJ(_k5ycZ! zsrwikihwFAUzrpwg+%`qzb9=I5i;7`d2R=sf=m;+=;-c3%>O*j7+hyrb;2-_GcU1r zIl|aY14ekGSIP=GV7ygW5yKb&`jy({YDRxR)<@q>myOSJ`$6cLLZN{xBB9twd#8@l zI7dI5hv@}|N1eI%mQ;?NGVg?)oY6dpSHSP{u6POQUR5ggLkW=PD^C&?ZN+$VIp;+F zI3!&*?gL59s)gs(tiRGjRUkz7naN}46+&M90iFN}J)jI}usrX0BM*ODB0P9l33+TL z^m6NX4|47Go#o@Y%}93W3&BofFT|&dxjC2_gp@S;KC3IQ16p9LN$~JGc#N!C!|x~q zw^Qvm{MNMqUer=9OYD*XN>jV!#*k%D@p_@&Gb<{R7%?7|Ay9&xjR_fkdx8hD+9S?} zt`tPhoL7<1^mqwHO8#2eZ`0vX*>!w6{uPqgx#WDvJ$v>D5#3^y>!fNChPst=h|xy4 zze?9ZtH}Ur(`gNShS^YFbJgJ9ucJ^+Ui_55;wI#gj`>CI&*Juja1j(gYkD`7tc|?P zS=119D}lpzGl;!)@Z&q=LizKD%JIDHxH;qfg@cyr-fOYE?fk}F!{esE9|WK^HX`f! zk}hbLuIm&`l7YAR@AmZZT|zqDPHwK8>ke-}ZLuD=d=Jlcj~dFERL|lQ|JD5+&s~|e z<3OQC+~A=TK^_S>9^OS)&TkfL#tVQYMJngeIn* zg(v=N^&;Gd{j*$D?2%auiJwF!c#^TuRHyahCjVU^IO!Y39Bf?-g#;F+6bcy>s6*Uh z6iZQjhvIt_D^UD^LIK4p6l+i@qnP~ST94vK6dO>WAM&_OD7K)`L;)zYQEWq@gF+X@ zE)@DGCciQ#znD!>n4;K=Vjl_%6bDgQp|C+QSwyznB9A%RksW4;S^|JoL-)_7zNS3=ICngHeHQ)ZVz9#DG_uN zSL^ZP1UN0#kz`}*=xOWaOqjfSBHy4193i^8xlAWxQZoAcmReIR-r3cP>V9^%3uSCS z{H0_xsk5|oPq$-JZT;(4#qWwP{#|bsa&{zgiwFP!000001D#aKPQx$|b>b#X*(FY#xq&DGaX_4?R6s@00|f*pWKt6% zY_}vX2ysE;!}t-tg)m84oH!vYma0x>^Sn1RzL-J?VN^pV!WM|gIy>E8+eQXnLa1hf z!T@6f{+*EC3&LqwVzVCz8-WGk*+iTC@*a#nPzD}oA)_@nH#I-YWhB(y>>J|BK>*T+ z=3Zx1c=9$5`0nO;$|$Eo9#cvBM3QV?qkDFAqz5=$c`l;l#P(VQ^=7NWCyeUl42gS3 zJgU{-=xv?T7(tjqD8R5o84%G|%$iY#OPuO(G){}hQO4>-Sepnds@7@-t*A(u4Z3=i z%1-oa^bP&8pWy;nuXK>CP?A7abklnD`|>v9;OY+D*706X0r@gakLTd@X{h%Hx4M`A zprGD4NpY0ZpvAp@x;&q*+NOrMitt1JP=s-1S2AIf&JqUK_{ zA-Q6l9^peqn;Rg{#roY2nw;eO?S1|A`EYrsaB*5uZq3Y2dy-a+c$1UhPCbcUlT#uT q)_I+mf6kWO-VX@>>xNa_{_`{+R8I43(qG@r&-5n+b#X`Ae+00*Fc=7A!$3MU`k56cDVCNll2b z-IBNTaHZt%E zLNyZ<1{fRg?}YS55KhAqn|({z7%T|SC)(tf_h9svGVnl48LhdwsrgwhBcbkQUlLai z1CTy6_d27(lUH%TcQ-FmMmZI-OC=c)NwRs3-tobK9^i20xr~w%+iMZjo2?2TGOCv| zBJMTus8)ZYzjZ=m1YruH0K*PtK*T^XYepF^ajL`7BrTpq8EaEveJZS~TB{kfq9SEB z=;~1_JJGMvH}uPXh6`Z5(m}F9Ndj5XP3z7d$g7Nlt2^{|yL){FcId2 diff --git a/tests/testthat/test_dryclean.R b/tests/testthat/test_dryclean.R index 91df302..b9454eb 100644 --- a/tests/testthat/test_dryclean.R +++ b/tests/testthat/test_dryclean.R @@ -123,7 +123,7 @@ test_that("clean", { expect_true(identical(colnames(values(a)), c("background.log", "foreground.log","input.read.counts", "median.chr", "foreground", "background", "log.reads"))) - expect_equal(a$background.log[1], 0.0508, tolerance = 0.001) + expect_equal(a$background.log[1], 0.0306, tolerance = 0.001) })
    center TRUEWhether to center a coverage. If you used the Fragcounter to correct the coverage, set to FALSE as it has already been centered.Whether to center a coverage
    cbs