From 3f05bf5e24971c8c4e536fdebcb02c90aac3ba35 Mon Sep 17 00:00:00 2001 From: Adam English Date: Wed, 31 Jul 2024 13:00:35 -0400 Subject: [PATCH 01/16] readme version --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a2a1ae41..bdbc7227 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ [![pylint](imgs/pylint.svg)](https://github.com/acenglish/truvari/actions/workflows/pylint.yml) [![FuncTests](https://github.com/acenglish/truvari/actions/workflows/func_tests.yml/badge.svg?branch=develop&event=push)](https://github.com/acenglish/truvari/actions/workflows/func_tests.yml) [![coverage](imgs/coverage.svg)](https://github.com/acenglish/truvari/actions/workflows/func_tests.yml) -[![develop](https://img.shields.io/github/commits-since/acenglish/truvari/v4.2.2)](https://github.com/ACEnglish/truvari/compare/v4.2.2...develop) +[![develop](https://img.shields.io/github/commits-since/acenglish/truvari/v4.3.0)](https://github.com/ACEnglish/truvari/compare/v4.3.0...develop) [![Downloads](https://static.pepy.tech/badge/truvari)](https://pepy.tech/project/truvari) ![Logo](https://raw.githubusercontent.com/ACEnglish/truvari/develop/imgs/BoxScale1_DarkBG.png) From 3150ee654ab69f9ca3e5942db6e26edb4eb4cc10 Mon Sep 17 00:00:00 2001 From: Adam English Date: Thu, 1 Aug 2024 19:49:31 -0400 Subject: [PATCH 02/16] correctly filtering star alleles --- .../bench_starallele/candidate.refine.bed | 1 + .../bench/bench_starallele/fn.vcf.gz | Bin 0 -> 2192 bytes .../bench/bench_starallele/fn.vcf.gz.tbi | Bin 0 -> 128 bytes .../bench/bench_starallele/fp.vcf.gz | Bin 0 -> 1814 bytes .../bench/bench_starallele/fp.vcf.gz.tbi | Bin 0 -> 129 bytes .../answer_key/bench/bench_starallele/log.txt | 50 ++++++++++++++++++ .../bench/bench_starallele/params.json | 1 + .../bench/bench_starallele/summary.json | 17 ++++++ .../bench/bench_starallele/tp-base.vcf.gz | Bin 0 -> 1608 bytes .../bench/bench_starallele/tp-base.vcf.gz.tbi | Bin 0 -> 72 bytes .../bench/bench_starallele/tp-comp.vcf.gz | Bin 0 -> 1606 bytes .../bench/bench_starallele/tp-comp.vcf.gz.tbi | Bin 0 -> 72 bytes repo_utils/sub_tests/bench.sh | 7 +++ .../test_files/variants/star.base.vcf.gz | Bin 0 -> 1812 bytes .../test_files/variants/star.base.vcf.gz.tbi | Bin 0 -> 129 bytes .../test_files/variants/star.comp.vcf.gz | Bin 0 -> 1543 bytes .../test_files/variants/star.comp.vcf.gz.tbi | Bin 0 -> 128 bytes truvari/matching.py | 2 +- 18 files changed, 77 insertions(+), 1 deletion(-) create mode 100644 repo_utils/answer_key/bench/bench_starallele/candidate.refine.bed create mode 100644 repo_utils/answer_key/bench/bench_starallele/fn.vcf.gz create mode 100644 repo_utils/answer_key/bench/bench_starallele/fn.vcf.gz.tbi create mode 100644 repo_utils/answer_key/bench/bench_starallele/fp.vcf.gz create mode 100644 repo_utils/answer_key/bench/bench_starallele/fp.vcf.gz.tbi create mode 100644 repo_utils/answer_key/bench/bench_starallele/log.txt create mode 100644 repo_utils/answer_key/bench/bench_starallele/params.json create mode 100644 repo_utils/answer_key/bench/bench_starallele/summary.json create mode 100644 repo_utils/answer_key/bench/bench_starallele/tp-base.vcf.gz create mode 100644 repo_utils/answer_key/bench/bench_starallele/tp-base.vcf.gz.tbi create mode 100644 repo_utils/answer_key/bench/bench_starallele/tp-comp.vcf.gz create mode 100644 repo_utils/answer_key/bench/bench_starallele/tp-comp.vcf.gz.tbi create mode 100644 repo_utils/test_files/variants/star.base.vcf.gz 
create mode 100644 repo_utils/test_files/variants/star.base.vcf.gz.tbi create mode 100644 repo_utils/test_files/variants/star.comp.vcf.gz create mode 100644 repo_utils/test_files/variants/star.comp.vcf.gz.tbi diff --git a/repo_utils/answer_key/bench/bench_starallele/candidate.refine.bed b/repo_utils/answer_key/bench/bench_starallele/candidate.refine.bed new file mode 100644 index 00000000..015059aa --- /dev/null +++ b/repo_utils/answer_key/bench/bench_starallele/candidate.refine.bed @@ -0,0 +1 @@ +chr1 6004926 6005445 \ No newline at end of file diff --git a/repo_utils/answer_key/bench/bench_starallele/fn.vcf.gz b/repo_utils/answer_key/bench/bench_starallele/fn.vcf.gz new file mode 100644 index 0000000000000000000000000000000000000000..5f9c77cd978dcd0f977fccc4ba9f48230705ff7f GIT binary patch literal 2192 zcmV;B2ygcviwFb&00000{{{d;LjnME2+desbKpj99Qi$47uY{E3j z`Mj_R7a2K=Su)QTF4b+vFm+1X7+PZ}aWsQEZH%_X5XYvDVHgUGUSlvNI)K_3qr%w4 z&}^#mV^$ao+t_d{1;(l{hNJ0#(b^ch!srIkY*RDa7^lK$w&v&-u@pJPHAu4@OPNGM zYLJ0#(^P_J!xOa(4J)=r>Rc0DGj)yH3Q(PFY++rqhze9^8xzMN#8xd=rQ5+8BtchH zja2EjHELM8P866b-8R-N2wLT*O1EK~#4?GZBDBi3slyL36&0aXvUStKj$pv&h6ch7M_CzE*B5N-dofkjuNtjprATv43{EUOutRC1?KYin-G~xt zSH~<#b9ne^&)6GnjVGn?Rg$wg%UYezVjiN1t@4FHAQLEGFqE+w3X%u~aSktlFoAed zJW8{OWhmw%->5=ENxoe8WY_-}1|pKaz0T8h)+%OLwE7<@OL3>TXH_+^YS?fTUJjbQvtdbk_$)f@fxDDvkZ>$2}*ypC9O&Vg^) zl0`hoS;M$8?LL5QY4ivlWNy3QruQ7qdh@pd8iAebFV^TbeMZPZ*z6O%uVF+n)ei3` zLhO4G=5j~>$WJIxcP4X^uCov}0d2uD$Vs)I-YkGco{ukbb$fYN)YhL|Zu=6KeR&xN zdo6vD4)@LZq(kfQEQml2=M_O#+MPJ?`}*{^-%Pf(byUXipY z_kw0wD*bOhwtSZ>pP)pOGQ?$?^I6MF`Wzn*@SZ`MlN_ZpBx4QJXNW06Pp=U6B?uBI zU68BxFY+MU4^gRiQwc%Q;J;Q(Rww7JSU{ie)@K$@hJMM1sFD z>AI9?rB5g>XX1FOOMrA%T7b;+c>=M%@O)7$0OBTCBGz&Q%%PPPSr5~Uof5t$eJ^EY z3ll|#^9)FNJnD-Qh?Xo6YlS1hm&ct<`I5&$#`D)L(FFT*U9>`;qB%=g2HhN$L`Jh< z{k;OT1y?PkZ>XF82wIwFGubsgv5I;AxP)-{MqhuV=-yR)yRR92qb;J-2mkB+#R-yB z0(WULeYn23JxMDhaMyNn^LR4pK@M*M@eMt3CGEl0N0ha21y{Im0Z*094-Ugv%!2(6 zIOK41s)R9I^3K7O$FI9iuZNT44RxKqfLbh9G5ao?syw66rl}iyuNvo>vzNxCVmBrg zr$_dY+ZL1WQ8$xA5b2v3>o!me{=$HbQLlF{f|nT=3nhCS6l~MYTh9?9h&X*s?;AwY zO+OsO9V$8;rVP33ZA&#is=Zr%L*II@Wh+}xPvsO~VF$d-X)&#f9Wg5ivRlFXFrTtN zYt1|16o$#EQpAS;Czii3CTCwRkPIULL6^4#IjDqjDj3Yu??eCx*M~!phuwc?`3Vc} z;3STN)#)=x#zMZzQV=VYmbPeBk0>HPM)cikzkm>s!tr~FSrrx3}Xq@s4DqB@$@>2Ls1pt?1>)>+bwR_8SH3<+tsKwX!J0A zVd24s{9w5~HeA@mhO)2&d~oFjy6Ibs79G^NNHn&T)rq7$Rd5zTZO95MGPf_L~~87hM4dV?UeK@j{G>Ek9r z^s+*oqhhl8{Ej${TR6p0_fPBYHN;ZBypqZds(LM|RvI{bm#Tmp{-34%5KewyM8AOz zPo%kF+6|g(#BT7MHdt<8y2o;|{N@?me&FSoBAhROcfzq}65~#WnR{ zA|op9X@;X7BB_3Wd&Fpw&mw!|vyUFX`kuN<^8E){9qg|~AOHX#iwFb&00000{{{d; SLjnLB00RI30000000017I4g4i literal 0 HcmV?d00001 diff --git a/repo_utils/answer_key/bench/bench_starallele/fn.vcf.gz.tbi b/repo_utils/answer_key/bench/bench_starallele/fn.vcf.gz.tbi new file mode 100644 index 0000000000000000000000000000000000000000..9b6c2bc815fa5a0f178daffb410a3fcdb8208a8b GIT binary patch literal 128 zcmb2|=3rp}f&Xj_PR>jW$qd|upHfm%5)u-ak|cPUP6f;o?U-!b#dA<7Zn6Sbh|7Xa zJgPDVt|t<98yqnxDgD{Qu`#e}iriB7m&jD6`(Xx#U;5jvK5{cK$fMaI&A<${5<~z1 D^SC7s literal 0 HcmV?d00001 diff --git a/repo_utils/answer_key/bench/bench_starallele/fp.vcf.gz b/repo_utils/answer_key/bench/bench_starallele/fp.vcf.gz new file mode 100644 index 0000000000000000000000000000000000000000..8e13e350313f891ebec8fd869d3a4baeab192bdb GIT binary patch literal 1814 zcmV+x2kH19iwFb&00000{{{d;LjnN#2F+MmZ`(K!es+Ha(LC(K#*Rqc)@lK-uLQ{9 z+G@}|1&T!3L}kepY3IQH_#H}?d?~SuT`X!K^`URR84hPY4tsVMv6x0_Hur^l-yN*9 z7Ws5`Hn_U>`eXNxtDZaROeW_&%7ct81WOb5b0>}wSP7bOwD38n;pacWCP)*(rnyZp z%kX(jlc|`wM6+z&&nS1nS< 
zV^kPZu@v1lYcOVo(QQ=&jCzQ%DvYLM)iPA$5Mx&u)lzNE#AZznl?_NWZL>Cs7;iwj zVi`s)h-@(tQ&*LmrQuDkv8EcDN~{{tCfSOqXsU_pK$~nU*tRjY>fBZ7wiOkUpw;Xe zuF`F(L^m}J*I=r2TZ(Ez&~-djx^>IIrh#jAgjCr!GYRKUZ}d&Zq!X*@2BuM$D0G&|^Y8nXa}bRlLOflMGVqbQ>h@{xuV zKT5NZW+-M{tW{x4NiO>y-sQifAR_78>oi?f3=zw?Kr??4Zw0d`xgGgx=NY+b%4en!oC(@$Pe^KJcw_4B6t1e1GI(&=$vHeDfPBgJTAt*t7EC`A2uJt3km zejwzE{uNKaQ8SV`Ntamwi-5ME8RVo|Pb>1l$?CY1tJ}d{zPH|ZQ1+!#^yMJ-_gcE2 zi0l>3nBd4q0>Ytl2x38f0dG@)`2I&f73enRmggpR30;Tgs-TJxk zv;7cj^==}8XpwRzze`6UH;9i3JZ{VHccSbXC`L30}wY(6TXxqpnz7c$$F4x^px;Bl6x+9HaDrsu*je& zk4N1vfM`yAzN{hQY<`?%!saaYGbUckLgVzuGH(TuqA5*i2HhMMg^VJ9`BDK61-D*E z-%vCC6|_`jk?filU&Ks2&LJGO*4JMtx_1@dYO8u%KNQi)z4tKcpCCyEaJNh*_t*W~ zle9ttcWuWvk0%#B$l;C8zoREsTlPTpGs>D%!4)o;!PDC22f`rck-y&o9}%ukl^}*o z-UXPl_;r``x;sAJP}k`*sKtB{(-+xP#TkXxO}(}Ds&Q6~K5tDb*4CtAx9~o)Y%%#6 zH8VK`;kKbDngyi=?=WC1sMWgQ{&U3mthRc$O7O7UI?E|S7-74uYHc0Ma??Hz;tu5< zE*D+)Q~9};S5w=%Rq}2GD_c%ZttP-i?5xOUKBw~?8Ouv!ckf1ln9v^^wL9Vbg_)_+ z!Y%$MDxN8oQ!nF4hT(&h%lm;GQGz(-6ej6+B7h_7r$dm(+<)VKOoKZ(gJXYj`t*^} zfGx5VWD2E)Em~{_6L~a>XV`mCG5#BDJ6woZB9(7XX5^&NgKLrx$vkC;OKj=n$>%ip z@*9U<3X3sKRv9FKLw~Iw<)hE8YnVj-2gmSeZ^f`}y^{vf-e&lO%S<`d-(@rGXh*7b&+Vg3)iiZ`zV zZ}MrwYp9B}84sNG6~qMv*PZIl;128Thim&SKlQj0*69vCr?^76*eR!dSDpYi$$x-S zlwWntGM#F!c1_!<24Yul<&SdtbTQFYXES%g8DywVF)O&e{m4l1lfs3#*HsEZ zE9dan7<%nauc7>n+wlFjWDGc0&pHfm%5)u-ak|cPUP6f;o?U-!b#dA<7Zn6T`6PE>> zc=q3rSRgg=U|C{^#I9X6ZY?)7Y`yv~E&hT`ZCRYiz>p$#yyhb}1A{!8CDIJcU^_ts E0Mv6R00000 literal 0 HcmV?d00001 diff --git a/repo_utils/answer_key/bench/bench_starallele/log.txt b/repo_utils/answer_key/bench/bench_starallele/log.txt new file mode 100644 index 00000000..516d6b93 --- /dev/null +++ b/repo_utils/answer_key/bench/bench_starallele/log.txt @@ -0,0 +1,50 @@ +2024-08-01 19:48:18,699 [INFO] Truvari v4.3.0 +2024-08-01 19:48:18,699 [INFO] Command /Users/english/code/truvari/truvari/__main__.py bench -b repo_utils/test_files/variants/star.base.vcf.gz -c repo_utils/test_files/variants/star.comp.vcf.gz -s 0 -o test_results/bench_starallele/ +2024-08-01 19:48:18,700 [INFO] Params: +{ + "base": "/Users/english/code/truvari/repo_utils/test_files/variants/star.base.vcf.gz", + "comp": "/Users/english/code/truvari/repo_utils/test_files/variants/star.comp.vcf.gz", + "output": "test_results/bench_starallele/", + "includebed": null, + "extend": 0, + "debug": false, + "reference": null, + "refdist": 500, + "pctseq": 0.7, + "minhaplen": 50, + "pctsize": 0.7, + "pctovl": 0.0, + "typeignore": false, + "chunksize": 1000, + "bSample": "HG002", + "cSample": "HG002", + "dup_to_ins": false, + "sizemin": 0, + "sizefilt": 0, + "sizemax": 50000, + "passonly": false, + "no_ref": false, + "pick": "single", + "check_monref": true, + "check_multi": true +} +2024-08-01 19:48:18,705 [INFO] Zipped 7 variants Counter({'base': 6, 'comp': 1}) +2024-08-01 19:48:18,705 [INFO] 1 chunks of 7 variants Counter({'base': 5, '__filtered': 1, 'comp': 1}) +2024-08-01 19:48:18,721 [INFO] Stats: { + "TP-base": 0, + "TP-comp": 0, + "FP": 1, + "FN": 5, + "precision": 0.0, + "recall": 0.0, + "f1": null, + "base cnt": 5, + "comp cnt": 1, + "TP-comp_TP-gt": 0, + "TP-comp_FP-gt": 0, + "TP-base_TP-gt": 0, + "TP-base_FP-gt": 0, + "gt_concordance": 0, + "gt_matrix": {} +} +2024-08-01 19:48:18,721 [INFO] Finished bench diff --git a/repo_utils/answer_key/bench/bench_starallele/params.json b/repo_utils/answer_key/bench/bench_starallele/params.json new file mode 100644 index 00000000..5b9a1694 --- /dev/null +++ b/repo_utils/answer_key/bench/bench_starallele/params.json @@ -0,0 +1 @@ +{"base": 
"/Users/english/code/truvari/repo_utils/test_files/variants/star.base.vcf.gz", "comp": "/Users/english/code/truvari/repo_utils/test_files/variants/star.comp.vcf.gz", "output": "test_results/bench_starallele/", "includebed": null, "extend": 0, "debug": false, "reference": null, "refdist": 500, "pctseq": 0.7, "minhaplen": 50, "pctsize": 0.7, "pctovl": 0.0, "typeignore": false, "chunksize": 1000, "bSample": "HG002", "cSample": "HG002", "dup_to_ins": false, "sizemin": 0, "sizefilt": 0, "sizemax": 50000, "passonly": false, "no_ref": false, "pick": "single", "check_monref": true, "check_multi": true} \ No newline at end of file diff --git a/repo_utils/answer_key/bench/bench_starallele/summary.json b/repo_utils/answer_key/bench/bench_starallele/summary.json new file mode 100644 index 00000000..aab0d861 --- /dev/null +++ b/repo_utils/answer_key/bench/bench_starallele/summary.json @@ -0,0 +1,17 @@ +{ + "TP-base": 0, + "TP-comp": 0, + "FP": 1, + "FN": 5, + "precision": 0.0, + "recall": 0.0, + "f1": null, + "base cnt": 5, + "comp cnt": 1, + "TP-comp_TP-gt": 0, + "TP-comp_FP-gt": 0, + "TP-base_TP-gt": 0, + "TP-base_FP-gt": 0, + "gt_concordance": 0, + "gt_matrix": {} +} \ No newline at end of file diff --git a/repo_utils/answer_key/bench/bench_starallele/tp-base.vcf.gz b/repo_utils/answer_key/bench/bench_starallele/tp-base.vcf.gz new file mode 100644 index 0000000000000000000000000000000000000000..dd02c744f5fd29a66e328d8c42e129dd3461b033 GIT binary patch literal 1608 zcmV-O2DkYiiwFb&00000{{{d;LjnLR2F+O8ZsJG~eP+I*HF?;F4aDuc*<~e`OM;XO z157k|ij3XHt+Cx6cN;>!eyiP+!!T+v`WB<8SO?HC#;7nhHZ)tU z<1s6YrPzw$STz`{!WfRG14cW>*cC=Mux6W@d5m!?jAm<&Zegn?hsqYDS&mhkM2xo} zL$OV>7NqVlsg|KBHA}$nC}rQ23C3xclWsnTuOCbmpmvm-*AgdBlPAQ2LjkpQ`gkK9PWFF*)K zG$~%O%qJO&C>L+4u%je5gBjlEzabzZ>~+0hdBqUWj0+UHX|xmg9*KIIgcHc6|31rg zcYF}a*Tj=c@y`qc2-J@WPKD4PjlbC^O8{r=3w(#@LW#fx&E_*WF_EsKdy zT|st?E7Rs9*p^1mz#wzm1e(^;3tEfUKTsd+I?an z|MvBWyYzg95>3hwVD#KM-*=>W$B(wqpy0wiPg*b-v$(R>YI4?&PX>0D8_elFbX zFvMEDn@S)`8K?5QbmUXM@l&q!h@2$;In)Etf}bN!Nk-XO+~Wd?IDI9Ox|HZzPS{*Z z_~}%a07+I_fXvfH0LVf}sUTNCw^PmxYW1H-D)B$AVigq_3#S zeg!QRSs=Tn#nXt2=NQ7FZ~FQxMfa}a+ilHg8^ErF-{w%GK zztgYUi5PEUbW*IW4Ajz9(iyf$Z<#*b`IoW2<>*oWd|URf@R7 zKPO^Eh@5>XM=}f-1YO<|Lzlh1Dh8l^*D#O#502s0!HQwq`e%)!{p|<}SDMXiQ}~tJDt(pt-FI+3 zyOW!I&YJ!PuB+ZQ=Ed7xnWJuRX9C{mFW5)bSk`)9MQ1s5`%%n0`9aj@XcbZ~L@RJr zRwp?~*quC;Pu>Z71s&Yb-3Wa7c`>WVYhZDxZw>~@+`E}P+%?7z)5fUZm<)yum~a|j zzIJaLjW+6>%H;NEg3BowaT+WIA`7f~X(nQcjVdI4i zUq&QZh_IvTwquyOsvcw1Ek?zTW~k0FM%!SpW2=r~7&RDui&0dp1LzoIR2Ul@nyuFH zm=(rSY{hV_8jMw83`f%eqa9=H3ZolXvrWxB#yAy5vo%MzuvL>oWed_Q$Er;t##@k~ z*rr(vQg@hC%g~gXqv36?v96iArrI^2ZL$?h(KQR#fwtLJu;XBC*SV|G?I;=~L9eMA zuF`F5s$uCmuEA94wiV5SpzC<5bQ`vbEfd#NM6I%I>hK4fH5E~-Wb3A-IF4OYeYMIp zlmpwkS(Bh|+b%K{U4aQX#ynLtma1AhE+E5)$z69=K(1$(_jx=gtb;FS>x^`!g3)BL zKdaXyNdqVMZ3;|_apwP{toxt}<)YBxKKra3FS+2X| zgHXOEo;-?wW*9)AeoSyGg#Kv!ZJ(h}0ZM%&`$MuMY0?~bCpCso?;1WG8qU2yq-n&T z6VUiakZfszTUh@+6g;ByCVaLJalJH-p}-ya88zdko2*guA&0{AJQDoMXKo;#=OppM z*k#{2fq%Q6^g_*kzPVNuwK-E~U$CMqaZ73XG<#LQKXb8kS(ChYQSKA}ibB7!nE2Ec zWXHHNZ9al+Y4i*XGPg~jX)V2=wRrsl^}(()3v+azE)jB&Vz#l~))b^Fs`l}Dd_-Pc zPskJfD;|%dW*~Et=FEdbz*rClIjPRm3S4lqJndw3KYS=^JDUtQbE%YbIgH$cksc(1 ztx-q?M=lZ&4v8U%jJN`REC5o}eV5XwL2#fI!zgvd!LZ-X&}@RdG+QHSQJw__ES3Ck zzaDXyp3hLCNf~0C3K|@-q|4}ZfMWt_P6SE=BxCi`CB)>T`5M9=f*^s?xuWj;T)6Bo z#9F{yg zB@o5L<#`Per}1f$DUE65GAh=aM&sm1UW|fB(Sjs|!8H42Bcs60Un;<{;MNQ2D{7{{ zf|d#v$f;@ZETZB$hH&VczWz$ly{q_kTQl0mv4~C|XHVn78In{2_uFLpcssa1ODiOB z-*$5Me0I}=9NxM7JDStlwg;-8QPzSAo^W0W-_{O45PA_0+`|d@i16)J@gjKSU4bc$ 
z*88Nly~*i;x=mN07I7Al7dcer9fjV8dgtub;4CRw?QAOc&ZgqD@F8+DV)8R;GPwld zwy7w(4Xp(~VZl~Vt98ZQRX}-I+r2w2c-(H?e1Q-~*lBBe+rYBj^pAshK*fa1LvQA3 z^SPE+*V=}?;oS-r%ctjd6JQ~BT4uA@)A^o^6|J#fyRj#xEvaHfqncsZ}*RwlW zUdbGFdpo=DK7YYJsur@=`}#S{LF|rV-pLQ5K1ZvNdLdeYtFk-E zLBj6jeSGpY&@1TRZSF?k%g>7~MLqzF>wI%1Nao(njW+6>%q;?O2A<#28zQfqm09{3gcSV6g8QzHQqIjJ3t+2DSjy#Mm{)!?x)eDjuiC zxVop?zN^5vHOBT$3ovFAz?B% zLB!Ptkp&)*9M@Jr5RrX}QhdhxK#UI^>ri|Kknx)Z@@KHuNfYmyfN zK0G@J?)5jp<0|+z7i3OYW7A1WLllvPNH_wUKq4WCkr)Mego0GS10aMWnwK9HEFui0 zl#35n*m07Z!4&WF-w-g7;(a|YmNi3685bxC7U@pndnCqj5>7zN=+`XQz0pA`-||qF z5}lX^5E!2m97~~pJNmxQ(5C>U5t8#ES@I&+M!m5j@cEa4&xe68VJwOw<+B_-{s}Bw zM&LHK{z?Q->5B%hy+hnpk!8@}u6&KOObhZg(jJ%4c)m;pzl>NAi`fgw!z2sX4^H5J z*W;_i^kz5Lx^8GwWBL{A`UrK`3NJ|BEN& zNQQtoW-s+>FNbMxu+oEEur*4k;3z-> z(jgfnu^@qfr#V2%zVA{-G>#9vVwe_zI9T?(DVmN^SS;2^MpU(+m=%)$<86bxI{5@A z8doW1g`jbRrC6oM6C4qcITxsik<2wLR*+MKUe=KI5DW>NE)c5vxd_-{ib}tmNFZ7i zoXYp|posF#IaQ?%a*{`iq5~lDY?-D|3y51+fyBZ%~ zsJr47+#*|~oj9_!zddoRN7-7F-S{)n9akD;8;t`b_4z~gt(LXjQ;0@hk*E#M7*;PMez;~d^B0&t25 z>KC~2P2XR$`O}l8J&EHiStO4uU0|yPI}*3x)-W&y3KOUAFPh#pFMuupo$F4)<$q)}24c~<<6+p2I@BCms}zq9molyLu73Hr&ll?Vzd$lfkh#vNVH zHgGM6)o%%gS{KF(3eVI$0S!-VnycybgvXgbJ|lMFa6aFC2{Q-!Nrq9)Pl81?L-CrO zB<0`i#G7Yvv^k4a%)_n*8wiOtOnL-0uHv_c=68%ul&b#K(Mb zsiw-DRxjWDGc0&pHfm%5)u-ak|cPUP6f;o?U-!b#dA<7ZgK+Gj!4Bz z_BR|(zg0@fT!j%VI?ahIh+P-uuYSz#xxii8KQ<*iH}u E02c!#@Bjb+ literal 0 HcmV?d00001 diff --git a/repo_utils/test_files/variants/star.comp.vcf.gz b/repo_utils/test_files/variants/star.comp.vcf.gz new file mode 100644 index 0000000000000000000000000000000000000000..529d2174d4e88ecd068992bd45bc83952dadc6f6 GIT binary patch literal 1543 zcmV+i2Kf0OiwFb&00000{{{d;LjnK^1;Wshn27`Ui@NL^xV5}`hH?ReuCdRHY9=1)-Q1Li5 z#??LD_FVH4lxM2xo}TlXAC z2{Lw=4A(YwMQM1eHMUI0G7V1w+KR2ax@Edp1=^~uW8cTvQyo{!?dv9pU@5wWYq>qs zuwBc-3QR4xr<*P$t>UTWwmk>C4pwx;sMU5X_`!~%BStN@<+!@{*UN zpzqr$a&$|F4QXPYYZ})uTnm?w;p6zeH!UI8(~E~?_Ci<}Urg5v(wzuK^ZEX+UX#2K z@aEY;aIe1!9#_G)xgc}G8k-5VXG@+}W# zDba~(0D#s!cl)h;2>>c8+iY$W$cjYtEGA+p0NPApD$;&$jpnq&WwTLd>jUO{ERsqwGa*P+@I z#8yEwV368sK+|4@aeMy$H|j%NcbY8GL$N~0N4nF&R>#zlp&RDs^+brg1)-1?{V$%7 zBQ2Ji6iXID5wI47fhKi5?Kpr#R>u#yeHcELy`7GSo4wSly&R^&!AcKu!PY3Df};Ql zNQY#Q#DW9@{>%YV_I;N!qH%oS6~nX$#KE%PP0@6W!eX&TGNP&l#jKG0A1@o+)yXF~ z(YQ)6D+G-jEX684p5Tap%(*~CjAX81v4Wf;^s!zbCmf+1WBgp|D^ul*h9kR6vxGfG-t9 zoMy*KCN!gIz^GVn291;dmgOpl0?kQI7;JM?4Kj*@A05A;HnV-HlnBsdGH!jo+zhxX9Sz$)?3aI!U+2v)9Tn*j+^y)5|60da9Q+bpEl1*Ud!y* z-Usj1&G_-YHF})1Zu_nAVAui&YJL6IyK7Zfz?KxhRo&!UWf5O~0wWJ&Uc?Uo03VA8 z1ONa4009360763o0N?;^PQhx!Fbq9=-_jb}iIXNl2a~40gfUuC=&>X&4U9QBp)lyj z*Gf0&p#tgYNqUmHYmh7xLaQugoFQ5=gBBO4(HgsiXiq>Bfw~M?B1p5}Rnhz5a)Gtv znn-Y?JI`R>>FuBX<<9%L|1ncrq^17$4U&>+K8G3N*J+`ag8S)->U5J69bFiF4|zeP z^W)$q(0pl4oevJA(#cad2S5ITi>j^nAoxb*Dk+%7(rlTy t%Tlc5#{=pzxAHmx001A02m}BC000301^_}s0stET0{{R300000001F&jW$qd|upHfm%5)u-ak|cPUP6f;o?U-!b#dA<7ZgK+GjtIp| z>kpbJGi%+@;}LcaoqoPU$IPq$(&8@~wn&u8+YAl{hOHZJGkxS{V30?%Lz;mZY$b>Q E02n4El>h($ literal 0 HcmV?d00001 diff --git a/truvari/matching.py b/truvari/matching.py index 6d55f188..60b85958 100644 --- a/truvari/matching.py +++ b/truvari/matching.py @@ -155,7 +155,7 @@ def filter_call(self, entry, base=False): Returns True if the call should be filtered Base has different filtering requirements, so let the method know """ - if self.params.check_monref and entry.alts in (None, '*'): # ignore monomorphic reference + if self.params.check_monref and entry.alts and entry.alts[0] in (None, '*'): # ignore monomorphic reference return True if self.params.check_multi and len(entry.alts) > 1: From 
b5b78b1821039e723ae83e5b9fe88cde9eb5b61d Mon Sep 17 00:00:00 2001 From: Adam English Date: Thu, 1 Aug 2024 20:22:30 -0400 Subject: [PATCH 03/16] func test update refine inputs had star alleles --- .../refine_output_one/refine.regions.txt | 34 +++++++++---------- .../refine.variant_summary.json | 14 ++++---- .../refine.region_summary.json | 16 ++++----- .../refine_output_three/refine.regions.txt | 24 ++++--------- .../refine.variant_summary.json | 8 ++--- .../refine_output_two/refine.regions.txt | 20 +++++------ 6 files changed, 53 insertions(+), 63 deletions(-) diff --git a/repo_utils/answer_key/refine/refine_output_one/refine.regions.txt b/repo_utils/answer_key/refine/refine_output_one/refine.regions.txt index 41bfa0b8..f617c30b 100644 --- a/repo_utils/answer_key/refine/refine_output_one/refine.regions.txt +++ b/repo_utils/answer_key/refine/refine_output_one/refine.regions.txt @@ -6,7 +6,7 @@ chr20 709729 709975 2 2 0 0 False 2 2 0 0 TP chr20 1149807 1150220 2 2 0 0 False 2 2 0 0 TP chr20 1257933 1260528 2 2 0 0 False 2 2 0 0 TP chr20 1301599 1302667 2 2 0 0 False 2 2 0 0 TP -chr20 2240925 2241726 1 1 5 1 True 5 5 0 0 TP +chr20 2240925 2241726 1 1 4 1 True 5 5 0 0 TP chr20 2442966 2443943 2 2 0 0 False 2 2 0 0 TP chr20 2842574 2842997 2 2 0 0 False 2 2 0 0 TP chr20 4032219 4033601 2 2 2 1 True 4 4 0 0 TP @@ -19,7 +19,7 @@ chr20 5554104 5554185 0 0 0 0 False 0 0 0 0 TN chr20 5962327 5962410 0 0 0 0 False 0 0 0 0 TN chr20 7435215 7435749 2 2 0 0 False 2 2 0 0 TP chr20 7719330 7719414 1 1 0 0 False 1 1 0 0 TP -chr20 7720911 7721003 1 1 1 0 True 1 1 0 0 TP +chr20 7720911 7721003 1 1 0 0 False 1 1 0 0 TP chr20 7721059 7721319 0 0 0 0 False 0 0 0 0 TN chr20 8661911 8662657 1 1 3 1 True 9 9 0 0 TP chr20 9015548 9016039 1 1 0 0 False 1 1 0 0 TP @@ -33,7 +33,7 @@ chr20 16226954 16228487 3 3 0 0 False 3 3 0 0 TP chr20 16257741 16259345 2 2 2 0 True 4 4 1 0 FN chr20 16395149 16395421 3 3 1 0 True 3 3 0 0 TP chr20 16894360 16894435 1 1 0 0 False 1 1 0 0 TP -chr20 17081216 17081410 2 2 1 0 True 2 2 0 0 TP +chr20 17081216 17081410 2 2 0 0 False 2 2 0 0 TP chr20 17197123 17197241 2 2 0 0 False 2 2 0 0 TP chr20 17303131 17303609 1 1 0 0 False 1 1 0 0 TP chr20 18209097 18210727 3 3 2 1 True 5 5 0 0 TP @@ -43,9 +43,9 @@ chr20 19663366 19663526 2 2 0 0 False 2 2 0 0 TP chr20 20295955 20296565 2 2 2 1 True 3 3 0 0 TP chr20 20320285 20320579 1 1 2 1 True 1 1 0 0 TP chr20 20337200 20337707 1 1 6 2 True 3 3 0 0 TP -chr20 20354570 20358323 7 7 9 1 True 16 16 0 0 TP +chr20 20354570 20358323 7 7 8 1 True 16 16 0 0 TP chr20 20458640 20458937 2 2 0 0 False 2 2 0 0 TP -chr20 21120212 21120539 2 2 1 0 True 1 1 0 0 TP +chr20 21120212 21120539 2 2 0 0 False 2 2 0 0 TP chr20 21721174 21721735 2 2 1 0 True 3 3 0 0 TP chr20 22081961 22084122 4 4 3 1 True 9 9 0 0 TP chr20 22471831 22472129 2 2 0 0 False 2 2 0 0 TP @@ -54,7 +54,7 @@ chr20 23155504 23155975 3 3 2 2 True 5 5 0 0 TP chr20 23267965 23268087 2 2 0 0 False 2 2 0 0 TP chr20 23560883 23561191 1 1 2 2 True 3 3 0 0 TP chr20 23942431 23942955 1 1 0 0 False 1 1 0 0 TP -chr20 24407983 24408997 3 3 1 0 True 2 2 0 0 TP +chr20 24407983 24408997 3 3 0 0 False 3 3 0 0 TP chr20 24681988 24682277 2 2 1 0 True 2 2 0 0 TP chr20 24836081 24836384 2 2 0 0 False 2 2 0 0 TP chr20 24956961 24957198 2 2 0 0 False 2 2 0 0 TP @@ -71,10 +71,10 @@ chr20 32723469 32724637 0 0 0 0 False 0 0 0 0 TN chr20 32724927 32724999 0 0 0 0 False 0 0 0 0 TN chr20 34205619 34205960 1 1 0 0 False 1 1 0 0 TP chr20 34235750 34236213 0 0 2 1 True 2 2 0 0 TP -chr20 35539120 35539724 4 4 1 0 True 4 4 0 0 TP 
+chr20 35539120 35539724 4 4 0 0 False 4 4 0 0 TP chr20 35580013 35580886 1 1 2 1 True 7 7 0 0 TP chr20 36642142 36642222 1 1 0 0 False 1 1 0 0 TP -chr20 37361752 37362058 2 2 1 0 True 2 2 0 0 TP +chr20 37361752 37362058 2 2 0 0 False 2 2 0 0 TP chr20 38123590 38124705 2 2 1 4 True 7 7 0 0 TP chr20 38194533 38194666 2 2 0 0 False 2 2 0 0 TP chr20 38313618 38315069 3 3 0 0 False 3 3 0 0 TP @@ -113,13 +113,13 @@ chr20 53913028 53913352 2 2 0 0 False 2 2 0 0 TP chr20 54690771 54690873 2 2 0 0 False 2 2 0 0 TP chr20 55532159 55533378 1 1 0 0 False 1 1 0 0 TP chr20 55624482 55625807 6 6 6 0 True 11 11 0 0 TP -chr20 55627344 55628450 7 7 4 0 True 8 8 0 0 TP +chr20 55627344 55628450 7 7 3 0 True 8 8 0 0 TP chr20 55943919 55945807 2 2 2 1 True 5 5 0 0 TP chr20 56280422 56282114 4 4 5 1 True 10 10 0 0 TP chr20 56284258 56284407 2 2 0 0 False 2 2 0 0 TP chr20 56710555 56711193 2 2 0 0 False 2 2 0 0 TP chr20 57090672 57091411 1 1 2 0 True 4 4 0 0 TP -chr20 57110212 57110785 2 2 1 0 True 2 2 0 0 TP +chr20 57110212 57110785 2 2 0 0 False 2 2 0 0 TP chr20 57189946 57190540 0 0 3 1 True 1 1 0 0 TP chr20 57350808 57350987 1 1 1 0 True 3 3 0 0 TP chr20 57417271 57418234 2 2 0 0 False 2 2 0 0 TP @@ -143,7 +143,7 @@ chr20 61176725 61176974 2 2 0 0 False 2 2 0 0 TP chr20 61201683 61202474 2 2 4 1 True 7 7 0 0 TP chr20 61229587 61229717 2 2 0 0 False 2 2 0 0 TP chr20 61282506 61283669 4 4 2 0 True 6 6 0 0 TP -chr20 61289625 61290555 1 1 2 1 True 2 2 0 0 TP +chr20 61289625 61290555 1 1 1 1 True 2 2 0 0 TP chr20 61329258 61329577 0 0 0 2 False 0 0 0 2 FP chr20 61331528 61331753 2 2 0 0 False 2 2 0 0 TP chr20 61337205 61337590 2 2 0 0 False 2 2 0 0 TP @@ -152,8 +152,8 @@ chr20 61475406 61475726 2 2 0 0 False 2 2 0 0 TP chr20 61561920 61562503 0 0 2 1 True 2 2 0 0 TP chr20 61653427 61654067 1 1 0 0 False 1 1 0 0 TP chr20 61723833 61724244 1 1 0 0 False 1 1 0 0 TP -chr20 61744188 61744654 2 2 1 0 True 2 2 0 0 TP -chr20 61783403 61784839 3 3 1 0 True 4 4 0 0 TP +chr20 61744188 61744654 2 2 0 0 False 2 2 0 0 TP +chr20 61783403 61784839 3 3 0 0 False 3 3 0 0 TP chr20 61919676 61921374 2 2 0 0 False 2 2 0 0 TP chr20 62057573 62059139 1 1 3 1 True 7 7 0 0 TP chr20 62212844 62213874 4 4 0 0 False 4 4 0 0 TP @@ -161,7 +161,7 @@ chr20 62270279 62271094 1 1 3 1 True 4 4 0 0 TP chr20 62317837 62318459 2 2 0 0 False 2 2 0 0 TP chr20 62321031 62321873 2 2 3 0 True 5 5 0 0 TP chr20 62349612 62349932 1 1 5 1 True 6 6 0 0 TP -chr20 62360190 62360712 0 0 8 2 True 5 5 0 0 TP +chr20 62360190 62360712 0 0 7 2 True 5 5 0 0 TP chr20 62417424 62418062 3 3 0 0 False 3 3 0 0 TP chr20 62422444 62422791 2 2 0 0 False 2 2 0 0 TP chr20 62504279 62504614 2 2 0 0 False 2 2 0 0 TP @@ -190,7 +190,7 @@ chr20 63294079 63294266 2 2 0 0 False 2 2 0 0 TP chr20 63303655 63304858 2 2 0 0 False 2 2 0 0 TP chr20 63334892 63335504 2 2 0 0 False 2 2 0 0 TP chr20 63361274 63361471 2 2 0 0 False 2 2 0 0 TP -chr20 63372065 63372691 2 2 1 0 True 2 2 0 0 TP +chr20 63372065 63372691 2 2 0 0 False 2 2 0 0 TP chr20 63392592 63392812 1 1 0 0 False 1 1 0 0 TP chr20 63401702 63402319 1 1 0 0 False 1 1 0 0 TP chr20 63450491 63450613 2 2 0 0 False 2 2 0 0 TP @@ -199,7 +199,7 @@ chr20 63491589 63493064 1 1 3 1 True 4 4 0 0 TP chr20 63535674 63536245 1 1 2 0 True 1 1 0 0 TP chr20 63546436 63546842 1 1 0 0 False 1 1 0 0 TP chr20 63553294 63553757 1 1 0 0 False 1 1 0 0 TP -chr20 63559288 63559963 1 1 4 1 True 4 4 0 0 TP +chr20 63559288 63559963 1 1 2 1 True 4 4 0 0 TP chr20 63641779 63642135 1 1 2 1 True 2 2 0 0 TP chr20 63669579 63670496 2 2 0 0 False 2 2 0 0 TP chr20 
63693325 63693884 1 1 6 1 True 10 10 0 0 TP @@ -217,7 +217,7 @@ chr20 64042716 64043061 1 1 0 0 False 1 1 0 0 TP chr20 64065853 64066011 0 0 0 1 False 0 0 0 1 FP chr20 64090704 64091389 0 0 0 2 False 0 0 0 2 FP chr20 64096658 64097164 0 0 0 2 False 0 0 0 2 FP -chr20 64125109 64127974 3 3 3 0 True 12 12 0 0 TP +chr20 64125109 64127974 3 3 2 0 True 12 12 0 0 TP chr20 64131804 64133955 5 5 8 1 True 24 24 0 0 TP chr20 64134883 64136386 3 3 1 0 True 4 4 0 0 TP chr20 64157894 64158976 2 2 0 0 False 2 2 0 0 TP diff --git a/repo_utils/answer_key/refine/refine_output_one/refine.variant_summary.json b/repo_utils/answer_key/refine/refine_output_one/refine.variant_summary.json index edc91450..b9464c8e 100644 --- a/repo_utils/answer_key/refine/refine_output_one/refine.variant_summary.json +++ b/repo_utils/answer_key/refine/refine_output_one/refine.variant_summary.json @@ -1,11 +1,11 @@ { - "TP-base": 593, - "TP-comp": 593, + "TP-base": 594, + "TP-comp": 594, "FP": 15, "FN": 2, - "precision": 0.975328947368421, - "recall": 0.9966386554621849, - "f1": 0.9858686616791356, - "base cnt": 595, - "comp cnt": 608 + "precision": 0.9753694581280788, + "recall": 0.9966442953020134, + "f1": 0.9858921161825727, + "base cnt": 596, + "comp cnt": 609 } \ No newline at end of file diff --git a/repo_utils/answer_key/refine/refine_output_three/refine.region_summary.json b/repo_utils/answer_key/refine/refine_output_three/refine.region_summary.json index af76b54f..20e8fdaa 100644 --- a/repo_utils/answer_key/refine/refine_output_three/refine.region_summary.json +++ b/repo_utils/answer_key/refine/refine_output_three/refine.region_summary.json @@ -2,17 +2,17 @@ "TP": 22, "TN": 0, "FP": 24, - "FN": 58, - "base P": 83, + "FN": 48, + "base P": 73, "base N": 9, - "comp P": 91, + "comp P": 81, "comp N": 1, - "PPV": 0.24175824175824176, - "TPR": 0.26506024096385544, + "PPV": 0.2716049382716049, + "TPR": 0.3013698630136986, "TNR": 0.0, "NPV": 0.0, - "ACC": 0.2391304347826087, - "BA": 0.13253012048192772, - "F1": 0.25287356321839083, + "ACC": 0.2682926829268293, + "BA": 0.1506849315068493, + "F1": 0.2857142857142857, "UND": 0 } \ No newline at end of file diff --git a/repo_utils/answer_key/refine/refine_output_three/refine.regions.txt b/repo_utils/answer_key/refine/refine_output_three/refine.regions.txt index 4866a470..18097dbd 100644 --- a/repo_utils/answer_key/refine/refine_output_three/refine.regions.txt +++ b/repo_utils/answer_key/refine/refine_output_three/refine.regions.txt @@ -1,37 +1,31 @@ chrom start end in_tpbase in_tp in_fn in_fp refined out_tpbase out_tp out_fn out_fp state chr20 278819 279179 3 3 1 0 False 3 3 1 0 FN chr20 641802 642530 3 3 2 0 False 3 3 2 0 FN -chr20 2240850 2241400 1 1 5 1 True 1 1 4 1 FN,FP +chr20 2240850 2241400 1 1 4 1 True 1 1 4 1 FN,FP chr20 4032247 4033338 2 2 2 1 True 2 2 2 2 FN,FP chr20 5040366 5040587 0 0 0 2 False 0 0 0 2 FP chr20 5041831 5042378 1 1 1 1 True 2 2 0 0 TP -chr20 7720842 7721078 1 1 1 0 False 1 1 1 0 FN chr20 8661834 8662229 1 1 3 1 True 4 4 0 0 TP chr20 10802617 10802954 1 1 1 0 False 1 1 1 0 FN chr20 13848162 13848654 3 3 1 0 False 3 3 1 0 FN chr20 14861944 14862754 5 5 5 1 True 6 6 1 0 FN chr20 16257744 16259315 2 2 2 0 False 2 2 2 0 FN chr20 16395091 16395483 3 3 1 0 False 3 3 1 0 FN -chr20 17081183 17081475 2 2 1 0 False 2 2 1 0 FN chr20 18209029 18210244 3 3 2 1 True 6 6 0 2 FP chr20 20295904 20296440 2 2 2 1 True 5 5 0 0 TP chr20 20320229 20320629 1 1 2 1 True 4 4 0 0 TP chr20 20337175 20337734 1 1 6 2 True 3 3 0 0 TP chr20 20354802 20355545 3 3 1 0 False 3 3 1 0 FN 
-chr20 20356420 20357920 4 4 8 1 True 9 9 2 0 FN -chr20 21120188 21120571 2 2 1 0 False 2 2 1 0 FN +chr20 20356420 20357920 4 4 7 1 True 9 9 2 0 FN chr20 21721341 21721756 2 2 1 0 False 2 2 1 0 FN chr20 22082156 22084015 4 4 3 1 True 3 3 4 1 FN,FP chr20 23155468 23155967 3 3 2 2 True 4 4 1 1 FN,FP chr20 23560829 23561208 1 1 2 2 True 2 2 0 0 TP -chr20 24407963 24408930 3 3 1 0 False 3 3 1 0 FN chr20 24681956 24682235 2 2 1 0 False 2 2 1 0 FN chr20 25781680 25781901 0 0 0 1 False 0 0 0 1 FP chr20 32722934 32723155 0 0 1 0 False 0 0 1 0 FN chr20 34235788 34236091 0 0 2 1 True 2 2 0 0 TP -chr20 35539102 35539692 4 4 1 0 False 4 4 1 0 FN chr20 35580576 35580866 1 1 2 1 True 3 3 2 1 FN,FP -chr20 37361675 37361996 2 2 1 0 False 2 2 1 0 FN chr20 38123689 38124113 2 2 1 4 True 3 3 3 4 FN,FP chr20 38463887 38464454 2 2 2 0 False 2 2 2 0 FN chr20 41196260 41196605 3 3 1 0 False 3 3 1 0 FN @@ -44,11 +38,10 @@ chr20 50775536 50775942 1 1 2 1 True 1 1 0 1 FP chr20 51953709 51953930 0 0 0 1 False 0 0 0 1 FP chr20 53203989 53204362 2 2 2 1 True 2 2 0 0 TP chr20 55624698 55625762 6 6 6 0 False 6 6 6 0 FN -chr20 55627528 55628415 7 7 4 0 False 7 7 4 0 FN +chr20 55627528 55628415 7 7 3 0 False 7 7 3 0 FN chr20 55944162 55945285 2 2 2 1 True 2 2 2 0 FN chr20 56280431 56282023 4 4 5 1 True 4 4 2 1 FN,FP chr20 57090758 57091276 1 1 2 0 False 1 1 2 0 FN -chr20 57110340 57110703 2 2 1 0 False 2 2 1 0 FN chr20 57190146 57190538 0 0 3 1 True 1 1 0 0 TP chr20 57350746 57351030 1 1 1 0 False 1 1 1 0 FN chr20 57948891 57949456 1 1 4 1 True 1 1 4 1 FN,FP @@ -58,16 +51,14 @@ chr20 60702895 60703197 2 2 2 1 True 3 3 0 0 TP chr20 61100811 61102515 1 1 4 1 True 4 4 0 0 TP chr20 61201712 61202352 2 2 4 1 True 3 3 0 0 TP chr20 61282815 61283589 4 4 2 0 False 4 4 2 0 FN -chr20 61289552 61290383 1 1 2 1 True 2 2 0 0 TP +chr20 61289552 61290383 1 1 1 1 True 2 2 0 0 TP chr20 61329235 61329551 0 0 0 2 False 0 0 0 2 FP chr20 61561999 61562362 0 0 2 1 True 1 1 0 0 TP -chr20 61744291 61744702 2 2 1 0 False 2 2 1 0 FN -chr20 61783848 61784808 3 3 1 0 False 3 3 1 0 FN chr20 62057492 62058878 1 1 3 1 True 2 2 0 0 TP chr20 62270303 62270937 1 1 3 1 True 4 4 0 0 TP chr20 62321286 62321840 2 2 3 0 False 2 2 3 0 FN chr20 62349531 62349936 1 1 5 1 True 1 1 2 1 FN,FP -chr20 62360300 62360712 0 0 8 2 True 4 4 2 1 FN,FP +chr20 62360300 62360712 0 0 7 2 True 4 4 2 1 FN,FP chr20 62830540 62830807 2 2 1 1 True 1 1 0 0 TP chr20 62875131 62875514 2 2 3 0 False 2 2 3 0 FN chr20 63027956 63029140 4 4 1 1 True 3 3 0 0 TP @@ -75,10 +66,9 @@ chr20 63048983 63049269 3 3 1 0 False 3 3 1 0 FN chr20 63154577 63155031 1 1 1 0 False 1 1 1 0 FN chr20 63167363 63167674 2 2 1 0 False 2 2 1 0 FN chr20 63221399 63221831 1 1 2 1 True 2 2 0 0 TP -chr20 63372104 63372510 2 2 1 0 False 2 2 1 0 FN chr20 63491847 63492500 1 1 3 1 True 4 4 0 0 TP chr20 63535641 63536112 1 1 2 0 False 1 1 2 0 FN -chr20 63559305 63559829 1 1 4 1 True 5 5 0 0 TP +chr20 63559305 63559829 1 1 2 1 True 5 5 0 0 TP chr20 63641737 63642125 1 1 2 1 True 3 3 0 0 TP chr20 63693339 63693842 1 1 6 1 True 10 10 1 0 FN chr20 63770826 63771124 0 0 0 2 False 0 0 0 2 FP @@ -87,7 +77,7 @@ chr20 63964695 63966223 1 1 1 0 False 1 1 1 0 FN chr20 64065772 64065993 0 0 0 1 False 0 0 0 1 FP chr20 64090623 64091117 0 0 0 2 False 0 0 0 2 FP chr20 64096929 64097150 0 0 0 2 False 0 0 0 2 FP -chr20 64125250 64127985 3 3 3 0 False 3 3 3 0 FN +chr20 64125250 64127985 3 3 2 0 False 3 3 2 0 FN chr20 64131803 64133966 5 5 8 1 True 5 5 6 5 FN,FP chr20 64134880 64136440 3 3 1 0 False 3 3 1 0 FN chr20 64173328 64176440 4 4 7 
3 True 9 9 3 6 FN,FP diff --git a/repo_utils/answer_key/refine/refine_output_three/refine.variant_summary.json b/repo_utils/answer_key/refine/refine_output_three/refine.variant_summary.json index abb55304..13ae86d6 100644 --- a/repo_utils/answer_key/refine/refine_output_three/refine.variant_summary.json +++ b/repo_utils/answer_key/refine/refine_output_three/refine.variant_summary.json @@ -2,10 +2,10 @@ "TP-base": 452, "TP-comp": 452, "FP": 43, - "FN": 107, + "FN": 95, "precision": 0.9131313131313131, - "recall": 0.8085867620751341, - "f1": 0.857685009487666, - "base cnt": 559, + "recall": 0.8263254113345521, + "f1": 0.8675623800383877, + "base cnt": 547, "comp cnt": 495 } \ No newline at end of file diff --git a/repo_utils/answer_key/refine/refine_output_two/refine.regions.txt b/repo_utils/answer_key/refine/refine_output_two/refine.regions.txt index 0902bf25..8e0d4009 100644 --- a/repo_utils/answer_key/refine/refine_output_two/refine.regions.txt +++ b/repo_utils/answer_key/refine/refine_output_two/refine.regions.txt @@ -18,15 +18,15 @@ chr20 13848064 13848749 3 3 1 0 True 3 3 0 0 TP chr20 14861837 14862841 5 5 5 1 True 15 15 0 0 TP chr20 16226854 16228587 3 3 0 0 False 3 3 0 0 TP chr20 16395049 16395521 3 3 1 0 True 3 3 0 0 TP -chr20 17081116 17081510 2 2 1 0 True 2 2 0 0 TP +chr20 17081116 17081510 2 2 0 0 False 2 2 0 0 TP chr20 17197023 17197341 2 2 0 0 False 2 2 0 0 TP chr20 18208997 18210827 3 3 2 1 True 5 5 0 0 TP chr20 18675550 18676015 2 2 0 0 False 2 2 0 0 TP chr20 19663266 19663626 2 2 0 0 False 2 2 0 0 TP chr20 20337100 20337807 1 1 6 2 True 3 3 0 0 TP -chr20 20354470 20358423 7 7 9 1 True 16 16 0 0 TP +chr20 20354470 20358423 7 7 8 1 True 16 16 0 0 TP chr20 20458540 20459037 2 2 0 0 False 2 2 0 0 TP -chr20 21120112 21120639 2 2 1 0 True 1 1 0 0 TP +chr20 21120112 21120639 2 2 0 0 False 2 2 0 0 TP chr20 21721074 21721835 2 2 1 0 True 3 3 0 0 TP chr20 22471731 22472229 2 2 0 0 False 2 2 0 0 TP chr20 22882602 22882930 2 2 0 0 False 2 2 0 0 TP @@ -61,11 +61,11 @@ chr20 50853539 50854028 2 2 0 0 False 2 2 0 0 TP chr20 51500492 51500980 2 2 0 0 False 2 2 0 0 TP chr20 53203887 53204573 2 2 2 1 True 4 4 0 0 TP chr20 53415542 53415960 2 2 0 0 False 2 2 0 0 TP -chr20 55627244 55628550 7 7 4 0 True 8 8 0 0 TP +chr20 55627244 55628550 7 7 3 0 True 8 8 0 0 TP chr20 55943819 55945907 2 2 2 1 True 5 5 0 0 TP chr20 56284158 56284507 2 2 0 0 False 2 2 0 0 TP chr20 56710455 56711293 2 2 0 0 False 2 2 0 0 TP -chr20 57110112 57110885 2 2 1 0 True 2 2 0 0 TP +chr20 57110112 57110885 2 2 0 0 False 2 2 0 0 TP chr20 57417171 57418334 2 2 0 0 False 2 2 0 0 TP chr20 57974704 57975302 2 2 0 0 False 2 2 0 0 TP chr20 58196600 58197036 2 2 0 0 False 2 2 0 0 TP @@ -82,8 +82,8 @@ chr20 61329158 61329677 0 0 0 2 False 0 0 0 2 FP chr20 61331428 61331853 2 2 0 0 False 2 2 0 0 TP chr20 61337105 61337690 2 2 0 0 False 2 2 0 0 TP chr20 61475306 61475826 2 2 0 0 False 2 2 0 0 TP -chr20 61744088 61744754 2 2 1 0 True 2 2 0 0 TP -chr20 61783303 61784939 3 3 1 0 True 4 4 0 0 TP +chr20 61744088 61744754 2 2 0 0 False 2 2 0 0 TP +chr20 61783303 61784939 3 3 0 0 False 3 3 0 0 TP chr20 61919576 61921474 2 2 0 0 False 2 2 0 0 TP chr20 62057473 62059239 1 1 3 1 True 7 7 0 0 TP chr20 62212744 62213974 4 4 0 0 False 4 4 0 0 TP @@ -110,11 +110,11 @@ chr20 63293979 63294366 2 2 0 0 False 2 2 0 0 TP chr20 63303555 63304958 2 2 0 0 False 2 2 0 0 TP chr20 63334792 63335604 2 2 0 0 False 2 2 0 0 TP chr20 63361174 63361571 2 2 0 0 False 2 2 0 0 TP -chr20 63371965 63372791 2 2 1 0 True 2 2 0 0 TP +chr20 63371965 63372791 2 2 0 0 False 
2 2 0 0 TP chr20 63450391 63450713 2 2 0 0 False 2 2 0 0 TP chr20 63486663 63487516 2 2 0 0 False 2 2 0 0 TP chr20 63491489 63493164 1 1 3 1 True 4 4 0 0 TP -chr20 63559188 63560063 1 1 4 1 True 4 4 0 0 TP +chr20 63559188 63560063 1 1 2 1 True 4 4 0 0 TP chr20 63641679 63642235 1 1 2 1 True 2 2 0 0 TP chr20 63669479 63670596 2 2 0 0 False 2 2 0 0 TP chr20 63693225 63693984 1 1 6 1 True 10 10 0 0 TP @@ -123,7 +123,7 @@ chr20 63824531 63825244 2 2 0 0 False 2 2 0 0 TP chr20 63948414 63948858 2 2 1 1 True 3 3 0 0 TP chr20 64090604 64091489 0 0 0 2 False 0 0 0 2 FP chr20 64096558 64097264 0 0 0 2 False 0 0 0 2 FP -chr20 64125009 64128074 3 3 3 0 True 12 12 0 0 TP +chr20 64125009 64128074 3 3 2 0 True 12 12 0 0 TP chr20 64131704 64134055 5 5 8 1 True 24 24 0 0 TP chr20 64134783 64136486 3 3 1 0 True 4 4 0 0 TP chr20 64173309 64176629 4 4 7 3 True 17 17 0 0 TP From b63c5c34f7ec440f833d3e7c840dc695b6cc288e Mon Sep 17 00:00:00 2001 From: Adam English Date: Wed, 7 Aug 2024 11:31:59 -0400 Subject: [PATCH 04/16] Fix #221 `stratify -w` is now default. `-w` is removed. `-v` is the old default. Or.. stratify by default counts variants within regions and `-v` counts those outside regions --- repo_utils/sub_tests/stratify.sh | 2 +- truvari/stratify.py | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/repo_utils/sub_tests/stratify.sh b/repo_utils/sub_tests/stratify.sh index 812f0a0c..b7f3257a 100644 --- a/repo_utils/sub_tests/stratify.sh +++ b/repo_utils/sub_tests/stratify.sh @@ -1,7 +1,7 @@ # ------------------------------------------------------------ # stratify # ------------------------------------------------------------ -run test_stratify $truv stratify -w \ +run test_stratify $truv stratify \ $INDIR/beds/include.bed \ $INDIR/variants/input1.vcf.gz \ -o $OD/stratify.txt diff --git a/truvari/stratify.py b/truvari/stratify.py index 60c81b5c..7535554a 100644 --- a/truvari/stratify.py +++ b/truvari/stratify.py @@ -1,5 +1,5 @@ """ -Count variants per-region in vcf +Count variants which start and end within each bed region """ import os import logging @@ -30,8 +30,8 @@ def parse_args(args): help="Output bed-like file") parser.add_argument("--header", action="store_true", help="Input regions have header to preserve in output") - parser.add_argument("-w", "--within", action="store_true", - help="Only count variants contained completely within region boundaries") + parser.add_argument("-v", "--complement", action="store_false", + help="Only count variants not within region boundaries") parser.add_argument("--debug", action="store_true", help="Verbose logging") args = parser.parse_args(args) @@ -62,7 +62,7 @@ def count_entries(vcf, chroms, regions, within): return counts -def benchdir_count_entries(benchdir, regions, within=False, threads=4): +def benchdir_count_entries(benchdir, regions, within=True, threads=4): """ Count the number of variants per bed region in Truvari bench directory by state @@ -95,12 +95,12 @@ def stratify_main(cmdargs): regions = pd.read_csv(args.regions, sep='\t', header=read_header) r_list = regions.to_numpy().tolist() # the methods expect lists if os.path.isdir(args.in_vcf): - counts = benchdir_count_entries(args.in_vcf, r_list, args.within) + counts = benchdir_count_entries(args.in_vcf, r_list, args.complement) else: chroms = np.array([_[0] for _ in r_list]) intvs = np.array([[_[1], _[2]] for _ in r_list]) counts = count_entries(pysam.VariantFile( - args.in_vcf), chroms, intvs, args.within) + args.in_vcf), chroms, intvs, args.complement) counts = 
pd.Series(counts, name="count").to_frame() counts.index = regions.index regions = regions.join(counts) From e0f9ce61c8766430a3b3f8c642f44d382e329e12 Mon Sep 17 00:00:00 2001 From: Adam English Date: Sun, 11 Aug 2024 13:42:53 -0400 Subject: [PATCH 05/16] update bed parsing slightly faster IntervalTree.from_tuples than .addi --- README.md | 1 + repo_utils/answer_key/anno/anno_bpovl.jl | Bin 2393 -> 2410 bytes truvari/annotations/density.py | 4 ++-- truvari/region_vcf_iter.py | 7 +++++-- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index bdbc7227..69deb8d4 100644 --- a/README.md +++ b/README.md @@ -52,6 +52,7 @@ Use Truvari's comparison engine to consolidate redundant variants in a merged mu - [segment](https://github.com/acenglish/truvari/wiki/segment) - Normalization of SVs into disjointed genomic regions - [stratify](https://github.com/acenglish/truvari/wiki/stratify) - Count variants per-region in vcf - [divide](https://github.com/ACEnglish/truvari/wiki/divide) - Divide a VCF into independent shards + - [ga4gh](https://github.com/ACEnglish/truvari/wiki/ga4gh) - Consolidate benchmarking result VCFs ## 🔎 More Information diff --git a/repo_utils/answer_key/anno/anno_bpovl.jl b/repo_utils/answer_key/anno/anno_bpovl.jl index 00328b57e8e4ffe188c3641446f8c64abf7baf43..11e3e0c8fad7ccad07fddeb64c92f702ab60c767 100644 GIT binary patch delta 278 zcmca9^h$`Ofn{pqMiwqcjvm&OlFEYADU$^l;~C>O=Q9>CGfGc>z@jRu*pZ>+El`@o z{T~6iCr@M*pRBz0S&&tnk;bI70Mt@OC_S|b%E(Zitj+GgXfZj5 sT~|adL$MR$j|QgXywco)O1=2W`HVS}B{-%qT1{Tdq0i_v`7TEd0KvIFk^lez delta 260 zcmaDQbW@0>fn{p+MiwqcrVPo+VvO;V_p;|~e#FSa%qTVaIg2WfLPv(8w?JtU)Bpe6 zlNYjzPqttcn7jgrp|l`B0|YQaX@SX#tm2F`CZz?Sw!x^WRZv!j%4BPH2S)SBHSD^= kvKb1U5a%~ACFhmq7F6mj-pkI&XgPTwhd!g@ Date: Mon, 26 Aug 2024 12:46:26 -0400 Subject: [PATCH 06/16] correct monomorphic reference check --- truvari/matching.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/truvari/matching.py b/truvari/matching.py index 60b85958..fd114b4a 100644 --- a/truvari/matching.py +++ b/truvari/matching.py @@ -155,7 +155,7 @@ def filter_call(self, entry, base=False): Returns True if the call should be filtered Base has different filtering requirements, so let the method know """ - if self.params.check_monref and entry.alts and entry.alts[0] in (None, '*'): # ignore monomorphic reference + if self.params.check_monref and (not entry.alts or entry.alts[0] in (None, '*')): # ignore monomorphic reference return True if self.params.check_multi and len(entry.alts) > 1: From a39ce0ac4af2741a475fd92b82b74549bdacf112 Mon Sep 17 00:00:00 2001 From: Adam English Date: Mon, 26 Aug 2024 13:50:01 -0400 Subject: [PATCH 07/16] test coverage for monref --- repo_utils/run_unittest.py | 15 +++++++++++++++ repo_utils/test_files/variants/filter.vcf | 10 ++++++++++ truvari/matching.py | 5 +---- 3 files changed, 26 insertions(+), 4 deletions(-) create mode 100644 repo_utils/test_files/variants/filter.vcf diff --git a/repo_utils/run_unittest.py b/repo_utils/run_unittest.py index efe22db2..74fdae7f 100644 --- a/repo_utils/run_unittest.py +++ b/repo_utils/run_unittest.py @@ -70,3 +70,18 @@ vcf = pysam.VariantFile(vcf_fn) for entry in truvari.region_filter_fetch(vcf, tree, False): assert entry.info['include'] == 'in', f"Bad in {str(entry)}" + + +""" +Filtering logic +""" +vcf = pysam.VariantFile("repo_utils/test_files/variants/filter.vcf") +matcher = truvari.Matcher() +matcher.params.sizemin = 0 +matcher.params.sizefilt = 0 +matcher.params.passonly = True 
+for entry in vcf: + try: + assert matcher.filter_call(entry), f"Didn't filter {str(entry)}" + except ValueError as e: + assert e.args[0].startswith("Cannot compare multi-allelic"), f"Unknown exception {str(entry)}" diff --git a/repo_utils/test_files/variants/filter.vcf b/repo_utils/test_files/variants/filter.vcf new file mode 100644 index 00000000..cbffdf9a --- /dev/null +++ b/repo_utils/test_files/variants/filter.vcf @@ -0,0 +1,10 @@ +##fileformat=VCFv4.2 +##contig= +##FORMAT= +##INFO= +##FILTER= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG002 +reference 6 F1 TATATAT . 30 . reason=monref GT 1|1 +reference 7 F2 ATATGCG * 30 . reason=star GT 1|1 +reference 8 F3 ATATGCG A 30 FAIL reason=Fail GT 1|1 +reference 9 F3 ATATGCG A,C 30 . reason=multi GT 1|1 diff --git a/truvari/matching.py b/truvari/matching.py index fd114b4a..95b225f7 100644 --- a/truvari/matching.py +++ b/truvari/matching.py @@ -1,7 +1,6 @@ """ Comparison engine """ -import sys import types import logging from collections import Counter, defaultdict @@ -159,9 +158,7 @@ def filter_call(self, entry, base=False): return True if self.params.check_multi and len(entry.alts) > 1: - logging.error("Cannot compare multi-allelic records. Please split") - logging.error("line %s", str(entry)) - sys.exit(10) + raise ValueError(f"Cannot compare multi-allelic records. Please split\nline {str(entry)}") if self.params.passonly and truvari.entry_is_filtered(entry): return True From ffa0e813f7f8357d1b33d3e0c5ec887067eebf8f Mon Sep 17 00:00:00 2001 From: Adam English Date: Wed, 28 Aug 2024 21:23:35 -0400 Subject: [PATCH 08/16] off-by-one error in stratify. Messed with truvari refine --- truvari/stratify.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/truvari/stratify.py b/truvari/stratify.py index 7535554a..cdade0cf 100644 --- a/truvari/stratify.py +++ b/truvari/stratify.py @@ -53,10 +53,9 @@ def count_entries(vcf, chroms, regions, within): for idx, row in enumerate(zip(chroms, regions)): chrom, coords = row start, end = coords - end += 1 tree[chrom].addi(start, end + 1) counts_idx[(chrom, start, end)] = idx - for _, location in truvari.region_filter(vcf, tree, within, True): + for _, location in truvari.region_filter(vcf, tree, inside=within, with_region=True): key = (location[0], location[1].begin, location[1].end - 1) counts[counts_idx[key]] += 1 return counts From 967a5b66af08354fe95bb816a9fca41fbd1cad6f Mon Sep 17 00:00:00 2001 From: Adam English Date: Wed, 4 Sep 2024 11:02:33 -0400 Subject: [PATCH 09/16] raising warnings about --reference deprecation it will be removed at some point --- truvari/bench.py | 6 ++++-- truvari/collapse.py | 5 +++++ truvari/matching.py | 2 -- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/truvari/bench.py b/truvari/bench.py index 52e38799..9dc90d66 100644 --- a/truvari/bench.py +++ b/truvari/bench.py @@ -143,8 +143,10 @@ def check_params(args): if args.includebed and not os.path.exists(args.includebed): logging.error("Include bed %s does not exist", args.includebed) check_fail = True - if args.reference and not os.path.exists(args.reference): - logging.error("Reference %s does not exist", args.reference) + if args.reference: + logging.warning("`--reference` is no longer recommended and will be deprecated by v5") + not os.path.exists(args.reference): + logging.error("Reference %s does not exist", args.reference) check_fail = True return check_fail diff --git a/truvari/collapse.py b/truvari/collapse.py index 27ff4027..4e56c8b8 100644 --- a/truvari/collapse.py +++ 
b/truvari/collapse.py @@ -566,6 +566,11 @@ def check_params(args): if args.hap and args.keep != "first": check_fail = True logging.error("Using --hap must use --keep first") + if args.reference: + logging.warning("`--reference` is no longer recommended and will be deprecated by v5") + not os.path.exists(args.reference): + logging.error("Reference %s does not exist", args.reference) + check_fail = True return check_fail diff --git a/truvari/matching.py b/truvari/matching.py index 95b225f7..5ff891e3 100644 --- a/truvari/matching.py +++ b/truvari/matching.py @@ -89,8 +89,6 @@ def __init__(self, args=None): self.reference = None if self.params.reference is not None: - #sys.stderr.write("WARNING `--reference` is no longer recommended for use with bench/collapse ") - #sys.stderr.write("results will be slower and less accurate.\n") self.reference = pysam.FastaFile(self.params.reference) @staticmethod From bce0b5b1fcf5f1dfda5740826bfc6a66213a98ec Mon Sep 17 00:00:00 2001 From: Adam English Date: Wed, 4 Sep 2024 11:12:23 -0400 Subject: [PATCH 10/16] syntax error --- truvari/bench.py | 2 +- truvari/collapse.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/truvari/bench.py b/truvari/bench.py index 9dc90d66..83aed73a 100644 --- a/truvari/bench.py +++ b/truvari/bench.py @@ -145,7 +145,7 @@ def check_params(args): check_fail = True if args.reference: logging.warning("`--reference` is no longer recommended and will be deprecated by v5") - not os.path.exists(args.reference): + if not os.path.exists(args.reference): logging.error("Reference %s does not exist", args.reference) check_fail = True return check_fail diff --git a/truvari/collapse.py b/truvari/collapse.py index 4e56c8b8..a1388912 100644 --- a/truvari/collapse.py +++ b/truvari/collapse.py @@ -568,7 +568,7 @@ def check_params(args): logging.error("Using --hap must use --keep first") if args.reference: logging.warning("`--reference` is no longer recommended and will be deprecated by v5") - not os.path.exists(args.reference): + if not os.path.exists(args.reference): logging.error("Reference %s does not exist", args.reference) check_fail = True return check_fail From ee167cc91feb76b3023fefff82fff346ed721223 Mon Sep 17 00:00:00 2001 From: Adam English Date: Wed, 4 Sep 2024 11:16:25 -0400 Subject: [PATCH 11/16] fixing reference warning --- truvari/bench.py | 2 +- truvari/collapse.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/truvari/bench.py b/truvari/bench.py index 83aed73a..2b880852 100644 --- a/truvari/bench.py +++ b/truvari/bench.py @@ -147,7 +147,7 @@ def check_params(args): logging.warning("`--reference` is no longer recommended and will be deprecated by v5") if not os.path.exists(args.reference): logging.error("Reference %s does not exist", args.reference) - check_fail = True + check_fail = True return check_fail diff --git a/truvari/collapse.py b/truvari/collapse.py index a1388912..e715d924 100644 --- a/truvari/collapse.py +++ b/truvari/collapse.py @@ -570,7 +570,7 @@ def check_params(args): logging.warning("`--reference` is no longer recommended and will be deprecated by v5") if not os.path.exists(args.reference): logging.error("Reference %s does not exist", args.reference) - check_fail = True + check_fail = True return check_fail From 82330d43c1b89158dd441776017755d92e1a6ea3 Mon Sep 17 00:00:00 2001 From: Adam English Date: Sat, 7 Sep 2024 11:06:20 -0400 Subject: [PATCH 12/16] new anno chunks command --- repo_utils/answer_key/anno/chunks.bed | 11 ++++ repo_utils/sub_tests/anno.sh | 10 ++- 
truvari/anno.py | 1 + truvari/annotations/__init__.py | 2 + truvari/annotations/chunks.py | 77 +++++++++++++++++++++++ truvari/collapse.py | 88 ++++++++++++++++++++++++--- 6 files changed, 179 insertions(+), 10 deletions(-) create mode 100644 repo_utils/answer_key/anno/chunks.bed create mode 100644 truvari/annotations/chunks.py diff --git a/repo_utils/answer_key/anno/chunks.bed b/repo_utils/answer_key/anno/chunks.bed new file mode 100644 index 00000000..3a6ee6d1 --- /dev/null +++ b/repo_utils/answer_key/anno/chunks.bed @@ -0,0 +1,11 @@ +chr20 149012 149095 3 3 3 +chr20 278929 279098 7 7 7 +chr20 280210 280275 1 1 1 +chr20 306267 306268 2 2 2 +chr20 380877 380878 1 1 1 +chr20 420664 420665 2 2 2 +chr20 613782 613837 1 1 1 +chr20 641905 642391 11 11 11 +chr20 709758 709852 2 2 2 +chr20 764441 764537 3 3 3 +chr20 949515 949619 1 1 1 diff --git a/repo_utils/sub_tests/anno.sh b/repo_utils/sub_tests/anno.sh index 961c3c4d..91325985 100644 --- a/repo_utils/sub_tests/anno.sh +++ b/repo_utils/sub_tests/anno.sh @@ -171,10 +171,18 @@ if [ $test_anno_grpaf_subset ]; then assert_equal $(fn_md5 $ANSDIR/anno/anno_grpaf.subtags.vcf) $(fn_md5 $OD/anno_grpaf.subtags.vcf) fi -# +# add id run test_anno_addid \ $truv anno addid $INDIR/variants/multi.vcf.gz -o $OD/addid.vcf if [ $test_anno_addid ]; then assert_exit_code 0 assert_equal $(fn_md5 $ANSDIR/anno/addid.vcf) $(fn_md5 $OD/addid.vcf) fi + + +# chunks +run test_anno_chunks $truv anno chunks $INDIR/variants/multi.vcf.gz -o $OD/chunks.bed +if [ $test_anno_chunks ]; then + assert_exit_code 0 + assert_equal $(fn_md5 $ANSDIR/anno/chunks.bed) $(fn_md5 $OD/chunks.bed) +fi diff --git a/truvari/anno.py b/truvari/anno.py index 62ce7592..29189eda 100644 --- a/truvari/anno.py +++ b/truvari/anno.py @@ -8,6 +8,7 @@ ANNOS = { "addid": ("Set ID field", tannos.addid_main), "bpovl": ("Annotation Intersection", tannos.bpovl_main), + "chunks": ("Chunk Boundaries and Variant Counts", tannos.chunks_main), "density": ("Variant Density", tannos.density_main), "dpcnt": ("Call Depth Counts", tannos.dpcnt_main), "gcpct": ("GC Percent", tannos.gcpct_main), diff --git a/truvari/annotations/__init__.py b/truvari/annotations/__init__.py index ad9e04f2..1a7de4a5 100644 --- a/truvari/annotations/__init__.py +++ b/truvari/annotations/__init__.py @@ -1,6 +1,7 @@ """ annotation modules """ from truvari.annotations.addid import addid_main from truvari.annotations.bpovl import bpovl_main +from truvari.annotations.chunks import chunks_main from truvari.annotations.density import density_main from truvari.annotations.dpcnt import dpcnt_main from truvari.annotations.gccontent import gcpct_main @@ -18,6 +19,7 @@ __all__ = [ 'addid_main', 'bpovl_main', + 'chunks_main', 'density_main', 'dpcnt_main', 'gcpct_main', diff --git a/truvari/annotations/chunks.py b/truvari/annotations/chunks.py new file mode 100644 index 00000000..a303ca5c --- /dev/null +++ b/truvari/annotations/chunks.py @@ -0,0 +1,77 @@ +""" +Count the number of variants in each chunk +Column 3: total number of variants +Column 4: comma-deliminted number of sub-chunks after accounting for size +Column 5: comma-deliminted number of sub-chunks after accounting for size and distance again +""" +import sys +import argparse +import pysam + +import truvari +from truvari.collapse import tree_size_chunker, tree_dist_chunker + +def parse_args(args): + """ + Parse arguments + """ + parser = argparse.ArgumentParser(prog="chunks", description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument("input", type=str, + 
help="Input VCF") + parser.add_argument("-o", "--output", type=str, default="/dev/stdout", + help="Output name") + parser.add_argument("-c", "--chunksize", type=int, default=500, + help="Distance between variants to split chunks (%(default)s)") + parser.add_argument("-s", "--sizemin", type=int, default=50, + help="Minimum SV length") + parser.add_argument("-S", "--sizemax", type=int, default=50000, + help="Maximum SV length") + args = parser.parse_args(args) + truvari.setup_logging(show_version=True) + return args + +def get_bounds(cnk): + """ + Min start and max end of variants + """ + mstart = sys.maxsize + mend = 0 + for i in cnk: + mstart = min(mstart, i.start) + mend = max(mend, i.stop) + return mstart, mend + +def chunks_main(args): + """ + Main + """ + args = parse_args(args) + v = pysam.VariantFile(args.input) + m = truvari.Matcher() + m.params.pctseq = 0 + m.params.sizemin = args.sizemin + m.params.sizefilt = args.sizemin + m.params.sizemax = args.sizemax + m.params.chunksize = args.chunksize + m.params.refdist = args.chunksize + c = truvari.chunker(m, ('base', v)) + + with open(args.output, 'w') as fout: + for chunk, _ in c: + if not chunk['base']: + continue + s, e = get_bounds(chunk['base']) + chrom = chunk['base'][0].chrom + num = len(chunk['base']) + fout.write(f"{chrom}\t{s}\t{e}\t{num}") + s_cnts = [] + d_cnts = [] + for i, _ in tree_size_chunker(m, [(chunk, 0)]): + if i['base']: + s_cnts.append(len(i['base'])) + for j, _ in tree_dist_chunker(m, [(i, 0)]): + d_cnts.append(len(j['base'])) + s_cnts = ",".join(map(str, s_cnts)) + d_cnts = ",".join(map(str, d_cnts)) + fout.write(f"\t{s_cnts}\t{d_cnts}\n") diff --git a/truvari/collapse.py b/truvari/collapse.py index e715d924..068054ee 100644 --- a/truvari/collapse.py +++ b/truvari/collapse.py @@ -16,7 +16,6 @@ import pysam import numpy as np -from intervaltree import IntervalTree import truvari import truvari.bench as trubench @@ -721,6 +720,76 @@ def dump_log(self): logging.info("%d samples' FORMAT fields consolidated", self["stats_box"]["consol_cnt"]) +class LinkedList: + """ + Simple linked list which should(?) be faster than concatenating a bunch + regular lists + """ + def __init__(self, data=None): + """ + init + """ + self.head = None + self.tail = None + if data is not None: + self.append(data) + + def append(self, data): + """ + Put data onto end of list + """ + new_node = (data, None) + if not self.head: + self.head = new_node + self.tail = new_node + return + self.tail[1] = new_node + self.tail = new_node + + + def to_list(self): + """ + Turn into a regular list + """ + cur_node = self.head + ret = [] + while cur_node: + ret.append(cur_node[0]) + cur_node = cur_node[1] + return ret + + def concatenate(self, other): + """ + Combine two linked lists + """ + if not self.head: # If the first list is empty + return other + self.tail[1] = other.head + self.tail = other.tail + return self + +def merge_intervals(intervals): + """ + Merge sorted list of tuples + """ + if not intervals: + return [] + merged = [] + + current_start, current_end, current_data = intervals[0] + for i in range(1, len(intervals)): + next_start, next_end, next_data = intervals[i] + + # Should be <=, maybe. 
but this replicates intervaltree + if next_start < current_end: + current_end = max(current_end, next_end) + current_data.concatenate(next_data) + else: + # No overlap + merged.append((current_start, current_end, current_data)) + current_start, current_end, current_data = next_start, next_end, next_data + merged.append((current_start, current_end, current_data)) + return merged def tree_size_chunker(matcher, chunks): """ @@ -733,8 +802,8 @@ def tree_size_chunker(matcher, chunks): chunk_count += 1 yield chunk, chunk_count continue - tree = IntervalTree() yield {'__filtered': chunk['__filtered'], 'base': []}, chunk_count + to_add = [] for entry in chunk['base']: # How much smaller/larger would be in sizesim? sz = truvari.entry_size(entry) @@ -743,17 +812,18 @@ def tree_size_chunker(matcher, chunks): sz *= - \ 1 if truvari.entry_variant_type( entry) == truvari.SV.DEL else 1 - tree.addi(sz - diff, sz + diff, data=[entry]) - tree.merge_overlaps(data_reducer=lambda x, y: x + y) + to_add.append((sz - diff, sz + diff, LinkedList(entry))) + tree = merge_intervals(to_add) for intv in tree: chunk_count += 1 - yield {'base': intv.data, '__filtered': []}, chunk_count + yield {'base': intv[2].to_list(), '__filtered': []}, chunk_count def tree_dist_chunker(matcher, chunks): """ To reduce the number of variants in a chunk try to sub-chunk by reference distance before hand Needs to return the same thing as a chunker + This does nothing """ chunk_count = 0 for chunk, _ in chunks: @@ -761,17 +831,17 @@ def tree_dist_chunker(matcher, chunks): chunk_count += 1 yield chunk, chunk_count continue - tree = IntervalTree() yield {'__filtered': chunk['__filtered'], 'base': []}, chunk_count + to_add = [] for entry in chunk['base']: st, ed = truvari.entry_boundaries(entry) st -= matcher.params.refdist ed += matcher.params.refdist - tree.addi(st, ed, data=[entry]) - tree.merge_overlaps(data_reducer=lambda x, y: x + y) + to_add.append((st, ed, LinkedList(entry))) + tree = merge_intervals(to_add) for intv in tree: chunk_count += 1 - yield {'base': intv.data, '__filtered': []}, chunk_count + yield {'base': intv[2].to_list(), '__filtered': []}, chunk_count def collapse_main(args): From 6aac99d25657ee0691817748603489ae6bd1d4c4 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Sat, 7 Sep 2024 15:08:52 +0000 Subject: [PATCH 13/16] Update coverage score --- imgs/coverage.svg | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/imgs/coverage.svg b/imgs/coverage.svg index ea71b13a..a9915353 100644 --- a/imgs/coverage.svg +++ b/imgs/coverage.svg @@ -17,7 +17,7 @@ coverage - 90% - 90% + 89% + 89% From feda14b5fa8577a24d3f9c8fd9d7004e311ec494 Mon Sep 17 00:00:00 2001 From: Adam English Date: Sat, 7 Sep 2024 11:42:41 -0400 Subject: [PATCH 14/16] update func tests --- imgs/coverage.svg | 4 ++-- .../answer_key/collapse/input1_median_collapsed.vcf | 5 +++-- repo_utils/answer_key/collapse/input1_median_removed.vcf | 3 +-- repo_utils/sub_tests/collapse.sh | 3 +++ truvari/collapse.py | 8 +++++--- 5 files changed, 14 insertions(+), 9 deletions(-) diff --git a/imgs/coverage.svg b/imgs/coverage.svg index ea71b13a..6a91c8d4 100644 --- a/imgs/coverage.svg +++ b/imgs/coverage.svg @@ -17,7 +17,7 @@ coverage - 90% - 90% + 91% + 91% diff --git a/repo_utils/answer_key/collapse/input1_median_collapsed.vcf b/repo_utils/answer_key/collapse/input1_median_collapsed.vcf index a5af9093..f3b9734b 100644 --- a/repo_utils/answer_key/collapse/input1_median_collapsed.vcf +++ 
b/repo_utils/answer_key/collapse/input1_median_collapsed.vcf @@ -888,7 +888,7 @@ chr20 419860 . AGTGACCCTGCACCTGGCT A 60 . QNAME=HG002-S9-H2-000001F;QSTART=37411 chr20 420228 . A C 60 . QNAME=HG002-S9-H2-000001F;QSTART=374466;QSTRAND=+ GT:PL:DP 0|1:2,3,6:24,29 chr20 420465 . A AG 60 . QNAME=HG002-S9-H2-000001F;QSTART=374704;QSTRAND=+ GT:PL:DP 0|1:6,10,1:32,35 chr20 420561 . A T 60 . QNAME=HG002-S9-H1-000001F;QSTART=399939;QSTRAND=+ GT:PL:DP 1|1:5,6,7:36,12 -chr20 420665 . G GCCCACCCCATCCCCCGTCCCCATCCCCCATCCCCCGTCCCCCGTCCCCATCCCCCGTCCCCCATCTCCTGTCCCCCGTCCCCATCCCCCGTCCCCCGTCCCCCATCCCATCCCCCACCCCCATCCCCCGTCCCCCGTCCCCATCCCCCATCCCCCATCCCCCATCCCCCGTCCGCCGTCCCCCATCTCCTGTCCCCCGTCCCCCATCCCCCGTCCCCATCCCCCACC 61 . QNAME=HG002-S9-H2-000001F;QSTART=374905;QSTRAND=+;SVTYPE=INS;SVLEN=227;NumCollapsed=1;NumConsolidated=0;CollapseId=4.0;CollapseStart=420664;CollapseEnd=420665;CollapseSize=226 GT:PL 0/1:. +chr20 420665 . G GCCCACCCCATCCCCCGTCCCCATCCCCCATCCCCCGTCCCCCGTCCCCATCCCCCGTCCCCCATCTCCTGTCCCCCGTCCCCATCCCCCGTCCCCCGTCCCCCATCCCATCCCCCACCCCCATCCCCCGTCCCCCGTCCCCATCCCCCATCCCCCATCCCCCATCCCCCGTCCGCCGTCCCCCATCTCCTGTCCCCCGTCCCCCATCCCCCGTCCCCATCCCCCACC 61 . QNAME=HG002-S9-H2-000001F;QSTART=374905;QSTRAND=+;SVTYPE=INS;SVLEN=227;NumCollapsed=1;NumConsolidated=0;CollapseId=9.0;CollapseStart=420664;CollapseEnd=420665;CollapseSize=226 GT:PL 0/1:. chr20 421409 . G A 60 . QNAME=HG002-S9-H1-000001F;QSTART=401013;QSTRAND=+ GT:PL:DP 1|1:2,3,7:14,41 chr20 421527 . T C 60 . QNAME=HG002-S9-H2-000001F;QSTART=375993;QSTRAND=+ GT:PL:DP 0|1:3,8,4:49,42 chr20 422066 . A G 60 . QNAME=HG002-S9-H1-000001F;QSTART=401670;QSTRAND=+ GT:PL:DP 1|1:3,7,8:9,39 @@ -1260,7 +1260,7 @@ chr20 639104 . A AT 60 . QNAME=HG002-S9-H1-000001F;QSTART=618555;QSTRAND=+ GT:PL chr20 640046 . C T 60 . QNAME=HG002-S9-H1-000001F;QSTART=619497;QSTRAND=+ GT:PL:DP 1|0:10,10,9:11,48 chr20 640049 . C T 60 . QNAME=HG002-S9-H1-000001F;QSTART=619500;QSTRAND=+ GT:PL:DP 1|1:2,3,4:13,44 chr20 641878 . C G 60 . QNAME=HG002-S9-H1-000001F;QSTART=621329;QSTRAND=+ GT:PL:DP 1|0:8,1,7:7,13 -chr20 641913 . G GGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGA 60 . QNAME=HG002-S9-H1-000001F;QSTART=621365;QSTRAND=+;SVTYPE=INS;SVLEN=66;NumCollapsed=1;NumConsolidated=0;CollapseId=6.0;CollapseStart=642120;CollapseEnd=642121;CollapseSize=66 GT:PL:DP 1/0:7,5,7:34,13 +chr20 641913 . G GGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGA 60 . QNAME=HG002-S9-H1-000001F;QSTART=621365;QSTRAND=+;SVTYPE=INS;SVLEN=66 GT:PL:DP 1/0:7,5,7:34,13 chr20 641944 . GGA G 60 . QNAME=HG002-S9-H1-000001F;QSTART=621462;QSTRAND=+ GT:PL:DP 1|0:6,6,6:17,7 chr20 642012 . GGT G 60 . QNAME=HG002-S9-H1-000001F;QSTART=621528;QSTRAND=+ GT:PL:DP 1|0:5,7,1:6,23 chr20 642037 . T TG 60 . QNAME=HG002-S9-H1-000001F;QSTART=621551;QSTRAND=+ GT:PL:DP 1|0:4,10,10:14,47 @@ -1280,6 +1280,7 @@ chr20 642284 . A G 60 . QNAME=HG002-S9-H1-000001F;QSTART=621795;QSTRAND=+ GT:PL: chr20 642300 . G GCCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGCCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGGC 60 . QNAME=HG002-S9-H1-000001F;QSTART=621812;QSTRAND=+;SVTYPE=INS;SVLEN=408 GT:PL:DP 1/0:5,10,5:44,36 chr20 642300 . G GGC 60 . 
QNAME=HG002-S9-H2-000001F;QSTART=597225;QSTRAND=+ GT:PL:DP 0|1:4,10,8:25,7 chr20 642330 . G C 60 . QNAME=HG002-S9-H1-000001F;QSTART=622249;QSTRAND=+ GT:PL:DP 1|0:2,9,8:16,10 +chr20 642330 . G GGCCCAGCGGGGGTGGAGTTGCCTGTGGTGGGGGGCCCAGCGGGGGTGGAGTTGCCTGTGGGGGGGC 60 . QNAME=HG002-S9-H2-000001F;QSTART=597257;QSTRAND=+;SVTYPE=INS;SVLEN=66 GT:PL:DP 0/1:6,1,9:20,25 chr20 642362 . G GC 60 . QNAME=HG002-S9-H1-000001F;QSTART=622282;QSTRAND=+ GT:PL:DP 1|1:10,5,1:17,12 chr20 642391 . G GC 60 . QNAME=HG002-S9-H1-000001F;QSTART=622312;QSTRAND=+ GT:PL:DP 1|1:2,9,2:10,50 chr20 642420 . G GC 60 . QNAME=HG002-S9-H2-000001F;QSTART=597415;QSTRAND=+ GT:PL:DP 0|1:7,1,2:41,16 diff --git a/repo_utils/answer_key/collapse/input1_median_removed.vcf b/repo_utils/answer_key/collapse/input1_median_removed.vcf index 2905ef00..cccc217d 100644 --- a/repo_utils/answer_key/collapse/input1_median_removed.vcf +++ b/repo_utils/answer_key/collapse/input1_median_removed.vcf @@ -47,5 +47,4 @@ ##INFO= ##INFO= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA24385 -chr20 420665 . G GCCCACCCCATCCCCCGTCCCCATCCCCCATCCCCCGTCCCCCGTCCCCATCCCCCGTCCCCCATCTCCTGTCCCCCGTCCCCATCCCCCGTCCCCCGTCCCCCATCCCATCCCCCACCCCCATCCCCCGTCCCCCGTCCCCATCCCCCATCCCCCATCCCCATCCCCCGTCCCCCGTCCCCCATCTCCTGTCCCCCGTCCCCCATCCCCCGTCCCCATCCCCCACC 60 . QNAME=HG002-S9-H1-000001F;QSTART=400044;QSTRAND=+;SVTYPE=INS;SVLEN=226;PctSeqSimilarity=0.9956;PctSizeSimilarity=0.9956;PctRecOverlap=1;SizeDiff=1;StartDistance=0;EndDistance=0;GTMatch=.;TruScore=99;MatchId=4.0 GT:PL:DP 1/0:4,8,6:32,9 -chr20 642330 . G GGCCCAGCGGGGGTGGAGTTGCCTGTGGTGGGGGGCCCAGCGGGGGTGGAGTTGCCTGTGGGGGGGC 60 . QNAME=HG002-S9-H2-000001F;QSTART=597257;QSTRAND=+;SVTYPE=INS;SVLEN=66;PctSeqSimilarity=0.9576;PctSizeSimilarity=1;PctRecOverlap=0;SizeDiff=0;StartDistance=-417;EndDistance=-417;GTMatch=.;TruScore=65;MatchId=6.0 GT:PL:DP 0/1:6,1,9:20,25 +chr20 420665 . G GCCCACCCCATCCCCCGTCCCCATCCCCCATCCCCCGTCCCCCGTCCCCATCCCCCGTCCCCCATCTCCTGTCCCCCGTCCCCATCCCCCGTCCCCCGTCCCCCATCCCATCCCCCACCCCCATCCCCCGTCCCCCGTCCCCATCCCCCATCCCCCATCCCCATCCCCCGTCCCCCGTCCCCCATCTCCTGTCCCCCGTCCCCCATCCCCCGTCCCCATCCCCCACC 60 . 
QNAME=HG002-S9-H1-000001F;QSTART=400044;QSTRAND=+;SVTYPE=INS;SVLEN=226;PctSeqSimilarity=0.9956;PctSizeSimilarity=0.9956;PctRecOverlap=1;SizeDiff=1;StartDistance=0;EndDistance=0;GTMatch=.;TruScore=99;MatchId=9.0 GT:PL:DP 1/0:4,8,6:32,9 diff --git a/repo_utils/sub_tests/collapse.sh b/repo_utils/sub_tests/collapse.sh index 93220119..70c5d66b 100644 --- a/repo_utils/sub_tests/collapse.sh +++ b/repo_utils/sub_tests/collapse.sh @@ -74,6 +74,8 @@ if [ $test_collapse_badparams ]; then assert_exit_code 100 fi +# Lower collapse sub-chunk threshold +export COLLAP_SUB=1 run test_collapse_median $truv collapse -f $INDIR/references/reference.fa \ -i $INDIR/variants/input1.vcf.gz \ -o $OD/input1_median_collapsed.vcf \ @@ -82,6 +84,7 @@ run test_collapse_median $truv collapse -f $INDIR/references/reference.fa \ if [ $test_collapse_median ]; then collapse_assert 1_median fi +unset COLLAP_SUB run test_collapse_intragt $truv collapse -i $INDIR/variants/bcftools_merged.vcf.gz \ -o $OD/inputintragt_collapsed.vcf \ diff --git a/truvari/collapse.py b/truvari/collapse.py index 068054ee..7cd44c79 100644 --- a/truvari/collapse.py +++ b/truvari/collapse.py @@ -738,7 +738,7 @@ def append(self, data): """ Put data onto end of list """ - new_node = (data, None) + new_node = [data, None] if not self.head: self.head = new_node self.tail = new_node @@ -797,8 +797,9 @@ def tree_size_chunker(matcher, chunks): Needs to return the same thing as a chunker """ chunk_count = 0 + thresh = 1 if "COLLAP_SUB" in os.environ and os.environ["COLLAP_SUB"] == "1" else 100 for chunk, _ in chunks: - if len(chunk['base']) < 100: # fewer than 100 is fine + if len(chunk['base']) < thresh: # fewer than 100 is fine chunk_count += 1 yield chunk, chunk_count continue @@ -826,8 +827,9 @@ def tree_dist_chunker(matcher, chunks): This does nothing """ chunk_count = 0 + thresh = 1 if "COLLAP_SUB" in os.environ and os.environ["COLLAP_SUB"] == "1" else 100 for chunk, _ in chunks: - if len(chunk['base']) < 100: # fewer than 100 is fine + if len(chunk['base']) < thresh: # fewer than 100 is fine chunk_count += 1 yield chunk, chunk_count continue From a5391dd42a9d797b19db29b0dc3956857812b142 Mon Sep 17 00:00:00 2001 From: Adam English Date: Mon, 9 Sep 2024 09:39:57 -0400 Subject: [PATCH 15/16] version bump --- repo_utils/answer_key/help.txt | 2 +- truvari/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/repo_utils/answer_key/help.txt b/repo_utils/answer_key/help.txt index 07955c2f..d282d1af 100644 --- a/repo_utils/answer_key/help.txt +++ b/repo_utils/answer_key/help.txt @@ -1,6 +1,6 @@ usage: truvari [-h] CMD ... 
-Truvari v4.3.0 Structural Variant Benchmarking and Annotation +Truvari v4.3.1 Structural Variant Benchmarking and Annotation Available commands: bench Performance metrics from comparison of two VCFs diff --git a/truvari/__init__.py b/truvari/__init__.py index 1c508655..4a94883e 100644 --- a/truvari/__init__.py +++ b/truvari/__init__.py @@ -88,7 +88,7 @@ :data:`truvari.SZBINTYPE` """ -__version__ = '4.3.0' +__version__ = '4.3.1' from truvari.annotations.af_calc import ( From 0f023376acd2a3d17c100e113e01cf1ef7945d66 Mon Sep 17 00:00:00 2001 From: Adam English Date: Mon, 9 Sep 2024 09:41:43 -0400 Subject: [PATCH 16/16] freezing wiki --- docs/v4.3.1/Citations.md | 30 ++ ...-with-a-High\342\200\220Density-of-SVs.md" | 23 + docs/v4.3.1/Development.md | 90 ++++ docs/v4.3.1/Home.md | 36 ++ docs/v4.3.1/Installation.md | 72 +++ docs/v4.3.1/MatchIds.md | 74 +++ docs/v4.3.1/Multi-allelic-VCFs.md | 11 + docs/v4.3.1/Updates.md | 297 +++++++++++ docs/v4.3.1/anno.md | 494 ++++++++++++++++++ docs/v4.3.1/bench.md | 295 +++++++++++ docs/v4.3.1/collapse.md | 163 ++++++ docs/v4.3.1/consistency.md | 179 +++++++ docs/v4.3.1/divide.md | 58 ++ docs/v4.3.1/ga4gh.md | 3 + docs/v4.3.1/phab.md | 157 ++++++ docs/v4.3.1/refine.md | 142 +++++ docs/v4.3.1/segment.md | 18 + docs/v4.3.1/stratify.md | 58 ++ docs/v4.3.1/vcf2df.md | 81 +++ 19 files changed, 2281 insertions(+) create mode 100644 docs/v4.3.1/Citations.md create mode 100644 "docs/v4.3.1/Collapse-on-Regions-with-a-High\342\200\220Density-of-SVs.md" create mode 100644 docs/v4.3.1/Development.md create mode 100644 docs/v4.3.1/Home.md create mode 100644 docs/v4.3.1/Installation.md create mode 100644 docs/v4.3.1/MatchIds.md create mode 100644 docs/v4.3.1/Multi-allelic-VCFs.md create mode 100644 docs/v4.3.1/Updates.md create mode 100644 docs/v4.3.1/anno.md create mode 100644 docs/v4.3.1/bench.md create mode 100644 docs/v4.3.1/collapse.md create mode 100644 docs/v4.3.1/consistency.md create mode 100644 docs/v4.3.1/divide.md create mode 100644 docs/v4.3.1/ga4gh.md create mode 100644 docs/v4.3.1/phab.md create mode 100644 docs/v4.3.1/refine.md create mode 100644 docs/v4.3.1/segment.md create mode 100644 docs/v4.3.1/stratify.md create mode 100644 docs/v4.3.1/vcf2df.md diff --git a/docs/v4.3.1/Citations.md b/docs/v4.3.1/Citations.md new file mode 100644 index 00000000..d600b860 --- /dev/null +++ b/docs/v4.3.1/Citations.md @@ -0,0 +1,30 @@ +# Citing Truvari + +English, A.C., Menon, V.K., Gibbs, R.A. et al. Truvari: refined structural variant comparison preserves allelic diversity. Genome Biol 23, 271 (2022). https://doi.org/10.1186/s13059-022-02840-6 + +# Citations + +List of publications using Truvari. Most of these are just pulled from a [Google Scholar Search](https://scholar.google.com/scholar?q=truvari). Please post in the [show-and-tell](https://github.com/spiralgenetics/truvari/discussions/categories/show-and-tell) to have your publication added to the list. 
+* [A robust benchmark for detection of germline large deletions and insertions](https://www.nature.com/articles/s41587-020-0538-8) +* [Leveraging a WGS compression and indexing format with dynamic graph references to call structural variants](https://www.biorxiv.org/content/10.1101/2020.04.24.060202v1.abstract) +* [Duphold: scalable, depth-based annotation and curation of high-confidence structural variant calls](https://academic.oup.com/gigascience/article/8/4/giz040/5477467?login=true) +* [Parliament2: Accurate structural variant calling at scale](https://academic.oup.com/gigascience/article/9/12/giaa145/6042728) +* [Learning What a Good Structural Variant Looks Like](https://www.biorxiv.org/content/10.1101/2020.05.22.111260v1.full) +* [Long-read trio sequencing of individuals with unsolved intellectual disability](https://www.nature.com/articles/s41431-020-00770-0) +* [lra: A long read aligner for sequences and contigs](https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1009078) +* [Samplot: a platform for structural variant visual validation and automated filtering](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-021-02380-5) +* [AsmMix: A pipeline for high quality diploid de novo assembly](https://www.biorxiv.org/content/10.1101/2021.01.15.426893v1.abstract) +* [Accurate chromosome-scale haplotype-resolved assembly of human genomes](https://www.nature.com/articles/s41587-020-0711-0) +* [Accurate circular consensus long-read sequencing improves variant detection and assembly of a human genome](https://www.nature.com/articles/s41587-019-0217-9) +* [NPSV: A simulation-driven approach to genotyping structural variants in whole-genome sequencing data](https://academic.oup.com/bioinformatics/article-abstract/37/11/1497/5466452) +* [SVIM-asm: structural variant detection from haploid and diploid genome assemblies](https://academic.oup.com/bioinformatics/article/36/22-23/5519/6042701?login=true) +* [Readfish enables targeted nanopore sequencing of gigabase-sized genomes](https://www.nature.com/articles/s41587-020-00746-x) +* [stLFRsv: A Germline Structural Variant Analysis Pipeline Using Co-barcoded Reads](https://internal-journal.frontiersin.org/articles/10.3389/fgene.2021.636239/full) +* [Long-read-based human genomic structural variation detection with cuteSV](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02107-y) +* [An international virtual hackathon to build tools for the analysis of structural variants within species ranging from coronaviruses to vertebrates](https://f1000research.com/articles/10-246) +* [Paragraph: a graph-based structural variant genotyper for short-read sequence data](https://link.springer.com/article/10.1186/s13059-019-1909-7) +* [Genome-wide investigation identifies a rare copy-number variant burden associated with human spina bifida](https://www.nature.com/articles/s41436-021-01126-9) +* [TT-Mars: Structural Variants Assessment Based on Haplotype-resolved Assemblies](https://www.biorxiv.org/content/10.1101/2021.09.27.462044v1.abstract) +* [An ensemble deep learning framework to refine large deletions in linked-reads](https://www.biorxiv.org/content/10.1101/2021.09.27.462057v1.abstract) +* [MAMnet: detecting and genotyping deletions and insertions based on long reads and a deep learning approach](https://academic.oup.com/bib/advance-article-abstract/doi/10.1093/bib/bbac195/6587170)](https://academic.oup.com/bib/advance-article-abstract/doi/10.1093/bib/bbac195/6587170) +* [Automated filtering of genome-wide 
large deletions through an ensemble deep learning framework](https://www.sciencedirect.com/science/article/pii/S1046202322001712#b0110)
diff --git "a/docs/v4.3.1/Collapse-on-Regions-with-a-High\342\200\220Density-of-SVs.md" "b/docs/v4.3.1/Collapse-on-Regions-with-a-High\342\200\220Density-of-SVs.md"
new file mode 100644
index 00000000..1ea24210
--- /dev/null
+++ "b/docs/v4.3.1/Collapse-on-Regions-with-a-High\342\200\220Density-of-SVs.md"
@@ -0,0 +1,23 @@
+Some regions of the reference genome can give rise to a huge number of SVs. These regions are typically in telomeres or centromeres, especially on chm13. Furthermore, some multi-sample VCFs might contain samples with a high genetic distance from the reference, which can also create an unreasonable number of SVs. These 'high-density' regions can cause difficulties for `truvari collapse` when there are too many comparisons that need to be made.
+
+If you find `truvari collapse` 'freezing' during processing (it is no longer writing variants to the output VCFs), you should investigate whether there are regions which should be excluded from analysis. To do this, first run:
+
+```
+truvari anno chunks input.vcf.gz > counts.bed
+```
+
+The `counts.bed` will have the chromosome, start, and end position of each chunk in the VCF as well as three additional columns. The 4th column has the number of SVs inside the chunk while the 5th and 6th have comma-separated counts of the number of variants in 'sub-chunks'. These sub-chunk counts correspond to advanced separation of the variants beyond just their window. The 5th is after separating by svtype and svlen; the 6th is after re-chunking the svtype/svlen separation by distance.
+
+If you find spans in the `counts.bed` with a huge number of SVs, these are prime candidates for exclusion to speed up `truvari collapse`. To exclude them, you first subset the regions of interest and then use `bedtools` to create a bed file that will skip them.
+
+```
+# exclude regions with 30k or more SVs. You should test this threshold.
+awk '$4 >= 30000' counts.bed > to_exclude.bed
+bedtools complement -g genome.bed -i to_exclude.bed > to_analyze.bed
+
+truvari collapse --bed to_analyze.bed -i input.vcf.gz
+```
+
+When considering what threshold you would like to use, just looking at the 4th column may not be sufficient as the 'sub-chunks' may be smaller and will therefore run faster. There also is no guarantee that a region with high SV density will be slow. For example, if there are 100k SVs in a window, but they all could collapse, it would only take O(N - 1) comparisons to perform the collapse. The problems arise when the 100k SVs have few redundant SVs and therefore require O(N**2) comparisons.
+
+A conservative workflow for figuring out which regions to exclude would run `truvari collapse` without a `--bed`, wait for regions to 'freeze', and then check whether the last variant written out by `truvari collapse` is just before a high-density chunk in the `counts.bed`. That region could then be excluded and the collapsing repeated until success.
\ No newline at end of file
diff --git a/docs/v4.3.1/Development.md b/docs/v4.3.1/Development.md
new file mode 100644
index 00000000..ccb38493
--- /dev/null
+++ b/docs/v4.3.1/Development.md
@@ -0,0 +1,90 @@
+# Truvari API
+Many of the helper methods/objects are documented such that developers can reuse truvari in their own code. To see developer documentation, visit [readthedocs](https://truvari.readthedocs.io/en/latest/).
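As a small illustration of that reuse (a sketch, not an excerpt from the API docs; the input filename and the 50bp cutoff are placeholders), the same helpers Truvari's own commands rely on, such as `truvari.entry_size` and `truvari.entry_variant_type`, can be called directly on `pysam` records:

```python
import pysam
import truvari

vcf = pysam.VariantFile("example.vcf.gz")  # hypothetical input
for entry in vcf:
    size = truvari.entry_size(entry)            # SV length of the record
    svtype = truvari.entry_variant_type(entry)  # returns a truvari.SV enum
    if size >= 50:
        print(entry.chrom, entry.start, svtype, size)
```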
+ +Documentation can also be seen using +```python +import truvari +help(truvari) +``` + +# docker + +A Dockerfile exists to build an image of Truvari. To make a Docker image, clone the repository and run +```bash +docker build -t truvari . +``` + +You can then run Truvari through docker using +```bash +docker run -v `pwd`:/data -it truvari +``` +Where `pwd` can be whatever directory you'd like to mount in the docker to the path `/data/`, which is the working directory for the Truvari run. You can provide parameters directly to the entry point. +```bash +docker run -v `pwd`:/data -it truvari anno svinfo -i example.vcf.gz +``` + +If you'd like to interact within the docker container for things like running the CI/CD scripts +```bash +docker run -v `pwd`:/data --entrypoint /bin/bash -it truvari +``` +You'll now be inside the container and can run FuncTests or run Truvari directly +```bash +bash repo_utils/truvari_ssshtests.sh +truvari anno svinfo -i example.vcf.gz +``` + +# CI/CD + +Scripts that help ensure the tool's quality. Extra dependencies need to be installed in order to run Truvari's CI/CD scripts. + +```bash +pip install pylint anybadge coverage +``` + +Check code formatting with +```bash +python repo_utils/pylint_maker.py +``` +We use [autopep8](https://pypi.org/project/autopep8/) (via [vim-autopep8](https://github.com/tell-k/vim-autopep8)) for formatting. + +Test the code and generate a coverage report with +```bash +bash repo_utils/truvari_ssshtests.sh +``` + +Truvari leverages github actions to perform these checks when new code is pushed to the repository. We've noticed that the actions sometimes hangs through no fault of the code. If this happens, cancel and resubmit the job. Once FuncTests are successful, it uploads an artifact of the `coverage html` report which you can download to see a line-by-line accounting of test coverage. + +# git flow + +To organize the commits for the repository, we use [git-flow](https://danielkummer.github.io/git-flow-cheatsheet/). Therefore, `develop` is the default branch, the latest tagged release is on `master`, and new, in-development features are within `feature/` + +When contributing to the code, be sure you're working off of develop and have run `git flow init`. + +# versioning + +Truvari uses [Semantic Versioning](https://semver.org/) and tries to stay compliant to [PEP440](https://peps.python.org/pep-0440/). As of v3.0.0, a single version is kept in the code under `truvari/__init__.__version__`. We try to keep the suffix `-dev` on the version in the develop branch. When cutting a new release, we may replace the suffix with `-rc` if we've built a release candidate that may need more testing/development. Once we've committed to a full release that will be pushed to PyPi, no suffix is placed on the version. If you install Truvari from the develop branch, the git repo hash is appended to the installed version as well as '.uc' if there are un-staged commits in the repo. + +# docs + +The github wiki serves the documentation most relevant to the `develop/` branch. When cutting a new release, we freeze and version the wiki's documentation with the helper utility `docs/freeze_wiki.sh`. 
+ +# Creating a release +Follow these steps to create a release + +0) Bump release version +1) Run tests locally +2) Update API Docs +3) Change Updates Wiki +4) Freeze the Wiki +5) Ensure all code is checked in +6) Do a [git-flow release](https://danielkummer.github.io/git-flow-cheatsheet/) +7) Use github action to make a testpypi release +8) Check test release +```bash +python3 -m venv test_truvari +python3 -m pip install --index-url https://test.pypi.org/simple --extra-index-url https://pypi.org/simple/ truvari +``` +9) Use GitHub action to make a pypi release +10) Download release-tarball.zip from step #9’s action +11) Create release (include #9) from the tag +12) Checkout develop and Bump to dev version and README ‘commits since’ badge \ No newline at end of file diff --git a/docs/v4.3.1/Home.md b/docs/v4.3.1/Home.md new file mode 100644 index 00000000..e76a3cb3 --- /dev/null +++ b/docs/v4.3.1/Home.md @@ -0,0 +1,36 @@ +The wiki holds documentation most relevant for develop. For information on a specific version of Truvari, see [`docs/`](https://github.com/spiralgenetics/truvari/tree/develop/docs) + +Citation: +English, A.C., Menon, V.K., Gibbs, R.A. et al. Truvari: refined structural variant comparison preserves allelic diversity. Genome Biol 23, 271 (2022). https://doi.org/10.1186/s13059-022-02840-6 + +# Before you start +VCFs aren't always created with a strong adherence to the format's specification. + +Truvari expects input VCFs to be valid so that it will only output valid VCFs. + +We've developed a separate tool that runs multiple validation programs and standard VCF parsing libraries in order to validate a VCF. + +Run [this program](https://github.com/acenglish/usable_vcf) over any VCFs that are giving Truvari trouble. + +Furthermore, Truvari expects 'resolved' SVs (e.g. DEL/INS) and will not interpret BND signals across SVTYPEs (e.g. combining two BND lines to match a DEL call). A brief description of Truvari bench methodology is linked below. + +Finally, Truvari does not handle multi-allelic VCF entries and as of v4.0 will throw an error if multi-allelics are encountered. Please use `bcftools norm` to split multi-allelic entries. + +# Index + +- [[Updates|Updates]] +- [[Installation|Installation]] +- Truvari Commands: + - [[anno|anno]] + - [[bench|bench]] + - [[collapse|collapse]] + - [[consistency|consistency]] + - [[divide|divide]] + - [[ga4gh|ga4gh]] + - [[phab|phab]] + - [[refine|refine]] + - [[segment|segment]] + - [[stratify|stratify]] + - [[vcf2df|vcf2df]] +- [[Development|Development]] +- [[Citations|Citations]] \ No newline at end of file diff --git a/docs/v4.3.1/Installation.md b/docs/v4.3.1/Installation.md new file mode 100644 index 00000000..a8aae743 --- /dev/null +++ b/docs/v4.3.1/Installation.md @@ -0,0 +1,72 @@ +Recommended +=========== +For stable versions of Truvari, use pip +``` +python3 -m pip install truvari +``` +Specific versions can be installed via +``` +python3 -m pip install truvari==3.2.0 +``` +See [pypi](https://pypi.org/project/Truvari/#history) for a history of all distributed releases. + +Manual Installation +=================== +To build Truvari directly, clone the repository and switch to a specific tag. +``` +git clone https://github.com/ACEnglish/truvari.git +git checkout tags/v3.0.0 +python3 -m pip install . 
+``` + +To see a list of all available tags, run: +``` +git tag -l +``` + +If you have an older clone of the repository and don't see the version you're looking for in tags, make sure to pull the latest changes: +``` +git pull +git fetch --all --tags +``` + +Mamba / Conda +============= +NOTE!! There is a very old version of Truvari on bioconda that - for unknown reasons - supersedes the newer, supported versions. Users may need to specify to conda which release to build. See [this ticket](https://github.com/ACEnglish/truvari/issues/130#issuecomment-1196607866) for details. + +Truvari releases are automatically deployed to bioconda. +Users can follow instructions here (https://mamba.readthedocs.io/en/latest/installation.html) to install mamba. (A faster alternative conda compatible package manager.) + +Creating an environment with Truvari and its dependencies. +``` +mamba create -c conda-forge -c bioconda -n truvari truvari +``` + +Alternatively, see the [conda page](https://anaconda.org/bioconda/truvari) for details +``` +conda install -c bioconda truvari +``` + +Building from develop +===================== +The default branch is `develop`, which holds in-development changes. This is for developers or those wishing to try experimental features and is not recommended for production. Development is versioned higher than the most recent stable release with an added suffix (e.g. Current stable release is `3.0.0`, develop holds `3.1.0-dev`). If you'd like to install develop, repeat the steps above but without `git checkout tags/v3.0.0`. See [wiki](https://github.com/spiralgenetics/truvari/wiki/Development#git-flow) for details on how branching is handled. + +Docker +====== +See [Development](https://github.com/spiralgenetics/truvari/wiki/Development#docker) for details on building a docker container. + +edlib error +=========== +Some environments have a hard time installing edlib due to a possible cython bug ([source](https://bugs.launchpad.net/ubuntu/+source/cython/+bug/2006404)). If you seen an error such as the following: +``` +edlib.bycython.cpp:198:12: fatal error: longintrepr.h: No such file or directory + 198 | #include "longintrepr.h" +``` + +One method to prevent the error message is to run the following commands ([source](https://github.com/Martinsos/edlib/issues/212)) +```bash +python3 -m venv my_env +source my_env/bin/activate +python3 -m pip install --no-cache-dir cython setuptools wheel +EDLIB_USE_CYTHON=1 python3 -m pip install truvari --no-cache +``` \ No newline at end of file diff --git a/docs/v4.3.1/MatchIds.md b/docs/v4.3.1/MatchIds.md new file mode 100644 index 00000000..ca52076f --- /dev/null +++ b/docs/v4.3.1/MatchIds.md @@ -0,0 +1,74 @@ +MatchIds are used to tie base/comparison calls together in post-processing for debugging or other exploring. MatchIds have a structure of `{chunkid}.{callid}`. The chunkid is unique id per-chunk of calls. All calls sharing chunkid were within `--chunksize` distance and were compared. The callid is unique to a call in a chunk for each VCF. Because `bench` processes two VCFs (the base and comparison VCFs), the `MatchId` has two values: the first is the base variant's MatchId and the second the comparison variant's MatchId. + +For `--pick single`, the two MatchIds will be identical in the e.g. tp-base.vcf.gz and tp-comp.vcf.gz. However, for `--pick ac|multi`, it's possible to have cases such as one base variant matching to multiple comparison variants. 
That would give us MatchIds like: + +``` +# tp-base.vcf +MatchId=4.0,4.1 + +# tp-comp.vcf +MatchId=4.0,4.1 +MatchId=4.0,4.2 +``` + +This example tells us that the tp-comp variants are both pointing to `4.0` in tp-base. The tp-base variant has a higher match to the tp-comp `4.1` variant. + +One easy way to combine matched variants is to use `truvari vcf2df` to convert a benchmarking result to a pandas DataFrame and leverage pandas' merge operation. First, we convert the `truvari bench` result. + +```bash +truvari vcf2df --info --bench-dir bench_result/ data.jl +``` + +Next, we combine rows of matched variants: +```python +import joblib +import pandas as pd + +# Load the data +data = joblib.load("data.jl") + +# Separate out the variants from the base VCF and add new columns of the base/comp ids +base = data[data['state'].isin(['tpbase', 'fn'])].copy() +base['base_id'] = base['MatchId'].apply(lambda x: x[0]) +base['comp_id'] = base['MatchId'].apply(lambda x: x[1]) + +# Separate out the variants from the comparison VCF and add new columns of the base/comp ids +comp = data[data['state'].isin(['tp', 'fp'])].copy() +comp['base_id'] = comp['MatchId'].apply(lambda x: x[0]) +comp['comp_id'] = comp['MatchId'].apply(lambda x: x[1]) + +# Merge the base/comparison variants +combined = pd.merge(base, comp, left_on='base_id', right_on='comp_id', suffixes=('_base', '_comp')) + +# How many comp variants matched to multiple base variants? +counts1 = combined['base_id_comp'].value_counts() +print('multi-matched comp count', (counts1 != 1).sum()) + +# How many base variants matched to multiple comp variants? +counts2 = combined['comp_id_base'].value_counts() +print('multi-matched base count', (counts2 != 1).sum()) +``` + +The `MatchId` is also used by `truvari collapse`. However there are two differences. First, in the main `collapse` output, the relevant INFO field is named `CollapsedId`. Second, because collapse only has a single input VCF, it is much easier to merge DataFrames. To merge collapse results kept variants with those that were removed, we again need to convert the VCFs to DataFrames: + +```bash +truvari vcf2df -i kept.vcf.gz kept.jl +truvari vcf2df -i removed.vcf.gz remov.jl +``` + +Then we combine them: +```python +import joblib +import pandas as pd + +# Load the kept variants and set the index. +kept = joblib.load("kept.jl").set_index('CollapseId') + +# Load the removed variants and set the index. +remov = joblib.load("remov.jl") +remov['CollapseId'] = remov['MatchId'].apply(lambda x: x[0]) +remov.set_index('CollapseId', inplace=True) + +# Join the two sets of variants +result_df = kept.join(remov, how='right', rsuffix='_removed') +``` \ No newline at end of file diff --git a/docs/v4.3.1/Multi-allelic-VCFs.md b/docs/v4.3.1/Multi-allelic-VCFs.md new file mode 100644 index 00000000..fd0eb23e --- /dev/null +++ b/docs/v4.3.1/Multi-allelic-VCFs.md @@ -0,0 +1,11 @@ +Truvari only compares the first alternate allele in VCFs. If a VCF contains multi-allelic sites such as: + +``` +chr2 1948201 . T TACAACACGTACGATCAGTAGAC,TCAACACACAACACGTACGATCAGTAGAC .... 
+``` + +Then pre-process the VCFs with bcftools: + +```bash +bcftools norm -m-any base_calls.vcf.gz | bgzip > base_calls_split.vcf.gz +``` \ No newline at end of file diff --git a/docs/v4.3.1/Updates.md b/docs/v4.3.1/Updates.md new file mode 100644 index 00000000..1cb41cfb --- /dev/null +++ b/docs/v4.3.1/Updates.md @@ -0,0 +1,297 @@ +# Truvari 4.3.1 +*September 9, 2024* +* `bench` + * Correctly filtering `ALT=*` alleles ([details](https://github.com/ACEnglish/truvari/discussions/219)) and monomorphic reference + * including test coverage this time +* `stratify` + * Default behavior is to count variants within ([#221](https://github.com/ACEnglish/truvari/issues/221)) +* `collapse` + * Faster sub-chunking operations by dropping use of pyintervaltree +* `anno chunks` + * New command for identifying windows with a high number of SVs ([details](https://github.com/ACEnglish/truvari/wiki/Collapse-on-Regions-with-a-High%E2%80%90Density-of-SVs)) + + +# Truvari 4.3.0 +*July 31, 2024* +* `refine` & `stratify` + * Fixed variant and bed boundary overlapping issue +* general + * Definition of variants within a region now includes replacement style from TR callers having an anchor base 1bp upstream of catalog/includebed regions + * Propagating MAFFT errors ([#204](https://github.com/ACEnglish/truvari/issues/204)) + * FIPS compliance ([#205](https://github.com/ACEnglish/truvari/issues/205)) + * Allow csi-indexed vcf files ([#209](https://github.com/ACEnglish/truvari/issues/209)) + * bcftools sort tempfile ([#213](https://github.com/ACEnglish/truvari/issues/213)) + +# Truvari 4.2.2 +*March 28, 2024* + +* `collapse` + * Fewer comparisons needed per-chunk on average + * Fixed `--chain` functionality ([details](https://github.com/ACEnglish/truvari/issues/196)) + * Fixed `--gt` consolidation of format fields +* `bench` + * Faster result at the cost of less complete annotations with `--short` flag +* `refine` + * Assures variants are sequence resolved before incorporating into consensus + * `bench --passonly --sizemax` parameters are used when building consensus for a region. Useful for `refine --use-original-vcfs` + * When a refined region has more than 5k variants, it is skipped and a warning of the region is written to the log + * Flag `--use-region-coords` now expands `--region` coordinates by 100bp (`phab --buffer` default) to allow variants to harmonize out of regions. +* general + * Dynamic bed/vcf parsing tries to choose faster of streaming/fetching variants + +# Truvari 4.2.1 +*February 6, 2024* +* `collapse` + * Faster handling of genotype data for `--gt` and `--keep common` +* general + * Fix to bed end position bug for including variants ([details](https://github.com/ACEnglish/truvari/issues/193)) + * Fix to Dockerfile +* `refine` + * Changes to `--recount` that accompany the fix to bed end positions. 
+* New command `ga4gh` to convert Truvari results into GA4GH truth/query VCFs with intermediates tags + +# Truvari 4.2 +*January 12, 2024* +* `collapse` + * New parameter `--gt` disallows intra-sample events to collapse ([details](https://github.com/ACEnglish/truvari/wiki/collapse#--gt)) + * New parameter `--intra` for consolidating SAMPLE information during intra-sample collapsing ([details](https://github.com/ACEnglish/truvari/wiki/collapse#--intra)) + * Preserve phasing information when available + * Faster O(n-1) algorithm instead of O(n^2) + * Faster sub-chunking strategy makes smaller chunks of variants needing fewer comparisons + * Fixed rare non-determinism error in cases where multiple variants are at the same position and equal qual/ac could be ordered differently. +* `phab` + * Correct sample handling with `--bSamples` `--cSamples` parameters + * Faster generation of consensus sequence + * Resolved 'overlapping' variant issue causing variants to be dropped + * New `poa` approach to harmonization. Faster than mafft but less accurate. Slower than wfa but more accurate. +* `bench` + * New, easier `MatchId` field to track which baseline/comparison variants match up [details](https://github.com/ACEnglish/truvari/wiki/MatchIds) + * `entry_is_present` method now considers partial missing variants (e.g. `./1`) as present + * Removed the 'weighted' metrics from `summary.json` +* `consistency` + * Fixed issue with counting duplicate records + * Added flag to optionally ignore duplicate records +* `anno svinfo` now overwrites existing SVLEN/SVTYPE info fields +* general + * Reduced fn matches for unroll sequence similarity by reporting maximum of multiple manipulations of variant sequence (roll up/down/none). Comes at a small, but reasonable, expense of some more fp matches. + * Bump pysam version + * Fixed bug in `unroll` sequence similarity that sometimes rolled from the wrong end + * Fixed bug for handling of None in ALT field + * `truvari.compress_index_vcf` forces overwriting of tabix index to prevent annoying crashes + + +# Truvari 4.1 +*August 7, 2023* + +* `bench` + * Creates `candidate.refine.bed` which hooks into `refine` on whole-genome VCFs [details](https://github.com/ACEnglish/truvari/wiki/bench#refining-bench-output) + * `--recount` for correctly assessing whole-genome refinement results + * experimental 'weighted' summary metrics [details](https://github.com/ACEnglish/truvari/wiki/bench#weighted-performance) + * Unresolved SVs (e.g. `ALT == `) are filtered when `--pctseq != 0` +* `phab` + * ~2x faster via reduced IO from operating in stages instead of per-region + * Removed most external calls (e.g. 
samtools doesn't need to be in the environment anymore) + * new `--align wfa` allows much faster (but slightly less accurate) variant harmonization + * increased determinism of results [detals](https://github.com/ACEnglish/truvari/commit/81a9ab85b91b0c530f9faeedfa4e7e0d68a5e8c2) +* `refine` + * Faster bed file intersection of `--includebed` and `--regions` + * Refine pre-flight check + * Correct refine.regions.txt end position from IntervalTree correction + * Better refine region selection with `--use-original` + * `--use-includebed` switched to `--use-region-coords` so that default behavior is to prefer the includebed's coordinates + * `--use-original-vcfs` to use the original pre-bench VCFs + * `refine.variant_summary.json` is cleaned of uninformative metrics +* `stratify` + * parallel parsing of truvari directory to make processing ~4x faster +* `msa2vcf` Fixed REPL decomposition bug to now preserve haplotypes +* `anno grpaf` - expanded annotation info fields +* `anno density` - new parameter `--stepsize` for sliding windows +* `collapse` + * New optional `--median-info` fields [#146](https://github.com/ACEnglish/truvari/issues/146) +* Minor updates + * Fix some `anno` threading on macOS [#154](https://github.com/ACEnglish/truvari/issues/154) + * Monomorphic/multiallelic check fix in `bench` + * `PHAB_WRITE_MAFFT` environment variable to facilitate updating functional test answer key + * Slightly slimmer docker container + +# Truvari 4.0 +*March 13, 2023* + +As part of the GIAB TR effort, we have made many changes to Truvari's tooling to enable comparison of variants in TR regions down to 5bp. Additionally, in order to keep Truvari user friendly we have made changes to the UI. Namely, we've updated some default parameters, some command-line arguments, and some outputs. There are also a few new tools and how a couple of tools work has changed. Therefore, we decided to bump to a new major release. If you're using Truvari in any kind of production capacity, be sure to test your pipeline before moving to v4.0. + +* New `refine` command for refining benchmarking results. [Details](refine) +* `bench` + * [Unroll](bench#unroll) is now the default sequence comparison approach. + * New `--pick` parameter to control the number of matches a variant can participate in [details](bench#controlling-the-number-of-matches) + * The `summary.txt` is now named `summary.json` + * Outputs parameters to `params.json` + * Output VCFs are sorted, compressed, and indexed + * Ambiguous use of 'call' in outputs corrected to 'comp' (e.g. `tp-call.vcf.gz` is now `tp-comp.vcf.gz`) + * Renamed `--pctsim` parameter to `--pctseq` + * Fixed bug where FP/FN weren't getting the correct, highest scoring match reported + * Fixed bug where `INFO/Multi` wasn't being properly applied + * Fixed bug where variants spanning exactly one `--includebed` region were erroneously being counted. + * Removed parameters: `--giabreport`, `--gtcomp`,`--multimatch`, `--use-lev`, `--prog`, `--unroll` +* `collapse` + * Renamed `--pctsim` parameter to `--pctseq` + * Runtime reduction by ~40% with short-circuiting during `Matcher.build_match` + * Better output sorting which may allow pipelines to be a little faster. +* `vcf2df` + * More granular sizebins for `[0,50)` including better handling of SNPs + * `--multisample` is removed. Now automatically add all samples with `--format` + * key index column removed and replaced by chrom, start, end. Makes rows easier to read and easier to work with e.g. 
pyranges +* `anno` + * Simplified ui. Commands that work on a single VCF and can stream (stdin/stdout) no longer use `--input` but a positional argument. + * Added `addid` +* `consistency` + * Slight speed improvement + * Better json output format +* `segment` + * Added `--passonly` flag + * Changed UI, including writing to stdout by default + * Fixed END and 1bp DEL bugs, now adds N to segmented variants' REF, and info fields SVTYPE/SVLEN +* API + * Began a focused effort on improving re-usability of Truvari code. + * Entry point to run benchmarking programmatically with [Bench object](https://truvari.readthedocs.io/en/latest/truvari.html#bench). + * Better development version tracking. [details](https://github.com/ACEnglish/truvari/commit/4bbf8d9a5be3b6a3f935afbd3a9b323811b676a0) + * Improved developer documentation. See [readthedocs](https://truvari.readthedocs.io/) +* general + * msa2vcf now left-trims and decomposes variants into indels + * Functional tests reorganization + * Fix for off-by-one errors when using pyintervaltree. See [ticket](https://github.com/ACEnglish/truvari/issues/137) + * Removed progressbar and Levenshtein dependencies as they are no longer used. + +# Truvari 3.5 +*August 27, 2022* + +* `bench` + * `--dup-to-ins` flag automatically treats SVTYPE==DUP as INS, which helps compare some programs/benchmarks + * New `--unroll` sequence comparison method for `bench` and `collapse` ([details](bench#unroll)) +* Major `anno trf` refactor (TODO write docs) including: + * annotation of DEL is fixed (was reporting the ALT copy numbers, not the sample's copy numbers after incorporating the ALT + * allow 'denovo' annotation by applying any TRF annotations found, not just those with corresponding annotations +* New `anno grpaf` annotates vcf with allele frequency info for groups of samples +* New `phab` for variant harmonization ([details](../phab)) +* backend + * `truvari.entry_size` returns the length of the event in the cases where len(REF) == len(ALT) (e.g. SNPs entry_size is 1) + * New key utility for `truvari.build_anno_trees` +* general + * Float metrics written to the VCF (e.g. PctSizeSimilarity) are rounded to precision of 4 + * Nice colors in some `--help` with [rich](https://github.com/Textualize/rich/) +* `divide` + * output shards are now more easily sorted (i.e. `ls divide_result/*.vcf.gz` will return the shards in the order they were made) + * compression/indexing of sub-VCFs in separate threads, reducing runtime +* user issues + * Monomorphic reference ALT alleles no longer throw an error in `bench` ([#131](https://github.com/ACEnglish/truvari/issues/131)) + * `SVLEN Number=A` fix ([#132](https://github.com/ACEnglish/truvari/issues/132)) + +# Truvari 3.4 +*July 7, 2022* + +* Improved performance of `consistency` (see [#127](https://github.com/ACEnglish/truvari/pull/127)) +* Added optional json output of `consistency` report +* Allow GT to be missing, which is allowed by VCF format specification +* TRF now uses `truvari.entry_variant_type` instead of trying to use `pysam.VariantRecord.info["SVLEN"]` +directly which allows greater flexibility. +* vcf2df now parses fields with `Number=\d` (e.g. 2+), which is a valid description +* `truvari.seqsim` is now case insensitive (see [#128](https://github.com/ACEnglish/truvari/issues/128)) +* Collapse option to skip consolidation of genotype information so kept alleles are unaltered +* `truvari anno dpcnt --present` will only count the depths of non ./. 
variants +* New collapse annotation `NumConsolidate` records how many FORMATs were consolidated +* Official [conda](https://anaconda.org/bioconda/truvari) support + +# Truvari 3.3 +*May 25, 2022* + +* New utilities `vcf_ranges` and `make_temp_filename` +* New annotations `dpcnt` and `lcr` +* Fixed a bug in `truvari collapse --keep` that prevented the `maxqual` or `common` options from working +* Increased determinism for `truvari collapse` so that in cases of tied variant position the longer allele is returned. If the alleles also have the same length, they are sorted alphabetically by the REF +* New `truvari bench --extend` functionality. See [discussion](https://github.com/ACEnglish/truvari/discussions/99) for details + +# Truvari 3.2 +*Apr 1, 2022* + +* Removed `truvari.copy_entry` for `pysam.VariantRecord.translate` a 10x faster operation +* Faster `truvari collapse` ([@c8b319b](https://github.com/ACEnglish/truvari/commit/c8b319b0e717a9e342f52e4a5e927f154eeb0e4a)) +* When building `MatchResult` between variants with shared start/end positions, we save processing work by skipping haplotype creation and just compare REFs/ALTs directly. +* Updated documentation to reference the paper https://doi.org/10.1101/2022.02.21.481353 +* New `truvari anno density` for identifying regions with 'sparse' and 'dense' overlapping SVs ([details](https://github.com/spiralgenetics/truvari/wiki/anno#truvari-anno-density)) +* Better `bench` genotype reporting with `summary.txt` having a `gt_matrix` of Base GT x Comp GT for all Base calls' best, TP match. +* New `truvari anno bpovl` for intersecting against tab-delimited files ([details](https://github.com/spiralgenetics/truvari/wiki/anno#truvari-anno-bpovl)) +* New `truvari divide` command to split VCFs into independent parts ([details](https://github.com/ACEnglish/truvari/wiki/divide)) +* Replaced `--buffer` parameter with `--minhaplen` for slightly better matching specificity +* Bugfix - `truvari anno trf` no longer duplicates entries spanning multple parallelization regions +* Bugfix - `collapse` MatchId/CollapseId annotation wasn't working +* Bugfixes - from [wwliao](https://github.com/wwliao) ([@4dd9968](https://github.com/ACEnglish/truvari/commit/4dd99683912236f433166889bb0b5667e9fa936d) [@ef2cfb3](https://github.com/ACEnglish/truvari/commit/ef2cfb366b60a5af4671d65d3ed987b08da72227)) +* Bugfixes - Issues [#107](https://github.com/ACEnglish/truvari/issues/107), [#108](https://github.com/ACEnglish/truvari/issues/108) + +# Truvari 3.1 +*Dec 22, 2021* + +* `bench` now annotates FPs by working a little differently. See [[bench|bench#methodology]] for details. +* Recalibrated TruScore and new reciprocal overlap measurement for sequence resolved `INS` ([details](https://github.com/spiralgenetics/truvari/discussions/92)) +* Match objects are now usable via the SDK. See [#94](https://github.com/spiralgenetics/truvari/discussions/94) for an example of using Truvari programmatically +* `file_zipper` VCF iteration strategy (`GenomeTree` -> `RegionVCFIterator`) that improves speed, particularly when using `--includebed` +* `collapse` refactored to use Match object and for prettier code, cleaner output. +* `anno remap` now optionally adds `INFO` field of the location of the top N hits. +* An experimental tool `truvari segment` added to help SV association analysis. +* `vcf2df` now supports pulling `FORMAT` fields from multiple samples. 
+* `vcf2df` now adds `('_ref', '_alt')`, or `('_ref', '_het', '_hom')` for `INFO,Number=[R|G]` fields, respectively. +* Improved documentation, including http://truvari.readthedocs.io/ for developers. +* Increasing/diversifying test coverage exposed minor bugs which were fixed. +* `bench --no-ref --cSample` bug fixes. +* Minor usability feature implemented in `help_unknown_cmd`. + +# Truvari 3.0 +*Sep 15, 2021* + +As Truvari's adoption and functionality grows, we decided to spend time working on sustainability and performance of the tool. Multiple [Actions](https://github.com/spiralgenetics/truvari/actions) for CI/CD have been added. Many components have been refactored for speed, and other 'cruft' code has been removed. Some of these changes (particularly the switch to using edlib for sequence similarity) affects the results. Therefore, we've bumped to a new major release version. + +* Working on speed improvements +* Added edlib as the default when calculating pctseq_sim, keeping Levenstein as an option (`--use-lev`). +* `truvari bench` summary's gt_precision/gt_recall are replaced by gt_concordance, which is just the percent of TP-comp calls with a concordant genotype. `--no-ref` has better functionality. `--giabreport` is different. +* Added `—keep common` to `truvari collapse`, which allows one to choose to keep the allele with the highest MAC. +* `truvari collapse --hap` wasn't working correctly. The assumptions about the calls being phased wasn't being +properly used (e.g. don't collapse 1|1) and the NumCollapsed was being populated before the single-best +match was chosen. The latter is a reporting problem, but the former had an effect on the results with +~3% of collapsed calls being mis-collapsed. +* `truvari anno trf` is now faster and simpler in its approach and whats reported.. and hopefully more useful. +* `truvari anno grm` has min_size and regions arguments added. +* truv2df has become `truvari vcf2df` where the default is vcf conversion with options to run on a `truvari bench` output directory. It also allows a specific sample to be parsed with `--format` and better Number=A handling. +* NeighId added to `truvari anno numneigh`, which works like bedtools cluster. +* The method af_calc now makes MAC/AC. +* Added 'partial' to `truvari anno remap`. +* Added `truvari anno svinfo`. +* Removed `truvari stats` as `truvari vcf2df` is better and began building [community-driven summaries](https://github.com/spiralgenetics/truvari/discussions/categories/vcf2df-recipes). +* Ubiquitous single version. +* Added a Dockerfile and instructions for making a Truvari docker container. +* Code and repository cleaning. +* Github actions for automated pylint, testing, and releases to pypi. +* Preserving per-version documentation from the wiki in `docs/`. + + +# Truvari 2.1 +*Jan 27, 2021* + +We've expanded and improved Truvari's [annotations](https://github.com/spiralgenetics/truvari/wiki/anno). We've added an [SV "collapsing" tool](https://github.com/spiralgenetics/truvari/wiki/collapse). And we've added a way to [turn VCFs into pandas DataFrames](https://github.com/spiralgenetics/truvari/wiki/truv2df) easily for downstream analysis/QC. + +# Truvari 2.0 +*May 14, 2020* + +After performing a drastic code refactor, we were able to create several helper methods from Truvari's core functionality around SV comparisons and VCF manipulations. This reusable code gave us an opportunity to create tools relevant for SV analysis. 
+ +Truvari now contains multiple subcommands. In addition to the original benchmarking functionality (`truvari bench`), Truvari can generate SV relevant summary statistics, compute consistency of calls within VCFs, and we've begun to develop annotations for SVs. Details on these tools are on the [WIKI](https://github.com/spiralgenetics/truvari/wiki). + +We are committed to continually improving Truvari with the hopes of advancing the study and analysis of structural variation. + +# Truvari 1.3 +*September 25th, 2019* + +Truvari has some big changes. In order to keep up with the o deement of Python 2.7 https://pythonclock.org/ +We're now only supporting Python 3. + +Additionally, we now package Truvari so it and its dependencies can be installed directly. See Installation +below. This will enable us to refactor the code for easier maintenance and reusability. + +Finally, we now automatically report genotype comparisons in the summary stats. \ No newline at end of file diff --git a/docs/v4.3.1/anno.md b/docs/v4.3.1/anno.md new file mode 100644 index 00000000..55835902 --- /dev/null +++ b/docs/v4.3.1/anno.md @@ -0,0 +1,494 @@ + +Truvari annotations: +* [gcpct](anno#truvari-anno-gcpct) - GC Percent +* [gtcnt](anno#truvari-anno-gtcnt) - Genotype Counts +* [trf](anno#truvari-anno-trf) - Tandem Repeats +* [grm](anno#truvari-anno-grm) - Mappability +* [repmask](anno#truvari-anno-repmask) - Repeats +* [remap](anno#truvari-anno-remap) - Allele Remapping +* [hompct](anno#truvari-anno-hompct) - Homozygous Percent +* [numneigh](anno#truvari-anno-numneigh) - Number of Neighbors +* [svinfo](anno#truvari-anno-svinfo) - SVINFO Fields +* [bpovl](anno#truvari-anno-bpovl) - Annotation Intersection +* [density](anno#truvari-anno-density) - Call Density +* [dpcnt](anno#truvari-anno-dpcnt) - Depth (DP) and Alt-Depth (AD) Counts +* [lcr](anno#truvari-anno-lcr) - Low-complexity Regions +* [grpaf](anno#truvari-anno-grpaf) - Sample Group Allele-Frequency Annotations + +# truvari anno gcpct + +This will add an INFO tag `GCPCT` to each element in a VCF of the GC percent of the call's sequence. + +For deletions, this is the GC percent of the reference range of the call. For insertions, the ALT sequence is analyzed. +``` +usage: gcpct [-h] [-o OUTPUT] -r REFERENCE [input] + +Annotates GC content of SVs + +positional arguments: + input VCF to annotate (stdin) + +options: + -h, --help show this help message and exit + -o OUTPUT, --output OUTPUT + Output filename (stdout) + -r REFERENCE, --reference REFERENCE + Reference fasta +``` + +# truvari anno gtcnt +This will add an INFO tag `GTCNT` to each element in a VCF with the count of genotypes found across all samples. The value is a list of Counts of genotypes for the allele across all samples (UNK, REF, HET, HOM). This is most useful for pVCFs. + +``` +usage: gtcnt [-h] [-o OUTPUT] [input] + +Annotates GTCounts of alleles + +positional arguments: + input VCF to annotate (stdin) + +options: + -h, --help show this help message and exit + -o OUTPUT, --output OUTPUT + Output filename (stdout) +``` + +# truvari anno trf +Adds a tandem repeat annotation to sequence resolved Insertion/Deletion variants a VCF. 
+
+### Annotations added
+| Field Name | Description |
+|------------|-------------------------------------------------------------|
+| TRF | Entry hits a tandem repeat region |
+| TRFdiff | ALT TR copy difference from reference |
+| TRFrepeat | Repeat motif |
+| TRFovl | Percent of ALT covered by TRF annotation |
+| TRFstart | Start position of discovered repeat |
+| TRFend | End position of discovered repeat |
+| TRFperiod | Period size of the repeat |
+| TRFcopies | Number of copies aligned with the consensus pattern |
+| TRFscore | Alignment score |
+| TRFentropy | Entropy measure |
+| TRFsim | Similarity of ALT sequence to generated motif faux sequence |
+
+### Details
+Given a set of tandem repeat regions and a VCF, this annotates the tandem repeat motif and copy number change of insertions and deletions as expansions/contractions. The expected input catalog of tandem repeats is from a subset of columns in the Adotto TR catalog ([link](https://github.com/ACEnglish/adotto/blob/main/regions/DataDescription.md)). This file can be formatted for `truvari anno trf` via:
+```bash
+zcat adotto_TRregions_v1.1.bed.gz | cut -f1-3,18 | bgzip > anno.trf.bed.gz
+tabix anno.trf.bed.gz
+```
+For deletions, the tool simply finds the motif annotation with the highest overlap over the variant's boundaries. It then removes that sequence from the reference and calculates how many copies of the motif are removed with the formula `round(-(ovl_pct * svlen) / anno["period"], 1)`. If a deletion overlaps multiple motifs, the highest scoring motif is chosen based on higher reciprocal overlap percent first and TRF score second (see [code](https://github.com/ACEnglish/truvari/blob/2219f52850252c18dcd8c679da6644bb1cee5b68/truvari/annotations/trf.py#L29)).
+
+For insertions, by default the tool first tries to estimate which motif is contained in the alternate sequence. For each overlapping annotation, the copy number difference of the motif is calculated via `copy_diff = len(entry.alts[0][1:]) / anno["period"]`. Next, a 'faux sequence' is made from `copy_diff` copies of the motif. If the similarity of that sequence to the insertion sequence is above the `--motif-similarity` threshold, that is considered the insertion's motif. If no estimate is above the `--motif-similarity`, the insertion is incorporated into the reference and TRF is run. If the discovered TRF hits match a motif in the tandem repeat regions file, that annotation is used. If the highest scoring TRF hit doesn't match the tandem repeats region file, the nominally de novo annotation is added to the insertion's vcf entry.
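The insertion heuristic described above is compact enough to sketch. The snippet below is illustrative only, not an excerpt from `trf.py`: the `anno` dictionary keys (`"period"`, `"repeat"`) and the use of `truvari.seqsim` as the similarity measure are assumptions made for the example.

```python
import truvari

def estimate_ins_motif(entry, annos, motif_similarity=0.9):
    """Guess which overlapping catalog motif an insertion expands, if any."""
    alt_seq = entry.alts[0][1:]  # drop the anchor base, as in the copy_diff formula
    for anno in annos:  # catalog rows overlapping this entry (assumed dicts)
        copy_diff = len(alt_seq) / anno["period"]
        # build a 'faux' sequence from roughly copy_diff copies of the motif
        faux = anno["repeat"] * max(1, round(copy_diff))
        if truvari.seqsim(alt_seq, faux) >= motif_similarity:
            return anno["repeat"], round(copy_diff, 1)
    # no estimate was similar enough; the tool would fall back to running TRF
    return None
```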
+ +``` +usage: trf [-h] -i INPUT [-o OUTPUT] [-e EXECUTABLE] [-T TRF_PARAMS] -r REPEATS -f REFERENCE [-s MOTIF_SIMILARITY] + [-m MIN_LENGTH] [-R] [--no-estimate] [-C CHUNK_SIZE] [-t THREADS] [--debug] + +Intersect vcf with reference tandem repeats and annotate +variants with the best fitting repeat motif and its copy number +relative to the reference + +options: + -h, --help show this help message and exit + -i INPUT, --input INPUT + VCF to annotate + -o OUTPUT, --output OUTPUT + Output filename (stdout) + -e EXECUTABLE, --executable EXECUTABLE + Path to tandem repeat finder (trf409.linux64) + -T TRF_PARAMS, --trf-params TRF_PARAMS + Default parameters to send to trf (3 7 7 80 5 40 500 -h -ngs) + -r REPEATS, --repeats REPEATS + Reference repeat annotations + -f REFERENCE, --reference REFERENCE + Reference fasta file + -s MOTIF_SIMILARITY, --motif-similarity MOTIF_SIMILARITY + Motif similarity threshold (0.9) + -m MIN_LENGTH, --min-length MIN_LENGTH + Minimum size of entry to annotate (50) + -R, --regions-only Only write variants within --repeats regions (False) + --no-estimate Skip INS estimation procedure and run everything through TRF. (False) + -C CHUNK_SIZE, --chunk-size CHUNK_SIZE + Size (in mbs) of reference chunks for parallelization (5) + -t THREADS, --threads THREADS + Number of threads to use (1) + --debug Verbose logging +``` + +# truvari anno grm + +For every SV, we create a kmer over the the upstream and downstream reference and alternate breakpoints. +We then remap that kmer to the reference genome and report alignment information. +This does not alter the VCF traditional annotations, but instead will create a pandas +DataFrame and save it to a joblib object. + +There are four queries made per-SV. For both reference (r), alternate (a) we create upstream (up) and downstream (dn) kmers. +So the columns are all prefixed with one of "rup_", "rdn_", "aup_", "adn_". + +In the alignment information per-query, there are three 'hit' counts: +- nhits : number of query hits +- dir_hits : direct strand hit count +- com_hits : compliment strand hit count + +The rest of the alignment information is reported by average (avg), maximum (max), and minimum (min) + +The suffixes are: +- q : mapping quality score of the hits +- ed : edit distance of the hits +- mat : number of matches +- mis : number of mismatches + +For example, "aup_avg_q", is the alternate's upstream breakend kmer's average mapping quality score. 
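Because `grm` writes a joblib-serialized pandas DataFrame instead of an annotated VCF, results are meant to be explored downstream. A minimal sketch of that step, assuming the default output name `results.jl` and an arbitrary mapping-quality cutoff of 20:

```python
import joblib

# load the DataFrame produced by `truvari anno grm`
df = joblib.load("results.jl")

# columns follow the prefix/suffix scheme described above, e.g. the
# reference-upstream hit count and the alt-downstream average mapping quality
print(df[["rup_nhits", "adn_avg_q"]].describe())

# flag SVs whose alternate breakend kmers remap with low mapping quality
poorly_mapped = df[(df["aup_avg_q"] < 20) & (df["adn_avg_q"] < 20)]
print(len(poorly_mapped), "SVs with low alt-breakend mapping quality")
```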
+ +``` +usage: grm [-h] -i INPUT -r REFERENCE [-R REGIONS] [-o OUTPUT] [-k KMERSIZE] [-m MIN_SIZE] [-t THREADS] [--debug] + +Maps graph edge kmers with BWA to assess Graph Reference Mappability + +options: + -h, --help show this help message and exit + -i INPUT, --input INPUT + Input VCF + -r REFERENCE, --reference REFERENCE + BWA indexed reference + -R REGIONS, --regions REGIONS + Bed file of regions to parse (None) + -o OUTPUT, --output OUTPUT + Output dataframe (results.jl) + -k KMERSIZE, --kmersize KMERSIZE + Size of kmer to map (50) + -m MIN_SIZE, --min-size MIN_SIZE + Minimum size of variants to map (25) + -t THREADS, --threads THREADS + Number of threads (1) + --debug Verbose logging +``` + +# truvari anno repmask + +``` +usage: repmask [-h] -i INPUT [-o OUTPUT] [-e EXECUTABLE] [-m MIN_LENGTH] [-M MAX_LENGTH] [-t THRESHOLD] [-p PARAMS] [-T THREADS] + [--debug] + + Wrapper around RepeatMasker to annotate insertion sequences in a VCF + +options: + -h, --help show this help message and exit + -i INPUT, --input INPUT + VCF to annotate (None) + -o OUTPUT, --output OUTPUT + Output filename (/dev/stdout) + -e EXECUTABLE, --executable EXECUTABLE + Path to RepeatMasker (RepeatMasker) + -m MIN_LENGTH, --min-length MIN_LENGTH + Minimum size of entry to annotate (50) + -M MAX_LENGTH, --max-length MAX_LENGTH + Maximum size of entry to annotate (50000) + -t THRESHOLD, --threshold THRESHOLD + Threshold for pct of allele covered (0.8) + -p PARAMS, --params PARAMS + Default parameter string to send to RepeatMasker (-pa {threads} -qq -e hmmer -species human -lcambig + -nocut -div 50 -no_id -s {fasta}) + -T THREADS, --threads THREADS + Number of threads to use (1) + --debug Verbose logging +``` + +# truvari anno remap + +Taking the Allele’s sequence, remap it to the reference and annotate based on the closest alignment. + +![](https://github.com/spiralgenetics/truvari/blob/develop/imgs/remap_example.png) + +``` +usage: remap [-h] -r REFERENCE [-o OUTPUT] [-m MINLENGTH] [-t THRESHOLD] [-d DIST] [-H HITS] [--debug] [input] + +Remap VCF'S alleles sequence to the reference to annotate REMAP + +- novel : Allele has no hits in reference +- tandem : Allele's closest hit is within len(allele) bp of the SV's position +- interspersed : Allele's closest hit is not tandem +- partial : Allele only has partial hit(s) less than --threshold + +Which alleles and alignments to consider can be altered with: +- --minlength : minimum SV length to considred (50) +- --dist : For deletion SVs, do not consider alignments that hit within Nbp of the SV's position +(a.k.a. 
alignments back to the source sequence) (10) +- --threshold : Minimum percent of allele's sequence used by alignment to be considered (.8) + +positional arguments: + input Input VCF (/dev/stdin) + +options: + -h, --help show this help message and exit + -r REFERENCE, --reference REFERENCE + BWA indexed reference + -o OUTPUT, --output OUTPUT + Output VCF (/dev/stdout) + -m MINLENGTH, --minlength MINLENGTH + Smallest length of allele to remap (50) + -t THRESHOLD, --threshold THRESHOLD + Threshold for pct of allele covered to consider hit (0.8) + -d DIST, --dist DIST Minimum distance an alignment must be from a DEL's position to be considered (10)) + -H HITS, --hits HITS Report top hits as chr:start-end.pct (max 0) + --debug Verbose logging +``` +# truvari anno hompct + +``` +usage: hompct [-h] -i INPUT [-o OUTPUT] [-b BUFFER] [-m MINANNO] [-M MAXGT] [-c MINCOUNT] [--debug] + +Calcluate the the Hom / (Het + Hom) of variants in the region of SVs +Requires the VCF to contain SVs beside SNPs/Indels + +options: + -h, --help show this help message and exit + -i INPUT, --input INPUT + Compressed, indexed VCF to annotate + -o OUTPUT, --output OUTPUT + Output filename (stdout) + -b BUFFER, --buffer BUFFER + Number of base-pairs up/dn-stream to query (5000) + -m MINANNO, --minanno MINANNO + Minimum size of event to annotate (50) + -M MAXGT, --maxgt MAXGT + Largest event size to count for genotyping (1) + -c MINCOUNT, --mincount MINCOUNT + Minimum number of genotyping events to report HOMPCT (0) + --debug Verbose logging +``` + +# truvari anno numneigh + +``` +usage: numneigh [-h] [-o OUTPUT] [-r REFDIST] [-s SIZEMIN] [--passonly] [--debug] [input] + +For every call within size boundaries, +Add NumNeighbors info field of how many calls are within the distance +Add NeighId clustering field in the same chained neighborhood +For example, +:: + -- is a call, refdist is 2 + - - - - - - + nn: 1 2 1 0 1 1 + id: 0 0 0 1 2 2 + +positional arguments: + input VCF to annotate + +options: + -h, --help show this help message and exit + -o OUTPUT, --output OUTPUT + Output vcf (stdout) + -r REFDIST, --refdist REFDIST + Max reference location distance (1000) + -s SIZEMIN, --sizemin SIZEMIN + Minimum variant size to consider for annotation (50) + --passonly Only count calls with FILTER == PASS + --debug Verbose logging +``` + +# truvari anno svinfo + +Uses `truvari.entry_size` and `truvari.entry_variant_type` on entries >= `args.minsize` to add 'SVLEN' and ‘SVTYPE’ annotations to a VCF’s INFO. + +How SVLEN is determined: +- Starts by trying to use INFO/SVLEN +- If SVLEN is unavailable and ALT field is an SV (e.g. \, \, etc), use abs(vcf.start - vcf.end). The INFO/END tag needs to be available, especially for INS. +- Otherwise, return the size difference of the sequence resolved call using abs(len(vcf.REF) - len(str(vcf.ALT[0]))) + +How SVTYPE is determined: +- Starts by trying to use INFO/SVTYPE +- If SVTYPE is unavailable, infer if entry is a insertion or deletion by looking at the REF/ALT sequence size differences +- If REF/ALT sequences are not available, try to parse the \, \, etc from the ALT column. 
+- Otherwise, assume 'UNK' + +``` +usage: svinfo [-h] [-o OUTPUT] [-m MINSIZE] [input] + +Adds SVTYPE and SVLEN INFO fields + +positional arguments: + input VCF to annotate (stdin) + +options: + -h, --help show this help message and exit + -o OUTPUT, --output OUTPUT + Output filename (stdout) + -m MINSIZE, --minsize MINSIZE + Minimum size of entry to annotate (50) +``` + +# truvari anno bpovl + +After turning a tab-delimited annotation file into an IntervalTree, intersect the start/end and overlap of SVs. +The output is a light-weight pandas DataFrame saved with joblib. The columns in the output are: + +- vcf_key : Variant key from `truvari.entry_to_key` +- intersection : Type of intersection between the SV and the annotation + - start_bnd - SV's start breakpoints hits the annotation + - end_bnd - SV's end breakpoint hits the annotation + - overlaps - Annotation's start/end boundaries are completely within the SV + - contains - Annotation's start/end boundaries span the entire SV +- anno_key : Annotation file's line index + +The idea with this tool is to annotate variants against tab-delimited files, especially when there's a 1-to-N variant to annotations. This tool is useful when used in conjunction with `truvari vcf2df` and pandas DataFrames. + +For example, if we have a VCF of SVs and a GTF of genes/gene features from Ensmbl. Any SV may intersect multiple features, which doesn't lend itself well to directly annotating the VCF's INFO. After using `bpovl`, we'll use Truvari to convert the SVs to a DataFrame. + +```bash +truvari anno bpovl -i variants.vcf.gz -a genes.gtf.gz -o anno.jl -p gff +truvari vcf2df variants.vcf.gz variants.jl +``` + +We can then treat the files similar to a database and do queries and joins to report which variants intersect which annotations. + +```python +import joblib +from gtfparse import read_gtf +variants = joblib.load("variants.jl") +genes = read_gtf("genes.gtf.gz") +annos = joblib.load("anno.jl") +to_check = annos.iloc[0] + +print(to_check) +# vcf_key chr20:958486-958487.A +# intersection start_bnd +# anno_key 11 + +print(variants.loc[to_check['vcf_key']]) +# id None +# svtype INS +# ... etc + +print(annos.loc[to_check['anno_key']]) +# seqname chr20 +# source ensembl_havana +# feature exon +# start 958452 +# ... etc +``` + +Similar to tabix, `bpovl` has presets for known file types like bed and gff. But any tab-delimited file with sequence/chromosome, start position, and end position can be parsed. Just set the "Annotation File Arguments" to the 0-based column indexes. For example, a bed file +has arguments `-s 0 -b 1 -e 2 -c #`. + +``` +usage: bpovl [-h] -a ANNO -o OUTPUT [--sizemin SIZEMIN] [--spanmax SPANMAX] [-p {bed,gff}] [-c COMMENT] [-s SEQUENCE] [-b BEGIN] + [-e END] [-1] + [input] + +Creates intersection of features in an annotation file with SVs' breakpoints and overlap + +positional arguments: + input VCF to annotate (stdin) + +options: + -h, --help show this help message and exit + -a ANNO, --anno ANNO Tab-delimited annotation file + -o OUTPUT, --output OUTPUT + Output joblib DataFrame + --sizemin SIZEMIN Minimum size of variant to annotate (50) + --spanmax SPANMAX Maximum span of SVs to annotate (50000) + +Annotation File Arguments: + -p {bed,gff}, --preset {bed,gff} + Annotation format. This option overwrites -s, -b, -e, -c and -1 (None) + -c COMMENT, --comment COMMENT + Skip lines started with character. (#) + -s SEQUENCE, --sequence SEQUENCE + Column of sequence/chromosome name. 
(0) + -b BEGIN, --begin BEGIN + Column of start chromosomal position. (1) + -e END, --end END Column of end chromosomal position. (2) + -1, --one-based The position in the anno file is 1-based rather than 0-based. (False) +``` +# truvari anno density +Partitions a `--genome` into `--windowsize` regions and count how many variants overlap. Annotate +regions with no variants as 'sparse' and with greater than or equal to (mean + `--threshold` * standard +deviation) number of variants as 'dense'. Outputs a joblib DataFrame with columns +`chrom, start, end, count, anno`. + +``` +usage: density [-h] -g GENOME -o OUTPUT [-m MASK] [-w WINDOWSIZE] [-s STEPSIZE] [-t THRESHOLD] [input] + +Identify 'dense' and 'sparse' variant windows of the genome + +positional arguments: + input Input VCF (/dev/stdin) + +optional arguments: + -h, --help show this help message and exit + -g GENOME, --genome GENOME + Genome bed file + -o OUTPUT, --output OUTPUT + Output joblib DataFrame + -m MASK, --mask MASK Mask bed file + -w WINDOWSIZE, --windowsize WINDOWSIZE + Window size (10000) + -s STEPSIZE, --stepsize STEPSIZE + Window step size (10000) + -t THRESHOLD, --threshold THRESHOLD + std for identifying 'dense' regions (3) +``` + +# truvari anno dpcnt + +For multi-sample VCFs, it is often useful to have summarized depth (DP) information across samples per-variant. This adds a `INFO/DPCNT` with counts of how many samples have `FORMAT/DP` for each of the user-defined bins. Bins are incremented using `bisect` e.g. `pos = bisect.bisect(bins, dp); bins[pos] += 1; + +``` +usage: dpcnt [-h] [-b BINS] [--no-ad] [-p] [-o OUTPUT] [input] + +Quick utility to count how many samples have >= Nx coverage per-variant + +positional arguments: + input VCF to annotate (stdin) + +options: + -h, --help show this help message and exit + -b BINS, --bins BINS Coverage bins to bisect left the counts (0,5,10,15) + --no-ad Skip adding ADCNT bins + -p, --present Only count sites with present (non ./.) genotypes + -o OUTPUT, --output OUTPUT + Output filename (stdout) +``` + +# truvari anno lcr + +``` +usage: lcr [-h] [-o OUTPUT] [input] + +Annotate low complexity region entropy score for variants +Credit: https://jszym.com/blog/dna_protein_complexity/ + +positional arguments: + input VCF to annotate (stdin) + +options: + -h, --help show this help message and exit + -o OUTPUT, --output OUTPUT + Output filename (stdout) +``` + +# truvari anno grpaf + +Add INFO tags of allele frequency annotations for groups of samples. For every group in `--labels` tab-delimited file, calculate the AF,MAF,ExcHet,HWE,MAC,AC for the samples in the group. Adds INFO tags with suffix of the group identifier (e.g. `AF_EAS`). `--strict` will hard fail if there are samples in the `--labels` not present in the vcf header. 
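+
+A minimal, hypothetical run might look like the following (the sample names and group labels are made up):
+```bash
+# labels.txt is tab-delimited: <sample><TAB><group>
+printf 'HG00096\tEUR\nHG00403\tEAS\nNA19017\tAFR\n' > labels.txt
+
+truvari anno grpaf -l labels.txt -t AF,AC input.vcf.gz | bgzip > anno.grpaf.vcf.gz
+# the output now carries per-group INFO tags such as AF_EUR, AC_EAS, etc.
+```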
+ +``` +usage: grpaf [-h] [-o OUTPUT] -l LABELS [-t TAGS] [--strict] [--debug] [input] + +Add allele frequency annotations for subsets of samples + +positional arguments: + input VCF to annotate + +options: + -h, --help show this help message and exit + -o OUTPUT, --output OUTPUT + Output filename (stdout) + -l LABELS, --labels LABELS + Tab-delimited file of sample and group + -t TAGS, --tags TAGS Comma-separated list of tags to add from AF,MAF,ExcHet,HWE,MAC,AC,AN (all) + --strict Exit if sample listed in labels is not present in VCF (False) + --debug Verbose logging +``` \ No newline at end of file diff --git a/docs/v4.3.1/bench.md b/docs/v4.3.1/bench.md new file mode 100644 index 00000000..145bf974 --- /dev/null +++ b/docs/v4.3.1/bench.md @@ -0,0 +1,295 @@ + +Quick start +=========== +Run this command where base is your 'truth set' SVs and comp is the comparison set of SVs. +```bash +truvari bench -b base_calls.vcf -c comp_calls.vcf -o output_dir/ +``` + +Matching Parameters +=================== +Picking matching parameters can be more of an art than a science. It really depends on the precision of your callers and the tolerance you wish to allow them such that it is a fair comparison. + +For example, depth of coverage callers (such as CNVnator) will have very 'fuzzy' boundaries, and don't report the exact deleted sequence but only varying regions. So thresholds of `pctseq=0`, `pctsize=.5`, `pctovl=.5`, `refdist=1000` may seem fair. + +[BioGraph](https://github.com/spiralgenetics/biograph) and many long-read callers report precise breakpoints and full alternate allele sequences. When benchmarking those results, we want to ensure our accuracy by using the stricter default thresholds. + +If you're still having trouble picking thresholds, it may be beneficial to do a few runs of Truvari bench over different values. Start with the strict defaults and gradually increase the leniency. From there, you can look at the performance metrics and manually inspect differences between the runs to find out what level you find acceptable. Truvari is meant to be flexible for comparison. More importantly, Truvari helps one clearly report the thresholds used for reproducibility. + +Here is a rundown of each matching parameter. +| Parameter | Default | Definition | +|------------|---------|------------| +| refdist | 500 | Maximum distance comparison calls must be within from base call's start/end | +| pctseq | 0.7 | Edit distance ratio between the REF/ALT haplotype sequences of base and
comparison call. See "Comparing Sequences of Variants" below. | +| pctsize | 0.7 | Ratio of min(base_size, comp_size)/max(base_size, comp_size) | +| pctovl | 0.0 | Ratio of two calls' (overlapping bases)/(longest span) | +| typeignore | False | Types don't need to match to compare calls. | + +Below are matching parameter diagrams to illustrate (approximately) how they work. + +``` + █ = Deletion ^ = Insertion + +--refdist REFDIST (500) + Max reference location distance + + ACTGATCATGAT + |--████--| + █████ + + Calls are within reference distance of 2 + +--pctsize PCTSIZE (0.7) + Min pct allele size similarity + + ACTGATCATGA sizes + █████ -> 5bp + ████ -> 4bp + + variants have 0.8 size similarity + + +--pctovl PCTOVL (0.0) + Min pct reciprocal overlap + + ACTGATCATGA ranges + █████ [2,7) + ████ [4,8) + + variants have 0.6 reciprocial overlap + + +--pctseq PCTSEQ (0.7) + Min percent allele sequence similarity + + A-CTG-ACTG + ^ ^ haplotypes + | └ACTG -> CTGACTGA + └CTGA -> CTGACTGA + + haplotypes have 100% sequence similarity +``` + +Outputs +======= +Truvari bench writes the following files to the `--output` directory. +| File | Description | +|----------------------|---------------------------------------------| +| tp-base.vcf.gz | True positive calls form the base VCF | +| tp-comp.vcf.gz | True positive calls from the comparison VCF | +| fp.vcf.gz | False positive calls from comparison | +| fn.vcf.gz | False negative calls from base | +| summary.json | Json output of performance stats | +| params.json | Json output of parameters used | +| candidate.refine.bed | Bed file of regions for `refine` | +| log.txt | Run's log | + +summary.json +------------ +Stats generated by benchmarking are written to `summary.json`. + +| Metric | Definition | +|----------------|------------------------------------------------------------| +| TP-base | Number of matching calls from the base vcf | +| TP-comp | Number of matching calls from the comp vcf | +| FP | Number of non-matching calls from the comp vcf | +| FN | Number of non-matching calls from the base vcf | +| precision | TP-comp / (TP-comp + FP) | +| recall | TP-base / (TP-base + FN) | +| f1 | 2 * ((recall * precision) / (recall + precision)) | +| base cnt | Number of calls in the base vcf | +| comp cnt | Number of calls in the comp vcf | +| TP-comp_TP-gt | TP-comp with genotype match | +| TP-comp_FP-gt | TP-comp without genotype match | +| TP-base_TP-gt | TP-base with genotype match | +| TP-base_FP-gt | TP-base without genotype match | +| gt_concordance | TP-comp_TP-gt / (TP-comp_TP-gt + TP-comp_FP-gt) | +| gt_matrix | Base GT x Comp GT Matrix of all Base calls' best, TP match | +| weighted | Metrics weighed by variant sequence/size similarity | + +The `gt_matrix` is a table. For example: +```json +"gt_matrix": { + "(0, 1)": { + "(0, 1)": 500, + "(1, 1)": 10 + }, + "(1, 1)": { + "(1, 1)": 800, + "(0, 1)": 20 + } +} +``` +Represents -> +``` +comp (0,1) (1,1) +base +(0,1) 500 10 +(1,1) 20 800 +``` + +Added annotations +----------------- +The output vcfs are annotated with INFO fields and then sorted, compressed, and indexed inside of the output directory. + +| Anno | Definition | +|-------------------|-----------------------------------------------------------------------------------------------------------------| +| TruScore | Truvari score for similarity of match. 
`((pctseq + pctsize + pctovl) / 3 * 100)` | +| PctSeqSimilarity | Pct sequence similarity between this variant and its closest match | +| PctSizeSimilarity | Pct size similarity between this variant and its closest match | +| PctRecOverlap | Percent reciprocal overlap percent of the two calls | +| StartDistance | Distance of the base call's start from comparison call's start | +| EndDistance | Distance of the base call's end from comparison call's end | +| SizeDiff | Difference in size of base and comp calls | +| GTMatch | Base/comp calls' Genotypes match | +| MatchId | Id to help tie base/comp calls together {chunkid}.{baseid}.{compid} See [[MatchIds wiki\|MatchIds]] for details. | + + +Refining bench output +===================== +As described in the [[refine wiki|refine]], a limitation of Truvari bench is 1-to-1 variant comparison. However, `truvari refine` can harmonize the variants to give them more consistent representations. A bed file named `candidate.refine.bed` is created by `truvari bench` and holds a set of regions which may benefit from refinement. To use it, simply run +```bash +truvari bench -b base.vcf.gz -c comp.vcf.gz -o result/ +truvari refine --regions result/candidate.refine.bed \ + --reference reference.fasta \ + --recount --use-region-coords \ + result/ +``` +See [[refine wiki|refine]] for details. + +Comparing Sequences of Variants +=============================== + +Truvari has implemented two approaches to compare variant sequences. The default comparison is called 'unroll'. Optionally, a `--reference` can be provided and Truvari will use the reference context of a pair of variants for comparison. + +## Unroll +The method of giving a pair of calls the same reference context can be achieved using an 'unroll' method. For a formal description, see [this gist](https://gist.github.com/ACEnglish/1e7421c46ee10c71bee4c03982e5df6c). + +The main idea is that in order to move variants upstream/downstream, the reference sequence flanking the variant will need to be moved downstream/upstream respectively. Or, to say this another way, we can think of the alternate sequences as being circular instead of linear. This means that in order to move the variant e.g. 1bp downstream for an INS, we could remove the first base from the ALT and append it to the end. So in the 'ab' example used to describe "Reference context" below, we only need to unroll the insertion at a position by the distance between it and another variant e.g. the INS `ab` at POS 2 becomes identical to the INS `ba` at POS 1 by rolling `2-1 = 1` bases from the start to the end. + +This unroll method has a number of benefits and a couple of considerations, including: +* not needing a `--reference` for comparison, which saves I/O time +* increasing the number of correctly matching SVs +* decreasing the number of 'suspect' matches in smaller size regimes +* providing a simpler pattern between PctSizeSimilarity and PctSeqSimilarity + +## Reference context +For the reference context method, consider a hypothetical tandem repeat expansion of the reference sequence 'AB'. Here, we'll represent the 'insertion' sequence as lower-case 'ab', though it should be considered equivalent to 'AB'. Three equally valid descriptions of this +variant would be: + +```text +#POS INS Haplotype + 0 ab abAB + 1 ba AbaB + 2 ab ABab +``` + +Therefore, to compare the sequence similarity, Truvari builds the haplotypes over the range of a pair of calls' +`min(starts):max(ends)` before making the the sequence change introduced by the variants. 
In python, this line
+looks like:
+
+``` python
+hap1_seq = ref.get_seq(a1_chrom, start + 1, a1_start).seq + a1_seq + ref.get_seq(a1_chrom, a1_end + 1, end).seq
+```
+
+Where `a1_seq` is the longer of the REF or ALT allele.
+
+## SVs without sequences
+
+If the base or comp vcfs do not have sequence resolved calls (e.g. `<DEL>`), simply set `--pctseq=0` to turn off
+sequence comparison. The `--reference` does not need to be provided when not using sequence comparison. If
+`--pctseq != 0` and an unresolved SV is encountered, a warning will be raised and the variant will not be compared.
+
+Controlling the number of matches
+=================================
+
+How many matches a variant is allowed to participate in is controlled by the `--pick` parameter. The available pickers are `single`, `ac`, and `multi`.
+
+* `single` (the default option) allows each variant to participate in up to one match.
+* `ac` uses the genotype allele count to control how many matches a variant can have. This means a homozygous alternate variant can participate in two matches (its GT is 1/1 so AC=2). A heterozygous variant can only participate in one match (GT 0/1, AC=1). And a homozygous reference variant cannot be matched. Note that missing genotypes are considered reference alleles and do not add to the AC (e.g. GT ./1, AC=1).
+* `multi` allows variants to participate in all available matches.
+
+As an example, imagine we have three variants in a pVCF with two samples we want to compare.
+
+```
+CHROM POS ID REF ALT base comp
+chr20 17785968 ID1 A ACGCGCGCGCG 1/1 1/0
+chr20 17785968 ID2 A ACGCGCGCGCGCG 0/0 0/1
+chr20 17785969 ID3 C CGCGCGCGCGCGC 0/0 1/1
+```
+
+To compare samples inside the same vcf, we would use the command:
+```bash
+truvari bench -b input.vcf.gz -c input.vcf.gz -o output/ --bSample base --cSample comp --no-ref a
+```
+
+This VCF produces different results depending on the `--pick` parameter:
+
+| Parameter | ID1 State | ID2 State | ID3 State |
+|-----------|-----------|-----------|-----------|
+| single | TP | FP | FP |
+| ac | TP | TP | FP |
+| multi | TP | TP | TP |
+
+--dup-to-ins
+============
+
+Most SV benchmarks only report DEL and INS SVTYPEs. The flag `--dup-to-ins` will interpret SVs with SVTYPE == DUP as SVTYPE == INS. Note that DUPs generally aren't sequence resolved (i.e. the ALT isn't a sequence) like INS. Therefore, `--dup-to-ins` typically should be used without sequence comparison via `--pctseq 0`.
+
+Size filtering
+==============
+
+`--sizemax` is the maximum size of a base or comparison call to be considered.
+
+`--sizemin` is the minimum size of a base call to be considered.
+
+`--sizefilt` is the minimum size of a comparison call that will be matched to base calls. It can
+be less than `sizemin` for edge case variants.
+
+For example: Imagine `sizemin` is set at 50 and `sizefilt` at 30, and a 50bp base call is 98% similar to a 49bp comparison
+call at the same position.
+
+These two calls could be considered matching. However, if we removed comparison calls less than `sizemin`,
+we'd incorrectly classify the 50bp base call as a false negative. Instead, we allow comparison calls between `[sizefilt,sizemin)` to find matches.
+
+This has the side effect of artificially inflating specificity. For example, if that same 49bp call described
+above were below the similarity threshold, it would not be classified as a FP since it is below the `sizemin`
+threshold. So we're giving the call a better chance to be useful and less chance to be detrimental
+to final statistics.
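+
+A small sketch of this size-gating logic (illustrative, not Truvari's actual code) may make the asymmetry clearer:
+```python
+def base_considered(size, sizemin=50, sizemax=50000):
+    # base calls must clear the full sizemin threshold
+    return sizemin <= size <= sizemax
+
+def comp_considered(size, sizefilt=30, sizemax=50000):
+    # comparison calls only need to clear the smaller sizefilt threshold,
+    # so a near-threshold base call can still find its match
+    return sizefilt <= size <= sizemax
+
+# A 49bp comparison call can still match a 50bp base call...
+print(base_considered(50), comp_considered(49))  # True True
+# ...but an unmatched 49bp comparison call is not counted as a FP,
+# because it is below sizemin.
+print(base_considered(49))                       # False
+```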
+ +Include Bed & VCF Header Contigs +================================ + +If an `--includebed` is provided, only base and comp calls contained within the defined regions are used +for comparison. This is similar to pre-filtering your base/comp calls using: + +```bash +(zgrep "#" my_calls.vcf.gz && bedtools intersect -u -a my_calls.vcf.gz -b include.bed) | bgzip > filtered.vcf.gz +``` + +with the exception that Truvari requires the start and the end to be contained in the same includebed region +whereas `bedtools intersect` does not. + +If an `--includebed` is not provided, the comparison is restricted to only the contigs present in the base VCF +header. Therefore, any comparison calls on contigs not in the base calls will not be counted toward summary +statistics and will not be present in any output vcfs. + +Extending an Include Bed +------------------------ +The option `--extend` extends the regions of interest (set in `--includebed` argument) by the given number of bases on each side, allowing base variants to match comparison variants that are just outside of the original region. If a comparison variant is in the extended regions it can potentially match a base variant that is in the original regions turning it to TP. Comparison variants in the extended regions that don't have a match are not counted as FP. This strategy is similar to the one implemented for size matching where only the base variants longer than sizemin (equal to 50 by default) are considered, but they are allowed to match shorter comparison variants sizefilt (30bp by default) or longer. + +See this [discussion](https://github.com/ACEnglish/truvari/discussions/99)for details. + +Methodology +=========== +Here is a high-level pseudocode description of the steps Truvari bench conducts to compare the two VCFs. +``` +* zip the Base and Comp calls together in sorted order +* create chunks of all calls overlapping within ±`--chunksize` basepairs +* make a |BaseCall| x |CompCall| match matrix for each chunk +* build a Match for each call pair in the chunk - annotate as TP if >= all thresholds +* if the chunk has no Base or Comp calls +** return them all as FNs/FPs +* use `--pick` method to sort and annotate variants with their best match +``` +![](https://github.com/acenglish/truvari/blob/develop/imgs/TruvariBenchMethod.png) \ No newline at end of file diff --git a/docs/v4.3.1/collapse.md b/docs/v4.3.1/collapse.md new file mode 100644 index 00000000..7ac22adb --- /dev/null +++ b/docs/v4.3.1/collapse.md @@ -0,0 +1,163 @@ +`collapse` is Truvari's approach to SV merging. After leveraging `bcftools` to merge VCFs, `truvari collapse` can then iterate over the calls and create clusters of SVs that match over the [provided thresholds](https://github.com/spiralgenetics/truvari/wiki/bench#matching-parameters). This is also useful when removing potentially redundant calls within a single sample. + +Example +======= +To start, we merge multiple VCFs (each with their own sample) and ensure there are no multi-allelic entries via: +```bash +bcftools merge -m none one.vcf.gz two.vcf.gz | bgzip > merge.vcf.gz +``` + +This will `paste` SAMPLE information between vcfs when calls have the exact same chrom, pos, ref, and alt. +For example, consider two vcfs: + + >> one.vcf: + chr1 1 ... GT 0/1 + chr1 5 ... GT 1/1 + >> two.vcf: + chr1 1 ... GT 1/1 + chr1 7 ... GT 0/1 + +`bcftools merge` creates: + + >> merge.vcf: + chr1 1 ... GT 0/1 1/1 + chr1 5 ... GT 1/1 ./. + chr1 7 ... GT ./. 
0/1 + +This VCF can then be collapsed to allow 'fuzzier' matching than the exact merge just performed. + +```bash +truvari collapse -i merge.vcf.gz -o truvari_merge.vcf -c truvari_collapsed.vcf +``` + +For example, if we collapsed our example merge.vcf by matching any calls within 3bp, we'd create: + + >> truvari_merge.vcf + chr1 1 ... GT 0/1 1/1 + chr1 5 ... GT 1/1 0/1 + >> truvari_collapsed.vcf + chr1 7 ... GT ./. 0/1 + +--choose behavior +================= +When collapsing, the default `--choose` behavior is to take the `first` variant by position from a cluster to +be written to the output while the others will be placed in the collapsed output. +Other choosing options are `maxqual` (the call with the highest quality score) or `common` (the call with the highest minor allele count). + +Samples with no genotype information in the kept variant will be filled by the first +collapsed variant containing genotype information. + +--gt +==== +For some results, one may not want to collapse variants with conflicting genotypes from a single sample. With the `--gt all` parameter, variants which are present (non `0/0` or `./.`) in the same sample are not collapsed. With the `-gt het` parameter, only variants which are both heterozygous in a sample (e.g. `0/1` and `0/1`) are prevented from collapsing. The `--gt het` is useful for some SV callers which will redundantly call variants and typically genotype them all as `1/1`. + +--intra +======= +When a single sample is run through multiple SV callers, one may wish to consolidate those results. After the `bcftools merge` of the VCFs, there will be one SAMPLE column per-input. With `--intra`, collapse will consolidate the sample information so that only a single sample column is present in the output. Since the multiple callers may have different genotypes or other FORMAT fields with conflicting information, `--intra` takes the first column from the VCF, then second, etc. For example, if we have an entry with: +``` +FORMAT RESULT1 RESULT2 +GT:GQ:AD ./.:.:3,0 1/1:20:0,30 +``` +The `--intra` output would be: +``` +FORMAT RESULT1 +GT:GQ:AD 1/1:20:3,0 +``` +As you can see in this example, 1) The first sample name is the only one preserved. 2) conflicting FORMAT fields can be consolidated in a non-useful way (here the AD of `3,0` isn't informative to a `1/1` genotype). We're working to provide an API to help users write custom intra-sample consolidation scripts. + +--hap mode +========== +When using `--hap`, we assume phased variants from a single individual. Only the +single best matching call from the other haplotype will be collapsed, +and the consolidated genotype will become 1/1 + +For example, if we collapse anything at the same position: + + chr1 1 .. GT 0|1 + chr1 1 .. GT 1|0 + chr1 2 .. GT 1|0 + +will become: + + chr1 1 .. GT 1/1 + chr1 2 .. GT 1|0 + +--chain mode +============ +Normally, every variant in a set of variants that are collapsed together matches every other variant in the set. However, when using `--chain` mode, we allow 'transitive matching'. This means that all variants match to only at least one other variant in the set. In situations where a 'middle' variant has two matches that don't match each other, without `--chain` the locus will produce two variants whereas using `--chain` will produce one. +For example, if we have + + chr1 5 .. + chr1 7 .. + chr1 9 .. + +When we collapse anything within 2bp of each other, without `--chain`, we output: + + chr1 5 .. + chr1 9 .. 
+ +With `--chain`, we would collapse `chr1 9` as well, producing + + chr1 5 .. + +Annotations +=========== +`collapse` produces two files. The output file has kept variants along with unanalyzed (< sizemin) variants. The collapsed file contains the variants that were collapsed into the kept variants. + +The output file has only two annotations added to the `INFO`. +- `CollapseId` - Identifier of the variant when comparing to the collapse outputs. +- `NumCollapsed` - Number of variants collapsed into this variant +- `NumConsolidated` - Number of samples' genotypes consolidated into this call's genotypes + +The collapsed file has all of the annotations added by [[bench|bench#definition-of-annotations-added-to-tp-vcfs]]. Note that `MatchId` is tied to the output file's `CollapseId`. See [MatchIds](https://github.com/spiralgenetics/truvari/wiki/MatchIds) for details. + +``` +usage: collapse [-h] -i INPUT [-o OUTPUT] [-c COLLAPSED_OUTPUT] [-f REFERENCE] [-k {first,maxqual,common}] [--debug] + [-r REFDIST] [-p PCTSIM] [-B MINHAPLEN] [-P PCTSIZE] [-O PCTOVL] [-t] [--use-lev] [--hap] [--chain] + [--no-consolidate] [--null-consolidate NULL_CONSOLIDATE] [-s SIZEMIN] [-S SIZEMAX] [--passonly] + +Structural variant collapser + +Will collapse all variants within sizemin/max that match over thresholds + +options: + -h, --help show this help message and exit + -i INPUT, --input INPUT + Comparison set of calls + -o OUTPUT, --output OUTPUT + Output vcf (stdout) + -c COLLAPSED_OUTPUT, --collapsed-output COLLAPSED_OUTPUT + Where collapsed variants are written (collapsed.vcf) + -f REFERENCE, --reference REFERENCE + Indexed fasta used to call variants + -k {first,maxqual,common}, --keep {first,maxqual,common} + When collapsing calls, which one to keep (first) + --debug Verbose logging + --hap Collapsing a single individual's haplotype resolved calls (False) + --chain Chain comparisons to extend possible collapsing (False) + --no-consolidate Skip consolidation of sample genotype fields (True) + --null-consolidate NULL_CONSOLIDATE + Comma separated list of FORMAT fields to consolidate into the kept entry by taking the first non-null + from all neighbors (None) + +Comparison Threshold Arguments: + -r REFDIST, --refdist REFDIST + Max reference location distance (500) + -p PCTSIM, --pctsim PCTSIM + Min percent allele sequence similarity. Set to 0 to ignore. (0.95) + -B MINHAPLEN, --minhaplen MINHAPLEN + Minimum haplotype sequence length to create (50) + -P PCTSIZE, --pctsize PCTSIZE + Min pct allele size similarity (minvarsize/maxvarsize) (0.95) + -O PCTOVL, --pctovl PCTOVL + Min pct reciprocal overlap (0.0) for DEL events + -t, --typeignore Variant types don't need to match to compare (False) + --use-lev Use the Levenshtein distance ratio instead of edlib editDistance ratio (False) + +Filtering Arguments: + -s SIZEMIN, --sizemin SIZEMIN + Minimum variant size to consider for comparison (50) + -S SIZEMAX, --sizemax SIZEMAX + Maximum variant size to consider for comparison (50000) + --passonly Only consider calls with FILTER == PASS +``` \ No newline at end of file diff --git a/docs/v4.3.1/consistency.md b/docs/v4.3.1/consistency.md new file mode 100644 index 00000000..17ba23b7 --- /dev/null +++ b/docs/v4.3.1/consistency.md @@ -0,0 +1,179 @@ + +In addition to looking at performance of a single set of variation against a baseline, one may wish to measure the consistency between multiple sets of variation. The tool `truvari consistency` can automatically create that result. 
+ +Running +======= + +``` +usage: consistency [-h] [-d] [-j] [-o OUTPUT] VCFs [VCFs ...] + +Over multiple vcfs, calculate their intersection/consistency. + +Calls will match between VCFs if they have a matching key of: + CHROM POS ID REF ALT + +positional arguments: + VCFs VCFs to intersect + +optional arguments: + -h, --help show this help message and exit + -d, --no-dups Disallow duplicate SVs + -j, --json Output report in json format + -o OUTPUT, --output OUTPUT + Write tsv of variant keys and their flag +``` + +Example +======= + +```bash +truvari consistency fileA.vcf fileB.vcf fileC.vcf +``` + +Matching Entries +================ + +VCF entries will be considered matching if and only if they have an exact same key of `CHROM POS ID REF ALT`. Because of this stringency, it is recommend that you compare the tp-base.vcf or fn.vcf results from each individual VCF's Truvari output. The key and flags can be written with the `--output` option. + +Duplicates +========== + +If there are VCFs with duplicate keys, they are handled by appending a e.g. `.1` to the key. If you'd like to ignore duplicates, just add `--no-dups` + +Output Report +============= + +Below is an example report: + +```text +# +# Total 5534 calls across 3 VCFs +# +#File NumCalls +fileA.vcf 4706 +fileB.vcf 4827 +fileC.vcf 4882 +# +# Summary of consistency +# +#VCFs Calls Pct +3 3973 71.79% +2 935 16.90% +1 626 11.31% +# +# Breakdown of VCFs' consistency +# +#Group Total TotalPct PctOfFileCalls +111 3973 71.79% 84.42% 82.31% 81.38% +011 351 6.34% 7.27% 7.19% +101 308 5.57% 6.54% 6.31% +110 276 4.99% 5.86% 5.72% +001 250 4.52% 5.12% +010 227 4.10% 4.70% +100 149 2.69% 3.17% +``` + +At the top we see that we compared 5,534 unique variants between the 3 files, with fileC.vcf having the most calls at 4,882. + +The "Summary of consistency" shows us that 3,973 (71.79%) of all the calls are shared between the 3 VCFs, while 626 (11.31%) are only found in one of the VCFs. + +Reading the "Breakdown of VCFs' consistency", a `Group` is a unique key for presence (1) or absence (0) of a call within each of the listed `#Files`. For example: `Group 111` is calls present in all VCFs; `Group 011` is calls present in only the 2nd and 3rd VCFs (i.e. fileB.vcf and fileC.vcf). + +We see that `Group 101` has calls belonging to the 1st and 3rd `#Files` (i.e. fileA.vcf and fileC.vcf). This group has a total of 308 calls that intersect, or 5.57% of all calls in all VCFs. This 308 represents 6.54% of calls in fileA.vcf and 6.31% of calls in fileC.vcf. + +Finally, we see that fileA.vcf has the least amount of calls unique to it on the `Group 100` line. + +Json +==== +Below is a consistency report in json format. 
+```json +{ + "vcfs": [ + "repo_utils/test_files/variants/input1.vcf.gz", + "repo_utils/test_files/variants/input2.vcf.gz", + "repo_utils/test_files/variants/input3.vcf.gz" + ], + "total_calls": 3513, + "num_vcfs": 3, + "vcf_counts": { + "repo_utils/test_files/variants/input1.vcf.gz": 2151, + "repo_utils/test_files/variants/input2.vcf.gz": 1783, + "repo_utils/test_files/variants/input3.vcf.gz": 2065 + }, + "shared": [ + { + "vcf_count": 3, + "num_calls": 701, + "call_pct": 0.1995445488186735 + }, + { + "vcf_count": 2, + "num_calls": 1084, + "call_pct": 0.3085681753487048 + }, + { + "vcf_count": 1, + "num_calls": 1728, + "call_pct": 0.4918872758326217 + } + ], + "detailed": [ + { + "group": "111", + "total": 701, + "total_pct": 0.1995445488186735, + "repo_utils/test_files/variants/input1.vcf.gz": 0.32589493258949326, + "repo_utils/test_files/variants/input2.vcf.gz": 0.393157599551318, + "repo_utils/test_files/variants/input3.vcf.gz": 0.3394673123486683 + }, + { + "group": "001", + "total": 645, + "total_pct": 0.18360375747224594, + "repo_utils/test_files/variants/input1.vcf.gz": 0, + "repo_utils/test_files/variants/input2.vcf.gz": 0, + "repo_utils/test_files/variants/input3.vcf.gz": 0.31234866828087166 + }, + { + "group": "100", + "total": 598, + "total_pct": 0.17022487902077996, + "repo_utils/test_files/variants/input1.vcf.gz": 0.2780102278010228, + "repo_utils/test_files/variants/input2.vcf.gz": 0, + "repo_utils/test_files/variants/input3.vcf.gz": 0 + }, + { + "group": "101", + "total": 487, + "total_pct": 0.1386279533162539, + "repo_utils/test_files/variants/input1.vcf.gz": 0.22640632264063226, + "repo_utils/test_files/variants/input2.vcf.gz": 0, + "repo_utils/test_files/variants/input3.vcf.gz": 0.2358353510895884 + }, + { + "group": "010", + "total": 485, + "total_pct": 0.13805863933959578, + "repo_utils/test_files/variants/input1.vcf.gz": 0, + "repo_utils/test_files/variants/input2.vcf.gz": 0.27201346045989905, + "repo_utils/test_files/variants/input3.vcf.gz": 0 + }, + { + "group": "110", + "total": 365, + "total_pct": 0.10389980074010817, + "repo_utils/test_files/variants/input1.vcf.gz": 0.1696885169688517, + "repo_utils/test_files/variants/input2.vcf.gz": 0.2047111609646663, + "repo_utils/test_files/variants/input3.vcf.gz": 0 + }, + { + "group": "011", + "total": 232, + "total_pct": 0.06604042129234272, + "repo_utils/test_files/variants/input1.vcf.gz": 0, + "repo_utils/test_files/variants/input2.vcf.gz": 0.13011777902411667, + "repo_utils/test_files/variants/input3.vcf.gz": 0.11234866828087167 + } + ] +} +``` \ No newline at end of file diff --git a/docs/v4.3.1/divide.md b/docs/v4.3.1/divide.md new file mode 100644 index 00000000..3bbad3cb --- /dev/null +++ b/docs/v4.3.1/divide.md @@ -0,0 +1,58 @@ +Divide a VCF into independent shards. + +Unfortunately, `pysam.VariantRecord` objects aren't pickle-able. This means that if we wanted to have Truvari leverage python's multiprocessing we'd need to make a custom VCF parser. However, the command `truvari divide` allows us to take an input VCF and divide it into multiple independent parts (or shards) which can the be processed over multiple processes. + +`truvari divide` works by parsing a VCF and splitting it into multiple, smaller sub-VCFs. If any variants are within `--buffer` base-pairs, they're output to the same sub-VCF. This allows variants in the same region which would need to be compared to one-another (see `--refdist`) to stay in the same sub-VCF. 
The `--min` parameter allows us to control the minimum number of variants per sub-VCF so that we don't make too many tiny VCFs. Once the sub-VCFs are created, we can process each independently through whatever truvari command.
+
+For example, let's say we want to run `truvari collapse` on a very large VCF with many variants and many samples. First, we divide the VCF:
+
+```bash
+truvari divide big_input.vcf.gz sub_vcfs_directory/
+```
+
+Inside of `sub_vcfs_directory/` we'll have multiple VCFs, which we can process with a simple bash script:
+
+```bash
+NJOBS=$(nproc) # use all processors by default
+mkdir -p output_vcfs/
+mkdir -p collap_vcfs/
+mkdir -p logs/
+
+for in_vcf in sub_vcfs_directory/*.vcf.gz
+do
+    # Setup file names
+    base_name=$(basename $in_vcf)
+    base_name=${base_name%.vcf.gz}
+    output_name=output_vcfs/${base_name}.vcf
+    collap_name=collap_vcfs/${base_name}.vcf
+    log_name=logs/${base_name}.log
+    # Run the command and send it to the background
+    truvari collapse -i $in_vcf -o $output_name -c $collap_name -f reference.fa &> ${log_name} &
+    # If too many jobs are running, wait for one to finish
+    while [ $( jobs | wc -l ) -ge ${NJOBS} ]
+    do
+        sleep 5
+    done
+done
+```
+
+Obviously the logs and `while` loop are tricks for running on a single machine. If you have access to a cluster, I'm sure you can imagine how to create/submit the commands.
+
+```
+usage: divide [-h] [-b BUFFER] [-m MIN] [--no-compress] [-T THREADS] VCF DIR
+
+Divide a VCF into independent parts
+
+positional arguments:
+  VCF                   VCF to split
+  DIR                   Output directory to save parts
+
+options:
+  -h, --help            show this help message and exit
+  -b BUFFER, --buffer BUFFER
+                        Buffer to make mini-clusters (1000)
+  -m MIN, --min MIN     Minimum number of entries per-vcf (100)
+  --no-compress         Don't attempt to compress/index sub-VCFs
+  -T THREADS, --threads THREADS
+                        Number of threads for compressing/indexing sub-VCFs (1)
+```
\ No newline at end of file
diff --git a/docs/v4.3.1/ga4gh.md b/docs/v4.3.1/ga4gh.md
new file mode 100644
index 00000000..864c24cd
--- /dev/null
+++ b/docs/v4.3.1/ga4gh.md
@@ -0,0 +1,3 @@
+The ga4gh command will consolidate tp-base/fn and tp-comp/fp VCF files into a single truth and query vcf, respectively. Additional tags are added that are compliant with the [ga4gh intermediates](https://github.com/ga4gh/benchmarking-tools/blob/master/doc/ref-impl/intermediate.md).
+
+When consolidating a result which has `truvari refine`, the `truvari ga4gh --with-refine` flag will extract original variants from unharmonized regions and altered variants from harmonized regions inside the `phab_bench/` sub-directory.
\ No newline at end of file
diff --git a/docs/v4.3.1/phab.md b/docs/v4.3.1/phab.md
new file mode 100644
index 00000000..bdcaec9e
--- /dev/null
+++ b/docs/v4.3.1/phab.md
@@ -0,0 +1,157 @@
+Introduction
+------------
+
+Truvari's comparison engine can match variants using a wide range of thresholds. However, some alleles can produce radically different variant representations. We could dramatically lower our thresholds to identify the match, but this would cause variants from unidentical alleles to be falsely matched.
+
+This problem is easiest to conceptualize in the case of 'split' variants: imagine a pipeline calls a single 100bp DEL that can also be represented as two 50bp DELs. To match these variants, we would need to loosen our thresholds to `--pick multi --pctsim 0.50 --pctsize 0.50`. Plus, these thresholds leave no margin for error.
If the variant caller erroneously deleted an extra base to make a 101bp DEL we would have to lower our thresholds even further. These thresholds are already too low because there's plenty of distinct alleles with >= 50% homology. + +So how do we deal with inconsistent representations? In an ideal world, we would simply get rid of them by harmonizing the variants. This is the aim of `truvari phab` + +`truvari phab` is designed to remove variant representation inconsistencies through harmonization. By reconstructing haplotypes from variants, running multiple-sequence alignment of the haplotypes along with the reference, and then recalling variants, we expect to remove discordance between variant representations and simplify the work required to perform variant comparison. + +Requirements +------------ +Since `truvari phab` uses mafft v7.505 via a command-line call, it expects it to be in the environment path. Download mafft and have its executable available in the `$PATH` [mafft](https://mafft.cbrc.jp/alignment/software/) + +Alternatively, you can use the Truvari [Docker container](Development#docker) which already has mafft ready for use. + +Also, you can use wave front aligner (pyWFA) or partial order alignment (pyabpoa). While wfa is the fastest approach, it will independently align haplotypes and therefore may produce less parsimonous aligments. And while poa is more accurate than wfa and faster than mafft, it is less accurate than mafft. + +Example +------- +As an example, we'll use Truvari's test files in `repo_utils/test_files/phab*` which were created from real data over a tandem repeat at GRCh38 chr1:26399065-26401053 and translated to a small test genome with coordinates chr1:1-1988. + +* `phab_base.vcf.gz` - an 86 sample squared-off pVCF +* `phab_comp.vcf.gz` - a single sample's VCF +* `phab_ref.fa` - a subset of the GRCh38 reference + +This dataset is interesting because the `HG002` sample in `phab_base.vcf.gz` uses the same sequencing experiment ([HPRC](https://github.com/human-pangenomics/HPP_Year1_Assemblies)) as the sample `syndip` in `phab_comp.vcf.gz`, but processed with a different pipeline. And as we will see, the pipeline can make all the difference. + +To start, let's use `truvari bench` to see how similar the variant calls are in this region. +```bash +truvari bench --base phab_base.vcf.gz \ + --comp phab_comp.vcf.gz \ + --sizemin 1 --sizefilt 1 \ + --bSample HG002 \ + --cSample syndip \ + --no-ref a \ + --output initial_bench +``` +This will compare all variants greater than 1bp ( `-S 1 -s 1` which includes SNPs) from the `HG002` sample to the `syndip` sample. We're also excluding any uncalled or reference homozygous sites with `--no-ref a`. The report in `initial_bench/summary.txt` shows: +```json +{ + "TP-base": 5, + "TP-comp": 5, + "FP": 2, + "FN": 22, + "precision": 0.7142857142857143, + "recall": 0.18518518518518517, + "f1": 0.2941176470588235, +} +``` + +These variants are pretty poorly matched, especially considering the `HG002` and `syndip` samples are using the same sequencing experiment. We can also inspect the `initial_bench/fn.vcf.gz` and see a lot of these discordant calls are concentrated in a 200bp window. Let's use `truvari phab` to harmonize the variants in this region. +```bash +truvari phab --base phab_base.vcf.gz \ + --comp phab_comp.vcf.gz \ + --bSample HG002 \ + --cSample syndip \ + --reference phab_ref.fa \ + --region chr1:700-900 \ + -o harmonized.vcf.gz +``` + +In our `harmonized.vcf.gz` we can see there are now only 9 variants. 
Let's run `truvari bench` again on the output to see how well the variants match after being harmonized. + +```bash +truvari bench -b harmonized.vcf.gz \ + -c harmonized.vcf.gz \ + -S 1 -s 1 \ + --no-ref a \ + --bSample HG002 \ + --cSample syndip \ + -o harmonized_bench/ +``` +Looking at `harmonized_bench/summary.txt` shows: +```json +{ + "TP-base": 8, + "TP-comp": 8, + "FP": 0, + "FN": 0, + "precision": 1.0, + "recall": 1.0, + "f1": 1.0, +} +``` +Now there is no difference between our two sets of variants in this region. + +For this variant call-set, `truvri phab` makes `truvari bench` overkill since the variants create identical haplotypes. In fact, we can benchmark simply by counting the genotypes. +```bash +$ bcftools query -f "[%GT ]\n" phab_result/output.vcf.gz | sort | uniq -c + 1 0/1 1/0 + 1 1/0 0/1 + 6 1/1 1/1 +``` +(We can ignore the phasing differences (`0/1` vs. `1/0`). These pipelines reported the parental alleles in a different order) + +MSA +--- + +If you read the `truvari phab --help` , you may have noticed that the `--comp` VCF is optional. This is by design so that we can also harmonize the variants inside a single VCF. By performing a multiple-sequence alignment across samples, we can better represent variation across a population. To see this in action, let's run `phab` on all 86 samples in the `repo_utils/test_files/phab_base.vcf.gz` +```bash +truvari phab -b phab_base.vcf.gz \ + -f phab_ref.fa \ + -r chr1:700-900 \ + -o msa_example.vcf.gz +``` + +As a simple check, we can count the number of variants before/after `phab`: +```bash +bcftools view -r chr1:700-900 phab_base.vcf.gz | grep -vc "#" +bcftools view -r chr1:700-900 msa_example.vcf.gz | grep -vc "#" +``` +The `160` original variants given to `phab` became just `60`. + +Better yet, these fewer variants occur on fewer positions: +```bash + +bcftools query -r chr1:700-900 -f "%POS\n" phab_base.vcf.gz | sort | uniq | wc -l +bcftools query -r chr1:700-900 -f "%POS\n" msa_example.vcf.gz | sort | uniq | wc -l +``` +This returns that the variants were over `98` positions but now sit at just `16` + +We can also observe changes in the allele frequency after running `phab`: +```bash +bcftools +fill-tags -r chr1:700-900 phab_base.vcf.gz | bcftools query -f "%AC\n" | sort -n | uniq -c +bcftools +fill-tags -r chr1:700-900 msa_example.vcf.gz | bcftools query -f "%AC\n" | sort -n | uniq -c +``` +The allele-count (AC) shows a 15% reduction in singletons and removal of all variants with an AF > 0.50 which would have suggested the reference holds a minor allele. +```txt + original phab + # AC # AC + 39 1 33 1 + 18 2 4 2 + 3 3 2 3 + 3 4 2 4 + 2 5 1 5 + ... + 3 69 1 35 + 1 89 1 40 + 8 109 1 53 + 1 132 1 56 + 1 150 1 81 +``` + +(TODO: pull the adotto TR region annotations and run this example through `truvari anno trf`. I bet we'll get a nice spectrum of copy-diff of the same motif in the `phab` calls.) + +`--align` +========= +By default, `phab` will make the haplotypes and use an external call `mafft` to perform a multiple sequence alignment between them and the reference to harmonize the variants. While this is the most accurate alignment technique, it isn't fast. If you're willing to sacrifice some accuracy for a huge speed increase, you can use `--align wfa`, which also doesn't require an external tool. Another option is `--align poa` which performs a partial order alignment which is faster than mafft but less accurate and slower than wfa but more accurate. 
However, `poa` appears to be non-deterministic, which is not ideal for some benchmarking purposes.
+
+Limitations
+-----------
+* Creating and aligning haplotypes is impractical for very long sequences and may be practically impossible for entire human chromosomes. Therefore, `truvari phab` is recommended to be run only on sub-regions.
+* By giving the variants new representations, variant counts will likely change.
+* Early testing of `phab` has been on phased variants. While it can run on unphased variants, we can't yet recommend it. If regions contain unphased Hets or overlapping variants, it becomes more difficult to build a consensus sequence. So you can try out unphased variants, but proceed with caution.
+
diff --git a/docs/v4.3.1/refine.md b/docs/v4.3.1/refine.md
new file mode 100644
index 00000000..eb2ef07a
--- /dev/null
+++ b/docs/v4.3.1/refine.md
@@ -0,0 +1,142 @@
+As described in the [[phab|phab]] documentation, a constraint on Truvari `bench` finding matches is that there needs to be some consistency in how the variants are represented. To help automate the process of running Truvari `phab` on a benchmarking result and recomputing benchmarking performance on harmonized variants, we present the tool `refine`.
+
+Quick Start
+===========
+
+Basic
+-----
+After making a `bench` result:
+```bash
+truvari bench -b base.vcf.gz -c comp.vcf.gz -o result/
+```
+Use `refine` on the `result/`:
+```bash
+truvari refine -r subset.bed -f ref.fa result/
+```
+
+The regions spanned by `subset.bed` should be small and focused around the breakpoints of putative FNs/FPs. Haplotypes from these boundaries are fed into a realignment procedure which can take an extremely long time on e.g. entire chromosomes. Also, the genotypes within these regions must be phased.
+
+Whole genome
+------------
+After making a `bench` result:
+```bash
+truvari bench -b base.vcf.gz -c comp.vcf.gz --includebed hc_regions.bed -o result/
+```
+Use `refine` on the `result/`, analyzing only the regions with putative FP/FN that would benefit from harmonization:
+```bash
+truvari refine -R -U -f ref.fa --regions result/candidate.refine.bed result/
+```
+
+Tandem Repeats
+--------------
+For benchmarks such as the [GIAB TR](https://www.biorxiv.org/content/10.1101/2023.10.29.564632v1), a TR caller may analyze a different subset of regions. In order to avoid unnecessarily penalizing the performance with FNs from unanalyzed regions:
+
+```bash
+truvari bench -b base.vcf.gz -c comp.vcf.gz --includebed hc_regions.bed -o result/
+```
+
+Use `refine` on the `result/`, analyzing only the `hc_regions.bed` covered by the TR caller's `tool_regions.bed`:
+```bash
+truvari refine -f ref.fa --regions tool_regions.bed result/
+```
+
+Output
+======
+* `refine.variant_summary.json` - result of re-evaluating calls within the specified regions. Same structure as [[summary.json|bench#summaryjson]]
+* `refine.regions.txt` - Tab-delimited file with per-region variant counts
+* `refine.region_summary.json` - Per-region performance metrics
+* `phab_bench/` - Bench results for the subset of variants that were harmonized
+
+To see an example output, look at [test data](https://github.com/ACEnglish/truvari/tree/develop/answer_key/refine/refine_output_one)
+
+Using `refine.regions.txt`
+==========================
+| Column | Description |
+| ----------------- | --------------------------------------- |
+| chrom | Region's chromosome |
+| start | Region's start |
+| end | Region's end |
+| in_tpbase | Input's true positive base count |
+| in_tp | Input's true positive comparison count |
+| in_fp | Input's false positive count |
+| in_fn | Input's false negative count |
+| refined | Boolean of whether the region was re-evaluated |
+| out_tpbase | Output's true positive base count |
+| out_tp | Output's true positive comparison count |
+| out_fn | Output's false negative count |
+| out_fp | Output's false positive count |
+| state | True/False state of the region |
+
+
+Performance by Regions
+======================
+
+Because `truvari phab` can alter variant counts during harmonization, one may wish to assess the performance on a per-region basis rather than a per-variant basis. In the `refine.regions.txt`, a column `state` will have a TP/FN/FP value as defined by the following rules:
+
+```python
+false_pos = (data['out_fp'] != 0)
+false_neg = (data['out_fn'] != 0)
+any_false = false_pos | false_neg
+
+true_positives = (data['out_tp'] != 0) & (data['out_tpbase'] != 0) & ~any_false
+
+true_negatives = (data[['out_tpbase', 'out_tp', 'out_fn', 'out_fp']] == 0).all(axis=1)
+
+baseP = (data['out_tpbase'] != 0) | (data['out_fn'] != 0)
+compP = (data['out_tp'] != 0) | (data['out_fp'] != 0)
+```
+
+This logic has two edge cases to consider. 1) A region with at least one false-positive and one false-negative will be counted as both a false-positive and a false-negative. 2) Regions within `--refdist` may experience 'variant bleed' where they e.g. have an out_tp, but no other variants because a neighboring region actually contains the corresponding `out_tpbase`. For the first case, we simply count the region twice and set its state in `refine.regions.txt` to "FP,FN". For the second case, we set the state to 'UNK' and ignore it when calculating the region summary. Future versions may figure out exactly how to handle (prevent?) 'UNK' regions.
+
+These by-region state counts are summarized and written to `refine.region_summary.json`. The definitions of the metrics inside this json are:
+| Key | Definition | Formula |
+|--------|----------------------------------------------|---------------------------------|
+| TP | True Positive region count | |
+| TN | True Negative region count | |
+| FP | False Positive region count | |
+| FN | False Negative region count | |
+| base P | Regions with base variant(s) | |
+| base N | Regions without base variant(s) | |
+| comp P | Regions with comparison variant(s) | |
+| comp N | Regions without comparison variant(s) | |
+| PPV | Positive Predictive Value (a.k.a. precision) | TP / comp P |
+| TPR | True Positive Rate (a.k.a. recall) | TP / base P |
+| TNR | True Negative Rate (a.k.a. specificity) | TN / base N |
+| NPV | Negative Predictive Value | TN / comp N |
+| ACC | Accuracy | (TP + TN) / (base P + base N) |
+| BA | Balanced Accuracy | (TPR + TNR) / 2 |
+| F1 | F1 score | 2 * ((PPV * TPR) / (PPV + TPR)) |
+| UND | Regions without an undetermined state | |
+
+Even though PPV is synonymous with precision, we use these abbreviated names when dealing with per-region performance in order to help users differentiate from the by-variant performance reports.
+
+`--align`
+=========
+By default, Truvari will make the haplotypes and use an external call to `mafft` to perform a multiple sequence alignment between them and the reference to harmonize the variants. While this is the most accurate alignment technique, it isn't fast. If you're willing to sacrifice some accuracy for a huge speed increase, you can use `--align wfa`, which also doesn't require an external tool. Another option is `--align poa`, which performs a partial order alignment; it is faster than mafft but less accurate, and slower than wfa but more accurate. However, `poa` appears to be non-deterministic, which is not ideal for some benchmarking purposes.
+
+`--use-original-vcfs`
+=====================
+
+By default, `refine` will use the base/comparison variants from the `bench` result's `tp-base.vcf.gz`, `fn.vcf.gz`, `tp-comp.vcf.gz`, and `fp.vcf.gz` as input for `phab`. However, this contains a filtered subset of the variants originally provided to `bench`, since it removes variants e.g. below `--sizemin` or not `--passonly`.
+
+With the `--use-original-vcfs` parameter, all of the original calls from the input vcfs are fetched. This parameter is useful for recovering matches in situations where variants in one call set are split into two variants which are smaller than the minimum size analyzed by `bench`. For example, imagine a base VCF with a 20bp DEL, a comp VCF with two 10bp DELs, and `bench --sizemin 20` was used. `--use-original-vcfs` will consider the two 10bp comp variants during phab harmonization with the 20bp base DEL.
+
+
+`--regions`
+===========
+
+This parameter specifies which regions to re-evaluate. If this is not provided, the original `bench` result's `--includebed` is used. If both `--regions` and `--includebed` are provided, the `--includebed` is subset to only those regions intersecting `--regions`.
+
+This parameter is helpful for cases when the `--includebed` is not the same set of regions that a caller analyzes. For example, if a TR caller only discovers short tandem repeats (STRs), but a benchmark has TRs of all lengths, it isn't useful to benchmark against the non-STR variants. Therefore, you can run `bench` on the full benchmark's regions (`--includebed`), and automatically subset to only the regions analyzed by the caller with `refine --regions`.
+
+Note that the larger these regions are, the slower MAFFT (used by `phab`) will run. Also, when performing the intersection as described above, there may be edge effects in the reported `refine.variant_summary.json`. For example, if a `--regions` entry partially overlaps an `--includebed` region, you may not be analyzing a subset of calls looked at during the original `bench` run. Therefore, the `*summary.json` should be compared with caution.
+
+`--use-region-coords`
+=====================
+
+When intersecting `--includebed` with `--regions`, use the `--regions` coordinates. By default, `refine` will prefer the `--includebed` coordinates. However, the `--regions` coordinates should be used when using the `candidate.refine.bed` to limit analysis to only the regions with putative FP/FN that would benefit from harmonization, for example when performing whole-genome benchmarking.
+
+`--reference`
+=============
+
+By default, the reference is pulled from the original `bench` result's `params.json`. If a reference wasn't used with `bench`, it must be specified for `refine`, as it's used by `phab` to realign variants.
\ No newline at end of file
diff --git a/docs/v4.3.1/segment.md b/docs/v4.3.1/segment.md
new file mode 100644
index 00000000..953e5822
--- /dev/null
+++ b/docs/v4.3.1/segment.md
@@ -0,0 +1,18 @@
+Segmentation: Normalization of SVs into disjoint genomic regions
+
+For SVs with a span in the genome (currently only DELs), split the overlaps into disjoint regions. This is an experimental tool that explores the possibility of assisting SV association analysis.
+
+This tool adds an INFO field `SEGCNT`, which holds the number of original SVs that overlap the newly reported region. It also adds a FORMAT field `SEG`, which is the 'allele coverage' per-sample. For example, if a sample has two overlapping heterozygous deletions, the shared region will have `SEG=2`. If the two deletions were homozygous, then `SEG=4`.
+
+In the below example, we have three deletions found across three samples.
+
+![](https://github.com/spiralgenetics/truvari/blob/develop/imgs/segment_example.png)
+
+The annotations `segment` adds for the regions would then be:
+| Region | INFO/SEGCNT | S1/SEG | S2/SEG | S3/SEG |
+|--------|-------------|--------|--------|--------|
+| A | 1 | 2 | 0 | 0 |
+| B | 2 | 2 | 1 | 0 |
+| C | 3 | 2 | 2 | 2 |
+| D | 2 | 2 | 1 | 0 |
+| E | 1 | 0 | 1 | 0 |
\ No newline at end of file
diff --git a/docs/v4.3.1/stratify.md b/docs/v4.3.1/stratify.md
new file mode 100644
index 00000000..353dcab5
--- /dev/null
+++ b/docs/v4.3.1/stratify.md
@@ -0,0 +1,58 @@
+`stratify` is a helper utility for counting variants within bed regions, which is essentially the same as running `bedtools intersect -c`. When working with benchmarking results, there are four VCFs to count (tp-base, tp-comp, fn, fp). Instead of running bedtools four times and collating the results, `stratify` can be given a single `bench` result directory to generate the counts.
+
+For example:
+```bash
+$ truvari stratify input.bed bench/
+chrom  start   end     tpbase  tp  fn  fp
+chr20  280300  280900  0       0   0   0
+chr20  100000  200000  1       1   0   0
+chr20  642000  642350  1       1   2   1
+```
+
+The output from this can then be parsed to generate more details:
+
+```python
+import pandas as pd
+import truvari
+
+df = pd.read_csv("stratify.output.txt", sep='\t')
+
+# If the input.bed didn't have a header (so we couldn't use the `--header` parameter), we need to name the columns
+df.columns = ['chrom', 'start', 'end', 'tpbase', 'tp', 'fn', 'fp']
+
+# Create the precision, recall, and f1 for each row
+metrics = df[["tpbase", "tp", "fn", "fp"]].apply((lambda x: truvari.performance_metrics(*x)), axis=1)
+
+# metrics is now a Series of tuples; let's separate them into columns
+metrics = pd.DataFrame(metrics.to_list(), columns=["precision", "recall", "f1"])
+
+# Extend the dataframe's columns
+df = df.join(metrics)
+df.head()
+```
+Which gives the result:
+```
+   chrom   start     end  tpbase  tp  fn  fp  precision  recall        f1
+0  chr20  135221  239308       1   1   0   0       1.00    1.00  1.000000
+1  chr20  260797  465632       3   3   3   1       0.75    0.50  0.600000
+2  chr20  465866  622410       1   1   0   0       1.00    1.00  1.000000
+3  chr20  623134  655257       1   1   3   1       0.50    0.25  0.333333
+4  chr20  708338  732041       1   1   1   0       1.00    0.50  0.666667
+```
+
+```
+usage: stratify [-h] [-o OUT] [--header] [-w] [--debug] BED VCF
+
+Count variants per-region in vcf
+
+positional arguments:
+  BED                   Regions to process
+  VCF                   Truvari bench result directory or a single VCF
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -o OUT, --output OUT  Output bed-like file
+  --header              Input regions have header to preserve in output
+  -w, --within          Only count variants contained completely within region boundaries
+  --debug               Verbose logging
+```
\ No newline at end of file
diff --git a/docs/v4.3.1/vcf2df.md b/docs/v4.3.1/vcf2df.md
new file mode 100644
index 00000000..14d9f06c
--- /dev/null
+++ b/docs/v4.3.1/vcf2df.md
@@ -0,0 +1,81 @@
+We enjoy using [pandas](https://pandas.pydata.org/)/[seaborn](https://seaborn.pydata.org/) for python plotting, so we've made the command `truvari vcf2df`. This will turn a VCF into a pandas DataFrame and save it to a file using joblib. The resulting DataFrame will always have the columns:
+* chrom : variant chromosome
+* start : 0-based start from pysam.VariantRecord.start
+* end : 0-based end from pysam.VariantRecord.stop
+* id : VCF column ID
+* svtype : SVTYPE as determined by `truvari.entry_variant_type`
+* svlen : SVLEN as determined by `truvari.entry_size`
+* szbin : SVLEN's size bin as determined by `truvari.get_sizebin`
+* qual : VCF column QUAL
+* filter : VCF column FILTER
+* is_pass : boolean of whether the filter is empty or PASS
+
+Optionally, `vcf2df` can attempt to pull `INFO` and `FORMAT` fields from the VCF and put each field into the DataFrame as a new column. For FORMAT fields, the VCF header definition's `Number` is considered and multiple columns may be added. For example, the `AD` field, typically holding Allele Depth, has `Number=A`, indicating that there will be one value for each allele. Truvari assumes that all VCFs hold one allele per-line, so there are only 2 alleles described per-line, the reference and alternate allele. Therefore, two columns are added to the DataFrame, `AD_ref` and `AD_alt`, corresponding to the 0th and 1st values from the AD field's list of values. Similarly, for PL (genotype likelihood) with `Number=G`, there are three values, and the columns created are named `PL_ref`, `PL_het`, `PL_hom`.
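+
+As a quick illustration of this column expansion, the following is a minimal sketch of inspecting a DataFrame made with `--info --format`; the file name `example.jl` and the presence of `AD`/`PL` fields in the input VCF are assumptions for illustration, and the expanded column names may carry a sample-name prefix unless `--no-prefix` is used:
+```python
+import joblib
+
+# Load a DataFrame previously written by e.g. `truvari vcf2df --info --format input.vcf.gz example.jl`
+data = joblib.load("example.jl")
+
+# Number=A / Number=G fields are split into per-allele / per-genotype columns
+print([c for c in data.columns if "AD_" in c or "PL_" in c])
+
+# The core columns listed above are always present
+print(data[["chrom", "start", "end", "svtype", "szbin", "is_pass"]].head())
+```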
+
+After you've created your benchmarking results with `truvari bench`, you'll often want to plot different views of your results. `vcf2df --bench-dir` can parse a truvari output directory's multiple VCF files and add a 'state' column:
+* state : The truvari state assigned to the variant
+  * tpbase : Parsed from the tp-base.vcf
+  * tp : Parsed from the tp-comp.vcf
+  * fp : Parsed from the fp.vcf
+  * fn : Parsed from the fn.vcf
+
+The created DataFrame is saved into a joblib file, which can then be plotted as simply as:
+```python
+import joblib
+import seaborn as sb
+import matplotlib.pyplot as plt
+
+data = joblib.load("test.jl")
+p = sb.countplot(data=data[data["state"] == 'tp'], x="szbin", hue="svtype", hue_order=["DEL", "INS"])
+plt.xticks(rotation=45, ha='right')
+p.set(title="True Positives by svtype and szbin")
+```
+![](https://github.com/spiralgenetics/truvari/blob/develop/imgs/truv2df_example.png)
+
+This enables concatenation of Truvari results across multiple benchmarking experiments for advanced comparison. For example, imagine there are multiple parameters used for SV discovery over multiple samples. After running `truvari bench` on each of the results with the output directories named `params/sample/` and each converted to DataFrames with `truvari vcf2df`, we can expand/concatenate the saved joblib DataFrames with:
+
+```python
+import glob
+import joblib
+import pandas as pd
+
+files = glob.glob("*/*/data.jl")
+dfs = []
+for f in files:
+    params, sample, frame = f.split('/')
+    d = joblib.load(f)
+    d["params"] = params
+    d["sample"] = sample
+    dfs.append(d)
+df = pd.concat(dfs)
+joblib.dump(df, "results.jl")
+```
+
+To facilitate range queries, PyRanges is helpful. `vcf2df` results can be parsed quickly by pyranges with the command:
+```python
+import pyranges
+
+result = pyranges.PyRanges(df.rename(columns={"chrom": "Chromosome", "start": "Start", "end": "End"}))
+```
+
+```
+usage: vcf2df [-h] [-b] [-i] [-f] [-s SAMPLE] [-n] [-S] [-c LVL] [--debug] VCF JL
+
+Takes a vcf and creates a data frame. Can parse a bench output directory
+
+positional arguments:
+  VCF                   VCF to parse
+  JL                    Output joblib to save
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -b, --bench-dir       Input is a truvari bench directory
+  -i, --info            Attempt to put the INFO fields into the dataframe
+  -f, --format          Attempt to put the FORMAT fileds into the dataframe
+  -s SAMPLE, --sample SAMPLE
+                        SAMPLE name to parse when building columns for --format
+  -n, --no-prefix       Don't prepend sample name to format columns
+  -S, --skip-compression
+                        Skip the attempt to optimize the dataframe's size
+  -c LVL, --compress LVL
+                        Compression level for joblib 0-9 (3)
+  --debug               Verbose logging
+```
\ No newline at end of file
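+
+As a further sketch of what the 'state' column enables, per-size-bin recall can be derived directly from a `--bench-dir` DataFrame; the file name `bench.jl` is illustrative:
+```python
+import joblib
+
+# DataFrame previously written by e.g. `truvari vcf2df -b bench_result/ bench.jl`
+data = joblib.load("bench.jl")
+
+# Tally the truvari states per size bin
+counts = data.groupby(["szbin", "state"]).size().unstack(fill_value=0)
+
+# Recall per bin, using recall = TP-base / (TP-base + FN)
+counts["recall"] = counts.get("tpbase", 0) / (counts.get("tpbase", 0) + counts.get("fn", 0))
+print(counts)
+```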