From 3f05bf5e24971c8c4e536fdebcb02c90aac3ba35 Mon Sep 17 00:00:00 2001 From: Adam English Date: Wed, 31 Jul 2024 13:00:35 -0400 Subject: [PATCH 01/16] readme version --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a2a1ae41..bdbc7227 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ [![pylint](imgs/pylint.svg)](https://github.com/acenglish/truvari/actions/workflows/pylint.yml) [![FuncTests](https://github.com/acenglish/truvari/actions/workflows/func_tests.yml/badge.svg?branch=develop&event=push)](https://github.com/acenglish/truvari/actions/workflows/func_tests.yml) [![coverage](imgs/coverage.svg)](https://github.com/acenglish/truvari/actions/workflows/func_tests.yml) -[![develop](https://img.shields.io/github/commits-since/acenglish/truvari/v4.2.2)](https://github.com/ACEnglish/truvari/compare/v4.2.2...develop) +[![develop](https://img.shields.io/github/commits-since/acenglish/truvari/v4.3.0)](https://github.com/ACEnglish/truvari/compare/v4.3.0...develop) [![Downloads](https://static.pepy.tech/badge/truvari)](https://pepy.tech/project/truvari) ![Logo](https://raw.githubusercontent.com/ACEnglish/truvari/develop/imgs/BoxScale1_DarkBG.png) From 3150ee654ab69f9ca3e5942db6e26edb4eb4cc10 Mon Sep 17 00:00:00 2001 From: Adam English Date: Thu, 1 Aug 2024 19:49:31 -0400 Subject: [PATCH 02/16] correctly filtering star alleles --- .../bench_starallele/candidate.refine.bed | 1 + .../bench/bench_starallele/fn.vcf.gz | Bin 0 -> 2192 bytes .../bench/bench_starallele/fn.vcf.gz.tbi | Bin 0 -> 128 bytes .../bench/bench_starallele/fp.vcf.gz | Bin 0 -> 1814 bytes .../bench/bench_starallele/fp.vcf.gz.tbi | Bin 0 -> 129 bytes .../answer_key/bench/bench_starallele/log.txt | 50 ++++++++++++++++++ .../bench/bench_starallele/params.json | 1 + .../bench/bench_starallele/summary.json | 17 ++++++ .../bench/bench_starallele/tp-base.vcf.gz | Bin 0 -> 1608 bytes .../bench/bench_starallele/tp-base.vcf.gz.tbi | Bin 0 -> 72 bytes .../bench/bench_starallele/tp-comp.vcf.gz | Bin 0 -> 1606 bytes .../bench/bench_starallele/tp-comp.vcf.gz.tbi | Bin 0 -> 72 bytes repo_utils/sub_tests/bench.sh | 7 +++ .../test_files/variants/star.base.vcf.gz | Bin 0 -> 1812 bytes .../test_files/variants/star.base.vcf.gz.tbi | Bin 0 -> 129 bytes .../test_files/variants/star.comp.vcf.gz | Bin 0 -> 1543 bytes .../test_files/variants/star.comp.vcf.gz.tbi | Bin 0 -> 128 bytes truvari/matching.py | 2 +- 18 files changed, 77 insertions(+), 1 deletion(-) create mode 100644 repo_utils/answer_key/bench/bench_starallele/candidate.refine.bed create mode 100644 repo_utils/answer_key/bench/bench_starallele/fn.vcf.gz create mode 100644 repo_utils/answer_key/bench/bench_starallele/fn.vcf.gz.tbi create mode 100644 repo_utils/answer_key/bench/bench_starallele/fp.vcf.gz create mode 100644 repo_utils/answer_key/bench/bench_starallele/fp.vcf.gz.tbi create mode 100644 repo_utils/answer_key/bench/bench_starallele/log.txt create mode 100644 repo_utils/answer_key/bench/bench_starallele/params.json create mode 100644 repo_utils/answer_key/bench/bench_starallele/summary.json create mode 100644 repo_utils/answer_key/bench/bench_starallele/tp-base.vcf.gz create mode 100644 repo_utils/answer_key/bench/bench_starallele/tp-base.vcf.gz.tbi create mode 100644 repo_utils/answer_key/bench/bench_starallele/tp-comp.vcf.gz create mode 100644 repo_utils/answer_key/bench/bench_starallele/tp-comp.vcf.gz.tbi create mode 100644 repo_utils/test_files/variants/star.base.vcf.gz 
create mode 100644 repo_utils/test_files/variants/star.base.vcf.gz.tbi create mode 100644 repo_utils/test_files/variants/star.comp.vcf.gz create mode 100644 repo_utils/test_files/variants/star.comp.vcf.gz.tbi diff --git a/repo_utils/answer_key/bench/bench_starallele/candidate.refine.bed b/repo_utils/answer_key/bench/bench_starallele/candidate.refine.bed new file mode 100644 index 00000000..015059aa --- /dev/null +++ b/repo_utils/answer_key/bench/bench_starallele/candidate.refine.bed @@ -0,0 +1 @@ +chr1 6004926 6005445 \ No newline at end of file diff --git a/repo_utils/answer_key/bench/bench_starallele/fn.vcf.gz b/repo_utils/answer_key/bench/bench_starallele/fn.vcf.gz new file mode 100644 index 0000000000000000000000000000000000000000..5f9c77cd978dcd0f977fccc4ba9f48230705ff7f GIT binary patch literal 2192 zcmV;B2ygcviwFb&00000{{{d;LjnME2+desbKpj99Qi$47uY{E3j z`Mj_R7a2K=Su)QTF4b+vFm+1X7+PZ}aWsQEZH%_X5XYvDVHgUGUSlvNI)K_3qr%w4 z&}^#mV^$ao+t_d{1;(l{hNJ0#(b^ch!srIkY*RDa7^lK$w&v&-u@pJPHAu4@OPNGM zYLJ0#(^P_J!xOa(4J)=r>Rc0DGj)yH3Q(PFY++rqhze9^8xzMN#8xd=rQ5+8BtchH zja2EjHELM8P866b-8R-N2wLT*O1EK~#4?GZBDBi3slyL36&0aXvUStKj$pv&h6ch7M_CzE*B5N-dofkjuNtjprATv43{EUOutRC1?KYin-G~xt zSH~<#b9ne^&)6GnjVGn?Rg$wg%UYezVjiN1t@4FHAQLEGFqE+w3X%u~aSktlFoAed zJW8{OWhmw%->5=ENxoe8WY_-}1|pKaz0T8h)+%OLwE7<@OL3>TXH_+^YS?fTUJjbQvtdbk_$)f@fxDDvkZ>$2}*ypC9O&Vg^) zl0`hoS;M$8?LL5QY4ivlWNy3QruQ7qdh@pd8iAebFV^TbeMZPZ*z6O%uVF+n)ei3` zLhO4G=5j~>$WJIxcP4X^uCov}0d2uD$Vs)I-YkGco{ukbb$fYN)YhL|Zu=6KeR&xN zdo6vD4)@LZq(kfQEQml2=M_O#+MPJ?`}*{^-%Pf(byUXipY z_kw0wD*bOhwtSZ>pP)pOGQ?$?^I6MF`Wzn*@SZ`MlN_ZpBx4QJXNW06Pp=U6B?uBI zU68BxFY+MU4^gRiQwc%Q;J;Q(Rww7JSU{ie)@K$@hJMM1sFD z>AI9?rB5g>XX1FOOMrA%T7b;+c>=M%@O)7$0OBTCBGz&Q%%PPPSr5~Uof5t$eJ^EY z3ll|#^9)FNJnD-Qh?Xo6YlS1hm&ct<`I5&$#`D)L(FFT*U9>`;qB%=g2HhN$L`Jh< z{k;OT1y?PkZ>XF82wIwFGubsgv5I;AxP)-{MqhuV=-yR)yRR92qb;J-2mkB+#R-yB z0(WULeYn23JxMDhaMyNn^LR4pK@M*M@eMt3CGEl0N0ha21y{Im0Z*094-Ugv%!2(6 zIOK41s)R9I^3K7O$FI9iuZNT44RxKqfLbh9G5ao?syw66rl}iyuNvo>vzNxCVmBrg zr$_dY+ZL1WQ8$xA5b2v3>o!me{=$HbQLlF{f|nT=3nhCS6l~MYTh9?9h&X*s?;AwY zO+OsO9V$8;rVP33ZA&#is=Zr%L*II@Wh+}xPvsO~VF$d-X)&#f9Wg5ivRlFXFrTtN zYt1|16o$#EQpAS;Czii3CTCwRkPIULL6^4#IjDqjDj3Yu??eCx*M~!phuwc?`3Vc} z;3STN)#)=x#zMZzQV=VYmbPeBk0>HPM)cikzkm>s!tr~FSrrx3}Xq@s4DqB@$@>2Ls1pt?1>)>+bwR_8SH3<+tsKwX!J0A zVd24s{9w5~HeA@mhO)2&d~oFjy6Ibs79G^NNHn&T)rq7$Rd5zTZO95MGPf_L~~87hM4dV?UeK@j{G>Ek9r z^s+*oqhhl8{Ej${TR6p0_fPBYHN;ZBypqZds(LM|RvI{bm#Tmp{-34%5KewyM8AOz zPo%kF+6|g(#BT7MHdt<8y2o;|{N@?me&FSoBAhROcfzq}65~#WnR{ zA|op9X@;X7BB_3Wd&Fpw&mw!|vyUFX`kuN<^8E){9qg|~AOHX#iwFb&00000{{{d; SLjnLB00RI30000000017I4g4i literal 0 HcmV?d00001 diff --git a/repo_utils/answer_key/bench/bench_starallele/fn.vcf.gz.tbi b/repo_utils/answer_key/bench/bench_starallele/fn.vcf.gz.tbi new file mode 100644 index 0000000000000000000000000000000000000000..9b6c2bc815fa5a0f178daffb410a3fcdb8208a8b GIT binary patch literal 128 zcmb2|=3rp}f&Xj_PR>jW$qd|upHfm%5)u-ak|cPUP6f;o?U-!b#dA<7Zn6Sbh|7Xa zJgPDVt|t<98yqnxDgD{Qu`#e}iriB7m&jD6`(Xx#U;5jvK5{cK$fMaI&A<${5<~z1 D^SC7s literal 0 HcmV?d00001 diff --git a/repo_utils/answer_key/bench/bench_starallele/fp.vcf.gz b/repo_utils/answer_key/bench/bench_starallele/fp.vcf.gz new file mode 100644 index 0000000000000000000000000000000000000000..8e13e350313f891ebec8fd869d3a4baeab192bdb GIT binary patch literal 1814 zcmV+x2kH19iwFb&00000{{{d;LjnN#2F+MmZ`(K!es+Ha(LC(K#*Rqc)@lK-uLQ{9 z+G@}|1&T!3L}kepY3IQH_#H}?d?~SuT`X!K^`URR84hPY4tsVMv6x0_Hur^l-yN*9 z7Ws5`Hn_U>`eXNxtDZaROeW_&%7ct81WOb5b0>}wSP7bOwD38n;pacWCP)*(rnyZp z%kX(jlc|`wM6+z&&nS1nS< 
zV^kPZu@v1lYcOVo(QQ=&jCzQ%DvYLM)iPA$5Mx&u)lzNE#AZznl?_NWZL>Cs7;iwj zVi`s)h-@(tQ&*LmrQuDkv8EcDN~{{tCfSOqXsU_pK$~nU*tRjY>fBZ7wiOkUpw;Xe zuF`F(L^m}J*I=r2TZ(Ez&~-djx^>IIrh#jAgjCr!GYRKUZ}d&Zq!X*@2BuM$D0G&|^Y8nXa}bRlLOflMGVqbQ>h@{xuV zKT5NZW+-M{tW{x4NiO>y-sQifAR_78>oi?f3=zw?Kr??4Zw0d`xgGgx=NY+b%4en!oC(@$Pe^KJcw_4B6t1e1GI(&=$vHeDfPBgJTAt*t7EC`A2uJt3km zejwzE{uNKaQ8SV`Ntamwi-5ME8RVo|Pb>1l$?CY1tJ}d{zPH|ZQ1+!#^yMJ-_gcE2 zi0l>3nBd4q0>Ytl2x38f0dG@)`2I&f73enRmggpR30;Tgs-TJxk zv;7cj^==}8XpwRzze`6UH;9i3JZ{VHccSbXC`L30}wY(6TXxqpnz7c$$F4x^px;Bl6x+9HaDrsu*je& zk4N1vfM`yAzN{hQY<`?%!saaYGbUckLgVzuGH(TuqA5*i2HhMMg^VJ9`BDK61-D*E z-%vCC6|_`jk?filU&Ks2&LJGO*4JMtx_1@dYO8u%KNQi)z4tKcpCCyEaJNh*_t*W~ zle9ttcWuWvk0%#B$l;C8zoREsTlPTpGs>D%!4)o;!PDC22f`rck-y&o9}%ukl^}*o z-UXPl_;r``x;sAJP}k`*sKtB{(-+xP#TkXxO}(}Ds&Q6~K5tDb*4CtAx9~o)Y%%#6 zH8VK`;kKbDngyi=?=WC1sMWgQ{&U3mthRc$O7O7UI?E|S7-74uYHc0Ma??Hz;tu5< zE*D+)Q~9};S5w=%Rq}2GD_c%ZttP-i?5xOUKBw~?8Ouv!ckf1ln9v^^wL9Vbg_)_+ z!Y%$MDxN8oQ!nF4hT(&h%lm;GQGz(-6ej6+B7h_7r$dm(+<)VKOoKZ(gJXYj`t*^} zfGx5VWD2E)Em~{_6L~a>XV`mCG5#BDJ6woZB9(7XX5^&NgKLrx$vkC;OKj=n$>%ip z@*9U<3X3sKRv9FKLw~Iw<)hE8YnVj-2gmSeZ^f`}y^{vf-e&lO%S<`d-(@rGXh*7b&+Vg3)iiZ`zV zZ}MrwYp9B}84sNG6~qMv*PZIl;128Thim&SKlQj0*69vCr?^76*eR!dSDpYi$$x-S zlwWntGM#F!c1_!<24Yul<&SdtbTQFYXES%g8DywVF)O&e{m4l1lfs3#*HsEZ zE9dan7<%nauc7>n+wlFjWDGc0&pHfm%5)u-ak|cPUP6f;o?U-!b#dA<7Zn6T`6PE>> zc=q3rSRgg=U|C{^#I9X6ZY?)7Y`yv~E&hT`ZCRYiz>p$#yyhb}1A{!8CDIJcU^_ts E0Mv6R00000 literal 0 HcmV?d00001 diff --git a/repo_utils/answer_key/bench/bench_starallele/log.txt b/repo_utils/answer_key/bench/bench_starallele/log.txt new file mode 100644 index 00000000..516d6b93 --- /dev/null +++ b/repo_utils/answer_key/bench/bench_starallele/log.txt @@ -0,0 +1,50 @@ +2024-08-01 19:48:18,699 [INFO] Truvari v4.3.0 +2024-08-01 19:48:18,699 [INFO] Command /Users/english/code/truvari/truvari/__main__.py bench -b repo_utils/test_files/variants/star.base.vcf.gz -c repo_utils/test_files/variants/star.comp.vcf.gz -s 0 -o test_results/bench_starallele/ +2024-08-01 19:48:18,700 [INFO] Params: +{ + "base": "/Users/english/code/truvari/repo_utils/test_files/variants/star.base.vcf.gz", + "comp": "/Users/english/code/truvari/repo_utils/test_files/variants/star.comp.vcf.gz", + "output": "test_results/bench_starallele/", + "includebed": null, + "extend": 0, + "debug": false, + "reference": null, + "refdist": 500, + "pctseq": 0.7, + "minhaplen": 50, + "pctsize": 0.7, + "pctovl": 0.0, + "typeignore": false, + "chunksize": 1000, + "bSample": "HG002", + "cSample": "HG002", + "dup_to_ins": false, + "sizemin": 0, + "sizefilt": 0, + "sizemax": 50000, + "passonly": false, + "no_ref": false, + "pick": "single", + "check_monref": true, + "check_multi": true +} +2024-08-01 19:48:18,705 [INFO] Zipped 7 variants Counter({'base': 6, 'comp': 1}) +2024-08-01 19:48:18,705 [INFO] 1 chunks of 7 variants Counter({'base': 5, '__filtered': 1, 'comp': 1}) +2024-08-01 19:48:18,721 [INFO] Stats: { + "TP-base": 0, + "TP-comp": 0, + "FP": 1, + "FN": 5, + "precision": 0.0, + "recall": 0.0, + "f1": null, + "base cnt": 5, + "comp cnt": 1, + "TP-comp_TP-gt": 0, + "TP-comp_FP-gt": 0, + "TP-base_TP-gt": 0, + "TP-base_FP-gt": 0, + "gt_concordance": 0, + "gt_matrix": {} +} +2024-08-01 19:48:18,721 [INFO] Finished bench diff --git a/repo_utils/answer_key/bench/bench_starallele/params.json b/repo_utils/answer_key/bench/bench_starallele/params.json new file mode 100644 index 00000000..5b9a1694 --- /dev/null +++ b/repo_utils/answer_key/bench/bench_starallele/params.json @@ -0,0 +1 @@ +{"base": 
"/Users/english/code/truvari/repo_utils/test_files/variants/star.base.vcf.gz", "comp": "/Users/english/code/truvari/repo_utils/test_files/variants/star.comp.vcf.gz", "output": "test_results/bench_starallele/", "includebed": null, "extend": 0, "debug": false, "reference": null, "refdist": 500, "pctseq": 0.7, "minhaplen": 50, "pctsize": 0.7, "pctovl": 0.0, "typeignore": false, "chunksize": 1000, "bSample": "HG002", "cSample": "HG002", "dup_to_ins": false, "sizemin": 0, "sizefilt": 0, "sizemax": 50000, "passonly": false, "no_ref": false, "pick": "single", "check_monref": true, "check_multi": true} \ No newline at end of file diff --git a/repo_utils/answer_key/bench/bench_starallele/summary.json b/repo_utils/answer_key/bench/bench_starallele/summary.json new file mode 100644 index 00000000..aab0d861 --- /dev/null +++ b/repo_utils/answer_key/bench/bench_starallele/summary.json @@ -0,0 +1,17 @@ +{ + "TP-base": 0, + "TP-comp": 0, + "FP": 1, + "FN": 5, + "precision": 0.0, + "recall": 0.0, + "f1": null, + "base cnt": 5, + "comp cnt": 1, + "TP-comp_TP-gt": 0, + "TP-comp_FP-gt": 0, + "TP-base_TP-gt": 0, + "TP-base_FP-gt": 0, + "gt_concordance": 0, + "gt_matrix": {} +} \ No newline at end of file diff --git a/repo_utils/answer_key/bench/bench_starallele/tp-base.vcf.gz b/repo_utils/answer_key/bench/bench_starallele/tp-base.vcf.gz new file mode 100644 index 0000000000000000000000000000000000000000..dd02c744f5fd29a66e328d8c42e129dd3461b033 GIT binary patch literal 1608 zcmV-O2DkYiiwFb&00000{{{d;LjnLR2F+O8ZsJG~eP+I*HF?;F4aDuc*<~e`OM;XO z157k|ij3XHt+Cx6cN;>!eyiP+!!T+v`WB<8SO?HC#;7nhHZ)tU z<1s6YrPzw$STz`{!WfRG14cW>*cC=Mux6W@d5m!?jAm<&Zegn?hsqYDS&mhkM2xo} zL$OV>7NqVlsg|KBHA}$nC}rQ23C3xclWsnTuOCbmpmvm-*AgdBlPAQ2LjkpQ`gkK9PWFF*)K zG$~%O%qJO&C>L+4u%je5gBjlEzabzZ>~+0hdBqUWj0+UHX|xmg9*KIIgcHc6|31rg zcYF}a*Tj=c@y`qc2-J@WPKD4PjlbC^O8{r=3w(#@LW#fx&E_*WF_EsKdy zT|st?E7Rs9*p^1mz#wzm1e(^;3tEfUKTsd+I?an z|MvBWyYzg95>3hwVD#KM-*=>W$B(wqpy0wiPg*b-v$(R>YI4?&PX>0D8_elFbX zFvMEDn@S)`8K?5QbmUXM@l&q!h@2$;In)Etf}bN!Nk-XO+~Wd?IDI9Ox|HZzPS{*Z z_~}%a07+I_fXvfH0LVf}sUTNCw^PmxYW1H-D)B$AVigq_3#S zeg!QRSs=Tn#nXt2=NQ7FZ~FQxMfa}a+ilHg8^ErF-{w%GK zztgYUi5PEUbW*IW4Ajz9(iyf$Z<#*b`IoW2<>*oWd|URf@R7 zKPO^Eh@5>XM=}f-1YO<|Lzlh1Dh8l^*D#O#502s0!HQwq`e%)!{p|<}SDMXiQ}~tJDt(pt-FI+3 zyOW!I&YJ!PuB+ZQ=Ed7xnWJuRX9C{mFW5)bSk`)9MQ1s5`%%n0`9aj@XcbZ~L@RJr zRwp?~*quC;Pu>Z71s&Yb-3Wa7c`>WVYhZDxZw>~@+`E}P+%?7z)5fUZm<)yum~a|j zzIJaLjW+6>%H;NEg3BowaT+WIA`7f~X(nQcjVdI4i zUq&QZh_IvTwquyOsvcw1Ek?zTW~k0FM%!SpW2=r~7&RDui&0dp1LzoIR2Ul@nyuFH zm=(rSY{hV_8jMw83`f%eqa9=H3ZolXvrWxB#yAy5vo%MzuvL>oWed_Q$Er;t##@k~ z*rr(vQg@hC%g~gXqv36?v96iArrI^2ZL$?h(KQR#fwtLJu;XBC*SV|G?I;=~L9eMA zuF`F5s$uCmuEA94wiV5SpzC<5bQ`vbEfd#NM6I%I>hK4fH5E~-Wb3A-IF4OYeYMIp zlmpwkS(Bh|+b%K{U4aQX#ynLtma1AhE+E5)$z69=K(1$(_jx=gtb;FS>x^`!g3)BL zKdaXyNdqVMZ3;|_apwP{toxt}<)YBxKKra3FS+2X| zgHXOEo;-?wW*9)AeoSyGg#Kv!ZJ(h}0ZM%&`$MuMY0?~bCpCso?;1WG8qU2yq-n&T z6VUiakZfszTUh@+6g;ByCVaLJalJH-p}-ya88zdko2*guA&0{AJQDoMXKo;#=OppM z*k#{2fq%Q6^g_*kzPVNuwK-E~U$CMqaZ73XG<#LQKXb8kS(ChYQSKA}ibB7!nE2Ec zWXHHNZ9al+Y4i*XGPg~jX)V2=wRrsl^}(()3v+azE)jB&Vz#l~))b^Fs`l}Dd_-Pc zPskJfD;|%dW*~Et=FEdbz*rClIjPRm3S4lqJndw3KYS=^JDUtQbE%YbIgH$cksc(1 ztx-q?M=lZ&4v8U%jJN`REC5o}eV5XwL2#fI!zgvd!LZ-X&}@RdG+QHSQJw__ES3Ck zzaDXyp3hLCNf~0C3K|@-q|4}ZfMWt_P6SE=BxCi`CB)>T`5M9=f*^s?xuWj;T)6Bo z#9F{yg zB@o5L<#`Per}1f$DUE65GAh=aM&sm1UW|fB(Sjs|!8H42Bcs60Un;<{;MNQ2D{7{{ zf|d#v$f;@ZETZB$hH&VczWz$ly{q_kTQl0mv4~C|XHVn78In{2_uFLpcssa1ODiOB z-*$5Me0I}=9NxM7JDStlwg;-8QPzSAo^W0W-_{O45PA_0+`|d@i16)J@gjKSU4bc$ 
z*88Nly~*i;x=mN07I7Al7dcer9fjV8dgtub;4CRw?QAOc&ZgqD@F8+DV)8R;GPwld zwy7w(4Xp(~VZl~Vt98ZQRX}-I+r2w2c-(H?e1Q-~*lBBe+rYBj^pAshK*fa1LvQA3 z^SPE+*V=}?;oS-r%ctjd6JQ~BT4uA@)A^o^6|J#fyRj#xEvaHfqncsZ}*RwlW zUdbGFdpo=DK7YYJsur@=`}#S{LF|rV-pLQ5K1ZvNdLdeYtFk-E zLBj6jeSGpY&@1TRZSF?k%g>7~MLqzF>wI%1Nao(njW+6>%q;?O2A<#28zQfqm09{3gcSV6g8QzHQqIjJ3t+2DSjy#Mm{)!?x)eDjuiC zxVop?zN^5vHOBT$3ovFAz?B% zLB!Ptkp&)*9M@Jr5RrX}QhdhxK#UI^>ri|Kknx)Z@@KHuNfYmyfN zK0G@J?)5jp<0|+z7i3OYW7A1WLllvPNH_wUKq4WCkr)Mego0GS10aMWnwK9HEFui0 zl#35n*m07Z!4&WF-w-g7;(a|YmNi3685bxC7U@pndnCqj5>7zN=+`XQz0pA`-||qF z5}lX^5E!2m97~~pJNmxQ(5C>U5t8#ES@I&+M!m5j@cEa4&xe68VJwOw<+B_-{s}Bw zM&LHK{z?Q->5B%hy+hnpk!8@}u6&KOObhZg(jJ%4c)m;pzl>NAi`fgw!z2sX4^H5J z*W;_i^kz5Lx^8GwWBL{A`UrK`3NJ|BEN& zNQQtoW-s+>FNbMxu+oEEur*4k;3z-> z(jgfnu^@qfr#V2%zVA{-G>#9vVwe_zI9T?(DVmN^SS;2^MpU(+m=%)$<86bxI{5@A z8doW1g`jbRrC6oM6C4qcITxsik<2wLR*+MKUe=KI5DW>NE)c5vxd_-{ib}tmNFZ7i zoXYp|posF#IaQ?%a*{`iq5~lDY?-D|3y51+fyBZ%~ zsJr47+#*|~oj9_!zddoRN7-7F-S{)n9akD;8;t`b_4z~gt(LXjQ;0@hk*E#M7*;PMez;~d^B0&t25 z>KC~2P2XR$`O}l8J&EHiStO4uU0|yPI}*3x)-W&y3KOUAFPh#pFMuupo$F4)<$q)}24c~<<6+p2I@BCms}zq9molyLu73Hr&ll?Vzd$lfkh#vNVH zHgGM6)o%%gS{KF(3eVI$0S!-VnycybgvXgbJ|lMFa6aFC2{Q-!Nrq9)Pl81?L-CrO zB<0`i#G7Yvv^k4a%)_n*8wiOtOnL-0uHv_c=68%ul&b#K(Mb zsiw-DRxjWDGc0&pHfm%5)u-ak|cPUP6f;o?U-!b#dA<7ZgK+Gj!4Bz z_BR|(zg0@fT!j%VI?ahIh+P-uuYSz#xxii8KQ<*iH}u E02c!#@Bjb+ literal 0 HcmV?d00001 diff --git a/repo_utils/test_files/variants/star.comp.vcf.gz b/repo_utils/test_files/variants/star.comp.vcf.gz new file mode 100644 index 0000000000000000000000000000000000000000..529d2174d4e88ecd068992bd45bc83952dadc6f6 GIT binary patch literal 1543 zcmV+i2Kf0OiwFb&00000{{{d;LjnK^1;Wshn27`Ui@NL^xV5}`hH?ReuCdRHY9=1)-Q1Li5 z#??LD_FVH4lxM2xo}TlXAC z2{Lw=4A(YwMQM1eHMUI0G7V1w+KR2ax@Edp1=^~uW8cTvQyo{!?dv9pU@5wWYq>qs zuwBc-3QR4xr<*P$t>UTWwmk>C4pwx;sMU5X_`!~%BStN@<+!@{*UN zpzqr$a&$|F4QXPYYZ})uTnm?w;p6zeH!UI8(~E~?_Ci<}Urg5v(wzuK^ZEX+UX#2K z@aEY;aIe1!9#_G)xgc}G8k-5VXG@+}W# zDba~(0D#s!cl)h;2>>c8+iY$W$cjYtEGA+p0NPApD$;&$jpnq&WwTLd>jUO{ERsqwGa*P+@I z#8yEwV368sK+|4@aeMy$H|j%NcbY8GL$N~0N4nF&R>#zlp&RDs^+brg1)-1?{V$%7 zBQ2Ji6iXID5wI47fhKi5?Kpr#R>u#yeHcELy`7GSo4wSly&R^&!AcKu!PY3Df};Ql zNQY#Q#DW9@{>%YV_I;N!qH%oS6~nX$#KE%PP0@6W!eX&TGNP&l#jKG0A1@o+)yXF~ z(YQ)6D+G-jEX684p5Tap%(*~CjAX81v4Wf;^s!zbCmf+1WBgp|D^ul*h9kR6vxGfG-t9 zoMy*KCN!gIz^GVn291;dmgOpl0?kQI7;JM?4Kj*@A05A;HnV-HlnBsdGH!jo+zhxX9Sz$)?3aI!U+2v)9Tn*j+^y)5|60da9Q+bpEl1*Ud!y* z-Usj1&G_-YHF})1Zu_nAVAui&YJL6IyK7Zfz?KxhRo&!UWf5O~0wWJ&Uc?Uo03VA8 z1ONa4009360763o0N?;^PQhx!Fbq9=-_jb}iIXNl2a~40gfUuC=&>X&4U9QBp)lyj z*Gf0&p#tgYNqUmHYmh7xLaQugoFQ5=gBBO4(HgsiXiq>Bfw~M?B1p5}Rnhz5a)Gtv znn-Y?JI`R>>FuBX<<9%L|1ncrq^17$4U&>+K8G3N*J+`ag8S)->U5J69bFiF4|zeP z^W)$q(0pl4oevJA(#cad2S5ITi>j^nAoxb*Dk+%7(rlTy t%Tlc5#{=pzxAHmx001A02m}BC000301^_}s0stET0{{R300000001F&jW$qd|upHfm%5)u-ak|cPUP6f;o?U-!b#dA<7ZgK+GjtIp| z>kpbJGi%+@;}LcaoqoPU$IPq$(&8@~wn&u8+YAl{hOHZJGkxS{V30?%Lz;mZY$b>Q E02n4El>h($ literal 0 HcmV?d00001 diff --git a/truvari/matching.py b/truvari/matching.py index 6d55f188..60b85958 100644 --- a/truvari/matching.py +++ b/truvari/matching.py @@ -155,7 +155,7 @@ def filter_call(self, entry, base=False): Returns True if the call should be filtered Base has different filtering requirements, so let the method know """ - if self.params.check_monref and entry.alts in (None, '*'): # ignore monomorphic reference + if self.params.check_monref and entry.alts and entry.alts[0] in (None, '*'): # ignore monomorphic reference return True if self.params.check_multi and len(entry.alts) > 1: From 
b5b78b1821039e723ae83e5b9fe88cde9eb5b61d Mon Sep 17 00:00:00 2001 From: Adam English Date: Thu, 1 Aug 2024 20:22:30 -0400 Subject: [PATCH 03/16] func test update refine inputs had star alleles --- .../refine_output_one/refine.regions.txt | 34 +++++++++---------- .../refine.variant_summary.json | 14 ++++---- .../refine.region_summary.json | 16 ++++----- .../refine_output_three/refine.regions.txt | 24 ++++--------- .../refine.variant_summary.json | 8 ++--- .../refine_output_two/refine.regions.txt | 20 +++++------ 6 files changed, 53 insertions(+), 63 deletions(-) diff --git a/repo_utils/answer_key/refine/refine_output_one/refine.regions.txt b/repo_utils/answer_key/refine/refine_output_one/refine.regions.txt index 41bfa0b8..f617c30b 100644 --- a/repo_utils/answer_key/refine/refine_output_one/refine.regions.txt +++ b/repo_utils/answer_key/refine/refine_output_one/refine.regions.txt @@ -6,7 +6,7 @@ chr20 709729 709975 2 2 0 0 False 2 2 0 0 TP chr20 1149807 1150220 2 2 0 0 False 2 2 0 0 TP chr20 1257933 1260528 2 2 0 0 False 2 2 0 0 TP chr20 1301599 1302667 2 2 0 0 False 2 2 0 0 TP -chr20 2240925 2241726 1 1 5 1 True 5 5 0 0 TP +chr20 2240925 2241726 1 1 4 1 True 5 5 0 0 TP chr20 2442966 2443943 2 2 0 0 False 2 2 0 0 TP chr20 2842574 2842997 2 2 0 0 False 2 2 0 0 TP chr20 4032219 4033601 2 2 2 1 True 4 4 0 0 TP @@ -19,7 +19,7 @@ chr20 5554104 5554185 0 0 0 0 False 0 0 0 0 TN chr20 5962327 5962410 0 0 0 0 False 0 0 0 0 TN chr20 7435215 7435749 2 2 0 0 False 2 2 0 0 TP chr20 7719330 7719414 1 1 0 0 False 1 1 0 0 TP -chr20 7720911 7721003 1 1 1 0 True 1 1 0 0 TP +chr20 7720911 7721003 1 1 0 0 False 1 1 0 0 TP chr20 7721059 7721319 0 0 0 0 False 0 0 0 0 TN chr20 8661911 8662657 1 1 3 1 True 9 9 0 0 TP chr20 9015548 9016039 1 1 0 0 False 1 1 0 0 TP @@ -33,7 +33,7 @@ chr20 16226954 16228487 3 3 0 0 False 3 3 0 0 TP chr20 16257741 16259345 2 2 2 0 True 4 4 1 0 FN chr20 16395149 16395421 3 3 1 0 True 3 3 0 0 TP chr20 16894360 16894435 1 1 0 0 False 1 1 0 0 TP -chr20 17081216 17081410 2 2 1 0 True 2 2 0 0 TP +chr20 17081216 17081410 2 2 0 0 False 2 2 0 0 TP chr20 17197123 17197241 2 2 0 0 False 2 2 0 0 TP chr20 17303131 17303609 1 1 0 0 False 1 1 0 0 TP chr20 18209097 18210727 3 3 2 1 True 5 5 0 0 TP @@ -43,9 +43,9 @@ chr20 19663366 19663526 2 2 0 0 False 2 2 0 0 TP chr20 20295955 20296565 2 2 2 1 True 3 3 0 0 TP chr20 20320285 20320579 1 1 2 1 True 1 1 0 0 TP chr20 20337200 20337707 1 1 6 2 True 3 3 0 0 TP -chr20 20354570 20358323 7 7 9 1 True 16 16 0 0 TP +chr20 20354570 20358323 7 7 8 1 True 16 16 0 0 TP chr20 20458640 20458937 2 2 0 0 False 2 2 0 0 TP -chr20 21120212 21120539 2 2 1 0 True 1 1 0 0 TP +chr20 21120212 21120539 2 2 0 0 False 2 2 0 0 TP chr20 21721174 21721735 2 2 1 0 True 3 3 0 0 TP chr20 22081961 22084122 4 4 3 1 True 9 9 0 0 TP chr20 22471831 22472129 2 2 0 0 False 2 2 0 0 TP @@ -54,7 +54,7 @@ chr20 23155504 23155975 3 3 2 2 True 5 5 0 0 TP chr20 23267965 23268087 2 2 0 0 False 2 2 0 0 TP chr20 23560883 23561191 1 1 2 2 True 3 3 0 0 TP chr20 23942431 23942955 1 1 0 0 False 1 1 0 0 TP -chr20 24407983 24408997 3 3 1 0 True 2 2 0 0 TP +chr20 24407983 24408997 3 3 0 0 False 3 3 0 0 TP chr20 24681988 24682277 2 2 1 0 True 2 2 0 0 TP chr20 24836081 24836384 2 2 0 0 False 2 2 0 0 TP chr20 24956961 24957198 2 2 0 0 False 2 2 0 0 TP @@ -71,10 +71,10 @@ chr20 32723469 32724637 0 0 0 0 False 0 0 0 0 TN chr20 32724927 32724999 0 0 0 0 False 0 0 0 0 TN chr20 34205619 34205960 1 1 0 0 False 1 1 0 0 TP chr20 34235750 34236213 0 0 2 1 True 2 2 0 0 TP -chr20 35539120 35539724 4 4 1 0 True 4 4 0 0 TP 
+chr20 35539120 35539724 4 4 0 0 False 4 4 0 0 TP chr20 35580013 35580886 1 1 2 1 True 7 7 0 0 TP chr20 36642142 36642222 1 1 0 0 False 1 1 0 0 TP -chr20 37361752 37362058 2 2 1 0 True 2 2 0 0 TP +chr20 37361752 37362058 2 2 0 0 False 2 2 0 0 TP chr20 38123590 38124705 2 2 1 4 True 7 7 0 0 TP chr20 38194533 38194666 2 2 0 0 False 2 2 0 0 TP chr20 38313618 38315069 3 3 0 0 False 3 3 0 0 TP @@ -113,13 +113,13 @@ chr20 53913028 53913352 2 2 0 0 False 2 2 0 0 TP chr20 54690771 54690873 2 2 0 0 False 2 2 0 0 TP chr20 55532159 55533378 1 1 0 0 False 1 1 0 0 TP chr20 55624482 55625807 6 6 6 0 True 11 11 0 0 TP -chr20 55627344 55628450 7 7 4 0 True 8 8 0 0 TP +chr20 55627344 55628450 7 7 3 0 True 8 8 0 0 TP chr20 55943919 55945807 2 2 2 1 True 5 5 0 0 TP chr20 56280422 56282114 4 4 5 1 True 10 10 0 0 TP chr20 56284258 56284407 2 2 0 0 False 2 2 0 0 TP chr20 56710555 56711193 2 2 0 0 False 2 2 0 0 TP chr20 57090672 57091411 1 1 2 0 True 4 4 0 0 TP -chr20 57110212 57110785 2 2 1 0 True 2 2 0 0 TP +chr20 57110212 57110785 2 2 0 0 False 2 2 0 0 TP chr20 57189946 57190540 0 0 3 1 True 1 1 0 0 TP chr20 57350808 57350987 1 1 1 0 True 3 3 0 0 TP chr20 57417271 57418234 2 2 0 0 False 2 2 0 0 TP @@ -143,7 +143,7 @@ chr20 61176725 61176974 2 2 0 0 False 2 2 0 0 TP chr20 61201683 61202474 2 2 4 1 True 7 7 0 0 TP chr20 61229587 61229717 2 2 0 0 False 2 2 0 0 TP chr20 61282506 61283669 4 4 2 0 True 6 6 0 0 TP -chr20 61289625 61290555 1 1 2 1 True 2 2 0 0 TP +chr20 61289625 61290555 1 1 1 1 True 2 2 0 0 TP chr20 61329258 61329577 0 0 0 2 False 0 0 0 2 FP chr20 61331528 61331753 2 2 0 0 False 2 2 0 0 TP chr20 61337205 61337590 2 2 0 0 False 2 2 0 0 TP @@ -152,8 +152,8 @@ chr20 61475406 61475726 2 2 0 0 False 2 2 0 0 TP chr20 61561920 61562503 0 0 2 1 True 2 2 0 0 TP chr20 61653427 61654067 1 1 0 0 False 1 1 0 0 TP chr20 61723833 61724244 1 1 0 0 False 1 1 0 0 TP -chr20 61744188 61744654 2 2 1 0 True 2 2 0 0 TP -chr20 61783403 61784839 3 3 1 0 True 4 4 0 0 TP +chr20 61744188 61744654 2 2 0 0 False 2 2 0 0 TP +chr20 61783403 61784839 3 3 0 0 False 3 3 0 0 TP chr20 61919676 61921374 2 2 0 0 False 2 2 0 0 TP chr20 62057573 62059139 1 1 3 1 True 7 7 0 0 TP chr20 62212844 62213874 4 4 0 0 False 4 4 0 0 TP @@ -161,7 +161,7 @@ chr20 62270279 62271094 1 1 3 1 True 4 4 0 0 TP chr20 62317837 62318459 2 2 0 0 False 2 2 0 0 TP chr20 62321031 62321873 2 2 3 0 True 5 5 0 0 TP chr20 62349612 62349932 1 1 5 1 True 6 6 0 0 TP -chr20 62360190 62360712 0 0 8 2 True 5 5 0 0 TP +chr20 62360190 62360712 0 0 7 2 True 5 5 0 0 TP chr20 62417424 62418062 3 3 0 0 False 3 3 0 0 TP chr20 62422444 62422791 2 2 0 0 False 2 2 0 0 TP chr20 62504279 62504614 2 2 0 0 False 2 2 0 0 TP @@ -190,7 +190,7 @@ chr20 63294079 63294266 2 2 0 0 False 2 2 0 0 TP chr20 63303655 63304858 2 2 0 0 False 2 2 0 0 TP chr20 63334892 63335504 2 2 0 0 False 2 2 0 0 TP chr20 63361274 63361471 2 2 0 0 False 2 2 0 0 TP -chr20 63372065 63372691 2 2 1 0 True 2 2 0 0 TP +chr20 63372065 63372691 2 2 0 0 False 2 2 0 0 TP chr20 63392592 63392812 1 1 0 0 False 1 1 0 0 TP chr20 63401702 63402319 1 1 0 0 False 1 1 0 0 TP chr20 63450491 63450613 2 2 0 0 False 2 2 0 0 TP @@ -199,7 +199,7 @@ chr20 63491589 63493064 1 1 3 1 True 4 4 0 0 TP chr20 63535674 63536245 1 1 2 0 True 1 1 0 0 TP chr20 63546436 63546842 1 1 0 0 False 1 1 0 0 TP chr20 63553294 63553757 1 1 0 0 False 1 1 0 0 TP -chr20 63559288 63559963 1 1 4 1 True 4 4 0 0 TP +chr20 63559288 63559963 1 1 2 1 True 4 4 0 0 TP chr20 63641779 63642135 1 1 2 1 True 2 2 0 0 TP chr20 63669579 63670496 2 2 0 0 False 2 2 0 0 TP chr20 
63693325 63693884 1 1 6 1 True 10 10 0 0 TP @@ -217,7 +217,7 @@ chr20 64042716 64043061 1 1 0 0 False 1 1 0 0 TP chr20 64065853 64066011 0 0 0 1 False 0 0 0 1 FP chr20 64090704 64091389 0 0 0 2 False 0 0 0 2 FP chr20 64096658 64097164 0 0 0 2 False 0 0 0 2 FP -chr20 64125109 64127974 3 3 3 0 True 12 12 0 0 TP +chr20 64125109 64127974 3 3 2 0 True 12 12 0 0 TP chr20 64131804 64133955 5 5 8 1 True 24 24 0 0 TP chr20 64134883 64136386 3 3 1 0 True 4 4 0 0 TP chr20 64157894 64158976 2 2 0 0 False 2 2 0 0 TP diff --git a/repo_utils/answer_key/refine/refine_output_one/refine.variant_summary.json b/repo_utils/answer_key/refine/refine_output_one/refine.variant_summary.json index edc91450..b9464c8e 100644 --- a/repo_utils/answer_key/refine/refine_output_one/refine.variant_summary.json +++ b/repo_utils/answer_key/refine/refine_output_one/refine.variant_summary.json @@ -1,11 +1,11 @@ { - "TP-base": 593, - "TP-comp": 593, + "TP-base": 594, + "TP-comp": 594, "FP": 15, "FN": 2, - "precision": 0.975328947368421, - "recall": 0.9966386554621849, - "f1": 0.9858686616791356, - "base cnt": 595, - "comp cnt": 608 + "precision": 0.9753694581280788, + "recall": 0.9966442953020134, + "f1": 0.9858921161825727, + "base cnt": 596, + "comp cnt": 609 } \ No newline at end of file diff --git a/repo_utils/answer_key/refine/refine_output_three/refine.region_summary.json b/repo_utils/answer_key/refine/refine_output_three/refine.region_summary.json index af76b54f..20e8fdaa 100644 --- a/repo_utils/answer_key/refine/refine_output_three/refine.region_summary.json +++ b/repo_utils/answer_key/refine/refine_output_three/refine.region_summary.json @@ -2,17 +2,17 @@ "TP": 22, "TN": 0, "FP": 24, - "FN": 58, - "base P": 83, + "FN": 48, + "base P": 73, "base N": 9, - "comp P": 91, + "comp P": 81, "comp N": 1, - "PPV": 0.24175824175824176, - "TPR": 0.26506024096385544, + "PPV": 0.2716049382716049, + "TPR": 0.3013698630136986, "TNR": 0.0, "NPV": 0.0, - "ACC": 0.2391304347826087, - "BA": 0.13253012048192772, - "F1": 0.25287356321839083, + "ACC": 0.2682926829268293, + "BA": 0.1506849315068493, + "F1": 0.2857142857142857, "UND": 0 } \ No newline at end of file diff --git a/repo_utils/answer_key/refine/refine_output_three/refine.regions.txt b/repo_utils/answer_key/refine/refine_output_three/refine.regions.txt index 4866a470..18097dbd 100644 --- a/repo_utils/answer_key/refine/refine_output_three/refine.regions.txt +++ b/repo_utils/answer_key/refine/refine_output_three/refine.regions.txt @@ -1,37 +1,31 @@ chrom start end in_tpbase in_tp in_fn in_fp refined out_tpbase out_tp out_fn out_fp state chr20 278819 279179 3 3 1 0 False 3 3 1 0 FN chr20 641802 642530 3 3 2 0 False 3 3 2 0 FN -chr20 2240850 2241400 1 1 5 1 True 1 1 4 1 FN,FP +chr20 2240850 2241400 1 1 4 1 True 1 1 4 1 FN,FP chr20 4032247 4033338 2 2 2 1 True 2 2 2 2 FN,FP chr20 5040366 5040587 0 0 0 2 False 0 0 0 2 FP chr20 5041831 5042378 1 1 1 1 True 2 2 0 0 TP -chr20 7720842 7721078 1 1 1 0 False 1 1 1 0 FN chr20 8661834 8662229 1 1 3 1 True 4 4 0 0 TP chr20 10802617 10802954 1 1 1 0 False 1 1 1 0 FN chr20 13848162 13848654 3 3 1 0 False 3 3 1 0 FN chr20 14861944 14862754 5 5 5 1 True 6 6 1 0 FN chr20 16257744 16259315 2 2 2 0 False 2 2 2 0 FN chr20 16395091 16395483 3 3 1 0 False 3 3 1 0 FN -chr20 17081183 17081475 2 2 1 0 False 2 2 1 0 FN chr20 18209029 18210244 3 3 2 1 True 6 6 0 2 FP chr20 20295904 20296440 2 2 2 1 True 5 5 0 0 TP chr20 20320229 20320629 1 1 2 1 True 4 4 0 0 TP chr20 20337175 20337734 1 1 6 2 True 3 3 0 0 TP chr20 20354802 20355545 3 3 1 0 False 3 3 1 0 FN 
-chr20 20356420 20357920 4 4 8 1 True 9 9 2 0 FN -chr20 21120188 21120571 2 2 1 0 False 2 2 1 0 FN +chr20 20356420 20357920 4 4 7 1 True 9 9 2 0 FN chr20 21721341 21721756 2 2 1 0 False 2 2 1 0 FN chr20 22082156 22084015 4 4 3 1 True 3 3 4 1 FN,FP chr20 23155468 23155967 3 3 2 2 True 4 4 1 1 FN,FP chr20 23560829 23561208 1 1 2 2 True 2 2 0 0 TP -chr20 24407963 24408930 3 3 1 0 False 3 3 1 0 FN chr20 24681956 24682235 2 2 1 0 False 2 2 1 0 FN chr20 25781680 25781901 0 0 0 1 False 0 0 0 1 FP chr20 32722934 32723155 0 0 1 0 False 0 0 1 0 FN chr20 34235788 34236091 0 0 2 1 True 2 2 0 0 TP -chr20 35539102 35539692 4 4 1 0 False 4 4 1 0 FN chr20 35580576 35580866 1 1 2 1 True 3 3 2 1 FN,FP -chr20 37361675 37361996 2 2 1 0 False 2 2 1 0 FN chr20 38123689 38124113 2 2 1 4 True 3 3 3 4 FN,FP chr20 38463887 38464454 2 2 2 0 False 2 2 2 0 FN chr20 41196260 41196605 3 3 1 0 False 3 3 1 0 FN @@ -44,11 +38,10 @@ chr20 50775536 50775942 1 1 2 1 True 1 1 0 1 FP chr20 51953709 51953930 0 0 0 1 False 0 0 0 1 FP chr20 53203989 53204362 2 2 2 1 True 2 2 0 0 TP chr20 55624698 55625762 6 6 6 0 False 6 6 6 0 FN -chr20 55627528 55628415 7 7 4 0 False 7 7 4 0 FN +chr20 55627528 55628415 7 7 3 0 False 7 7 3 0 FN chr20 55944162 55945285 2 2 2 1 True 2 2 2 0 FN chr20 56280431 56282023 4 4 5 1 True 4 4 2 1 FN,FP chr20 57090758 57091276 1 1 2 0 False 1 1 2 0 FN -chr20 57110340 57110703 2 2 1 0 False 2 2 1 0 FN chr20 57190146 57190538 0 0 3 1 True 1 1 0 0 TP chr20 57350746 57351030 1 1 1 0 False 1 1 1 0 FN chr20 57948891 57949456 1 1 4 1 True 1 1 4 1 FN,FP @@ -58,16 +51,14 @@ chr20 60702895 60703197 2 2 2 1 True 3 3 0 0 TP chr20 61100811 61102515 1 1 4 1 True 4 4 0 0 TP chr20 61201712 61202352 2 2 4 1 True 3 3 0 0 TP chr20 61282815 61283589 4 4 2 0 False 4 4 2 0 FN -chr20 61289552 61290383 1 1 2 1 True 2 2 0 0 TP +chr20 61289552 61290383 1 1 1 1 True 2 2 0 0 TP chr20 61329235 61329551 0 0 0 2 False 0 0 0 2 FP chr20 61561999 61562362 0 0 2 1 True 1 1 0 0 TP -chr20 61744291 61744702 2 2 1 0 False 2 2 1 0 FN -chr20 61783848 61784808 3 3 1 0 False 3 3 1 0 FN chr20 62057492 62058878 1 1 3 1 True 2 2 0 0 TP chr20 62270303 62270937 1 1 3 1 True 4 4 0 0 TP chr20 62321286 62321840 2 2 3 0 False 2 2 3 0 FN chr20 62349531 62349936 1 1 5 1 True 1 1 2 1 FN,FP -chr20 62360300 62360712 0 0 8 2 True 4 4 2 1 FN,FP +chr20 62360300 62360712 0 0 7 2 True 4 4 2 1 FN,FP chr20 62830540 62830807 2 2 1 1 True 1 1 0 0 TP chr20 62875131 62875514 2 2 3 0 False 2 2 3 0 FN chr20 63027956 63029140 4 4 1 1 True 3 3 0 0 TP @@ -75,10 +66,9 @@ chr20 63048983 63049269 3 3 1 0 False 3 3 1 0 FN chr20 63154577 63155031 1 1 1 0 False 1 1 1 0 FN chr20 63167363 63167674 2 2 1 0 False 2 2 1 0 FN chr20 63221399 63221831 1 1 2 1 True 2 2 0 0 TP -chr20 63372104 63372510 2 2 1 0 False 2 2 1 0 FN chr20 63491847 63492500 1 1 3 1 True 4 4 0 0 TP chr20 63535641 63536112 1 1 2 0 False 1 1 2 0 FN -chr20 63559305 63559829 1 1 4 1 True 5 5 0 0 TP +chr20 63559305 63559829 1 1 2 1 True 5 5 0 0 TP chr20 63641737 63642125 1 1 2 1 True 3 3 0 0 TP chr20 63693339 63693842 1 1 6 1 True 10 10 1 0 FN chr20 63770826 63771124 0 0 0 2 False 0 0 0 2 FP @@ -87,7 +77,7 @@ chr20 63964695 63966223 1 1 1 0 False 1 1 1 0 FN chr20 64065772 64065993 0 0 0 1 False 0 0 0 1 FP chr20 64090623 64091117 0 0 0 2 False 0 0 0 2 FP chr20 64096929 64097150 0 0 0 2 False 0 0 0 2 FP -chr20 64125250 64127985 3 3 3 0 False 3 3 3 0 FN +chr20 64125250 64127985 3 3 2 0 False 3 3 2 0 FN chr20 64131803 64133966 5 5 8 1 True 5 5 6 5 FN,FP chr20 64134880 64136440 3 3 1 0 False 3 3 1 0 FN chr20 64173328 64176440 4 4 7 
3 True 9 9 3 6 FN,FP diff --git a/repo_utils/answer_key/refine/refine_output_three/refine.variant_summary.json b/repo_utils/answer_key/refine/refine_output_three/refine.variant_summary.json index abb55304..13ae86d6 100644 --- a/repo_utils/answer_key/refine/refine_output_three/refine.variant_summary.json +++ b/repo_utils/answer_key/refine/refine_output_three/refine.variant_summary.json @@ -2,10 +2,10 @@ "TP-base": 452, "TP-comp": 452, "FP": 43, - "FN": 107, + "FN": 95, "precision": 0.9131313131313131, - "recall": 0.8085867620751341, - "f1": 0.857685009487666, - "base cnt": 559, + "recall": 0.8263254113345521, + "f1": 0.8675623800383877, + "base cnt": 547, "comp cnt": 495 } \ No newline at end of file diff --git a/repo_utils/answer_key/refine/refine_output_two/refine.regions.txt b/repo_utils/answer_key/refine/refine_output_two/refine.regions.txt index 0902bf25..8e0d4009 100644 --- a/repo_utils/answer_key/refine/refine_output_two/refine.regions.txt +++ b/repo_utils/answer_key/refine/refine_output_two/refine.regions.txt @@ -18,15 +18,15 @@ chr20 13848064 13848749 3 3 1 0 True 3 3 0 0 TP chr20 14861837 14862841 5 5 5 1 True 15 15 0 0 TP chr20 16226854 16228587 3 3 0 0 False 3 3 0 0 TP chr20 16395049 16395521 3 3 1 0 True 3 3 0 0 TP -chr20 17081116 17081510 2 2 1 0 True 2 2 0 0 TP +chr20 17081116 17081510 2 2 0 0 False 2 2 0 0 TP chr20 17197023 17197341 2 2 0 0 False 2 2 0 0 TP chr20 18208997 18210827 3 3 2 1 True 5 5 0 0 TP chr20 18675550 18676015 2 2 0 0 False 2 2 0 0 TP chr20 19663266 19663626 2 2 0 0 False 2 2 0 0 TP chr20 20337100 20337807 1 1 6 2 True 3 3 0 0 TP -chr20 20354470 20358423 7 7 9 1 True 16 16 0 0 TP +chr20 20354470 20358423 7 7 8 1 True 16 16 0 0 TP chr20 20458540 20459037 2 2 0 0 False 2 2 0 0 TP -chr20 21120112 21120639 2 2 1 0 True 1 1 0 0 TP +chr20 21120112 21120639 2 2 0 0 False 2 2 0 0 TP chr20 21721074 21721835 2 2 1 0 True 3 3 0 0 TP chr20 22471731 22472229 2 2 0 0 False 2 2 0 0 TP chr20 22882602 22882930 2 2 0 0 False 2 2 0 0 TP @@ -61,11 +61,11 @@ chr20 50853539 50854028 2 2 0 0 False 2 2 0 0 TP chr20 51500492 51500980 2 2 0 0 False 2 2 0 0 TP chr20 53203887 53204573 2 2 2 1 True 4 4 0 0 TP chr20 53415542 53415960 2 2 0 0 False 2 2 0 0 TP -chr20 55627244 55628550 7 7 4 0 True 8 8 0 0 TP +chr20 55627244 55628550 7 7 3 0 True 8 8 0 0 TP chr20 55943819 55945907 2 2 2 1 True 5 5 0 0 TP chr20 56284158 56284507 2 2 0 0 False 2 2 0 0 TP chr20 56710455 56711293 2 2 0 0 False 2 2 0 0 TP -chr20 57110112 57110885 2 2 1 0 True 2 2 0 0 TP +chr20 57110112 57110885 2 2 0 0 False 2 2 0 0 TP chr20 57417171 57418334 2 2 0 0 False 2 2 0 0 TP chr20 57974704 57975302 2 2 0 0 False 2 2 0 0 TP chr20 58196600 58197036 2 2 0 0 False 2 2 0 0 TP @@ -82,8 +82,8 @@ chr20 61329158 61329677 0 0 0 2 False 0 0 0 2 FP chr20 61331428 61331853 2 2 0 0 False 2 2 0 0 TP chr20 61337105 61337690 2 2 0 0 False 2 2 0 0 TP chr20 61475306 61475826 2 2 0 0 False 2 2 0 0 TP -chr20 61744088 61744754 2 2 1 0 True 2 2 0 0 TP -chr20 61783303 61784939 3 3 1 0 True 4 4 0 0 TP +chr20 61744088 61744754 2 2 0 0 False 2 2 0 0 TP +chr20 61783303 61784939 3 3 0 0 False 3 3 0 0 TP chr20 61919576 61921474 2 2 0 0 False 2 2 0 0 TP chr20 62057473 62059239 1 1 3 1 True 7 7 0 0 TP chr20 62212744 62213974 4 4 0 0 False 4 4 0 0 TP @@ -110,11 +110,11 @@ chr20 63293979 63294366 2 2 0 0 False 2 2 0 0 TP chr20 63303555 63304958 2 2 0 0 False 2 2 0 0 TP chr20 63334792 63335604 2 2 0 0 False 2 2 0 0 TP chr20 63361174 63361571 2 2 0 0 False 2 2 0 0 TP -chr20 63371965 63372791 2 2 1 0 True 2 2 0 0 TP +chr20 63371965 63372791 2 2 0 0 False 
2 2 0 0 TP chr20 63450391 63450713 2 2 0 0 False 2 2 0 0 TP chr20 63486663 63487516 2 2 0 0 False 2 2 0 0 TP chr20 63491489 63493164 1 1 3 1 True 4 4 0 0 TP -chr20 63559188 63560063 1 1 4 1 True 4 4 0 0 TP +chr20 63559188 63560063 1 1 2 1 True 4 4 0 0 TP chr20 63641679 63642235 1 1 2 1 True 2 2 0 0 TP chr20 63669479 63670596 2 2 0 0 False 2 2 0 0 TP chr20 63693225 63693984 1 1 6 1 True 10 10 0 0 TP @@ -123,7 +123,7 @@ chr20 63824531 63825244 2 2 0 0 False 2 2 0 0 TP chr20 63948414 63948858 2 2 1 1 True 3 3 0 0 TP chr20 64090604 64091489 0 0 0 2 False 0 0 0 2 FP chr20 64096558 64097264 0 0 0 2 False 0 0 0 2 FP -chr20 64125009 64128074 3 3 3 0 True 12 12 0 0 TP +chr20 64125009 64128074 3 3 2 0 True 12 12 0 0 TP chr20 64131704 64134055 5 5 8 1 True 24 24 0 0 TP chr20 64134783 64136486 3 3 1 0 True 4 4 0 0 TP chr20 64173309 64176629 4 4 7 3 True 17 17 0 0 TP From b63c5c34f7ec440f833d3e7c840dc695b6cc288e Mon Sep 17 00:00:00 2001 From: Adam English Date: Wed, 7 Aug 2024 11:31:59 -0400 Subject: [PATCH 04/16] Fix #221 `stratify -w` is now default. `-w` is removed. `-v` is the old default. Or.. stratify by default counts variants within regions and `-v` counts those outside regions --- repo_utils/sub_tests/stratify.sh | 2 +- truvari/stratify.py | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/repo_utils/sub_tests/stratify.sh b/repo_utils/sub_tests/stratify.sh index 812f0a0c..b7f3257a 100644 --- a/repo_utils/sub_tests/stratify.sh +++ b/repo_utils/sub_tests/stratify.sh @@ -1,7 +1,7 @@ # ------------------------------------------------------------ # stratify # ------------------------------------------------------------ -run test_stratify $truv stratify -w \ +run test_stratify $truv stratify \ $INDIR/beds/include.bed \ $INDIR/variants/input1.vcf.gz \ -o $OD/stratify.txt diff --git a/truvari/stratify.py b/truvari/stratify.py index 60c81b5c..7535554a 100644 --- a/truvari/stratify.py +++ b/truvari/stratify.py @@ -1,5 +1,5 @@ """ -Count variants per-region in vcf +Count variants which start and end within each bed region """ import os import logging @@ -30,8 +30,8 @@ def parse_args(args): help="Output bed-like file") parser.add_argument("--header", action="store_true", help="Input regions have header to preserve in output") - parser.add_argument("-w", "--within", action="store_true", - help="Only count variants contained completely within region boundaries") + parser.add_argument("-v", "--complement", action="store_false", + help="Only count variants not within region boundaries") parser.add_argument("--debug", action="store_true", help="Verbose logging") args = parser.parse_args(args) @@ -62,7 +62,7 @@ def count_entries(vcf, chroms, regions, within): return counts -def benchdir_count_entries(benchdir, regions, within=False, threads=4): +def benchdir_count_entries(benchdir, regions, within=True, threads=4): """ Count the number of variants per bed region in Truvari bench directory by state @@ -95,12 +95,12 @@ def stratify_main(cmdargs): regions = pd.read_csv(args.regions, sep='\t', header=read_header) r_list = regions.to_numpy().tolist() # the methods expect lists if os.path.isdir(args.in_vcf): - counts = benchdir_count_entries(args.in_vcf, r_list, args.within) + counts = benchdir_count_entries(args.in_vcf, r_list, args.complement) else: chroms = np.array([_[0] for _ in r_list]) intvs = np.array([[_[1], _[2]] for _ in r_list]) counts = count_entries(pysam.VariantFile( - args.in_vcf), chroms, intvs, args.within) + args.in_vcf), chroms, intvs, args.complement) counts = 
pd.Series(counts, name="count").to_frame() counts.index = regions.index regions = regions.join(counts) From e0f9ce61c8766430a3b3f8c642f44d382e329e12 Mon Sep 17 00:00:00 2001 From: Adam English Date: Sun, 11 Aug 2024 13:42:53 -0400 Subject: [PATCH 05/16] update bed parsing slightly faster IntervalTree.from_tuples than .addi --- README.md | 1 + repo_utils/answer_key/anno/anno_bpovl.jl | Bin 2393 -> 2410 bytes truvari/annotations/density.py | 4 ++-- truvari/region_vcf_iter.py | 7 +++++-- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index bdbc7227..69deb8d4 100644 --- a/README.md +++ b/README.md @@ -52,6 +52,7 @@ Use Truvari's comparison engine to consolidate redundant variants in a merged mu - [segment](https://github.com/acenglish/truvari/wiki/segment) - Normalization of SVs into disjointed genomic regions - [stratify](https://github.com/acenglish/truvari/wiki/stratify) - Count variants per-region in vcf - [divide](https://github.com/ACEnglish/truvari/wiki/divide) - Divide a VCF into independent shards + - [ga4gh](https://github.com/ACEnglish/truvari/wiki/ga4gh) - Consolidate benchmarking result VCFs ## 🔎 More Information diff --git a/repo_utils/answer_key/anno/anno_bpovl.jl b/repo_utils/answer_key/anno/anno_bpovl.jl index 00328b57e8e4ffe188c3641446f8c64abf7baf43..11e3e0c8fad7ccad07fddeb64c92f702ab60c767 100644 GIT binary patch delta 278 zcmca9^h$`Ofn{pqMiwqcjvm&OlFEYADU$^l;~C>O=Q9>CGfGc>z@jRu*pZ>+El`@o z{T~6iCr@M*pRBz0S&&tnk;bI70Mt@OC_S|b%E(Zitj+GgXfZj5 sT~|adL$MR$j|QgXywco)O1=2W`HVS}B{-%qT1{Tdq0i_v`7TEd0KvIFk^lez delta 260 zcmaDQbW@0>fn{p+MiwqcrVPo+VvO;V_p;|~e#FSa%qTVaIg2WfLPv(8w?JtU)Bpe6 zlNYjzPqttcn7jgrp|l`B0|YQaX@SX#tm2F`CZz?Sw!x^WRZv!j%4BPH2S)SBHSD^= kvKb1U5a%~ACFhmq7F6mj-pkI&XgPTwhd!g@ Date: Mon, 26 Aug 2024 12:46:26 -0400 Subject: [PATCH 06/16] correct monomorphic reference check --- truvari/matching.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/truvari/matching.py b/truvari/matching.py index 60b85958..fd114b4a 100644 --- a/truvari/matching.py +++ b/truvari/matching.py @@ -155,7 +155,7 @@ def filter_call(self, entry, base=False): Returns True if the call should be filtered Base has different filtering requirements, so let the method know """ - if self.params.check_monref and entry.alts and entry.alts[0] in (None, '*'): # ignore monomorphic reference + if self.params.check_monref and (not entry.alts or entry.alts[0] in (None, '*')): # ignore monomorphic reference return True if self.params.check_multi and len(entry.alts) > 1: From a39ce0ac4af2741a475fd92b82b74549bdacf112 Mon Sep 17 00:00:00 2001 From: Adam English Date: Mon, 26 Aug 2024 13:50:01 -0400 Subject: [PATCH 07/16] test coverage for monref --- repo_utils/run_unittest.py | 15 +++++++++++++++ repo_utils/test_files/variants/filter.vcf | 10 ++++++++++ truvari/matching.py | 5 +---- 3 files changed, 26 insertions(+), 4 deletions(-) create mode 100644 repo_utils/test_files/variants/filter.vcf diff --git a/repo_utils/run_unittest.py b/repo_utils/run_unittest.py index efe22db2..74fdae7f 100644 --- a/repo_utils/run_unittest.py +++ b/repo_utils/run_unittest.py @@ -70,3 +70,18 @@ vcf = pysam.VariantFile(vcf_fn) for entry in truvari.region_filter_fetch(vcf, tree, False): assert entry.info['include'] == 'in', f"Bad in {str(entry)}" + + +""" +Filtering logic +""" +vcf = pysam.VariantFile("repo_utils/test_files/variants/filter.vcf") +matcher = truvari.Matcher() +matcher.params.sizemin = 0 +matcher.params.sizefilt = 0 +matcher.params.passonly = True 
+for entry in vcf: + try: + assert matcher.filter_call(entry), f"Didn't filter {str(entry)}" + except ValueError as e: + assert e.args[0].startswith("Cannot compare multi-allelic"), f"Unknown exception {str(entry)}" diff --git a/repo_utils/test_files/variants/filter.vcf b/repo_utils/test_files/variants/filter.vcf new file mode 100644 index 00000000..cbffdf9a --- /dev/null +++ b/repo_utils/test_files/variants/filter.vcf @@ -0,0 +1,10 @@ +##fileformat=VCFv4.2 +##contig= +##FORMAT= +##INFO= +##FILTER= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG002 +reference 6 F1 TATATAT . 30 . reason=monref GT 1|1 +reference 7 F2 ATATGCG * 30 . reason=star GT 1|1 +reference 8 F3 ATATGCG A 30 FAIL reason=Fail GT 1|1 +reference 9 F3 ATATGCG A,C 30 . reason=multi GT 1|1 diff --git a/truvari/matching.py b/truvari/matching.py index fd114b4a..95b225f7 100644 --- a/truvari/matching.py +++ b/truvari/matching.py @@ -1,7 +1,6 @@ """ Comparison engine """ -import sys import types import logging from collections import Counter, defaultdict @@ -159,9 +158,7 @@ def filter_call(self, entry, base=False): return True if self.params.check_multi and len(entry.alts) > 1: - logging.error("Cannot compare multi-allelic records. Please split") - logging.error("line %s", str(entry)) - sys.exit(10) + raise ValueError(f"Cannot compare multi-allelic records. Please split\nline {str(entry)}") if self.params.passonly and truvari.entry_is_filtered(entry): return True From ffa0e813f7f8357d1b33d3e0c5ec887067eebf8f Mon Sep 17 00:00:00 2001 From: Adam English Date: Wed, 28 Aug 2024 21:23:35 -0400 Subject: [PATCH 08/16] off-by-one error in stratify. Messed with truvari refine --- truvari/stratify.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/truvari/stratify.py b/truvari/stratify.py index 7535554a..cdade0cf 100644 --- a/truvari/stratify.py +++ b/truvari/stratify.py @@ -53,10 +53,9 @@ def count_entries(vcf, chroms, regions, within): for idx, row in enumerate(zip(chroms, regions)): chrom, coords = row start, end = coords - end += 1 tree[chrom].addi(start, end + 1) counts_idx[(chrom, start, end)] = idx - for _, location in truvari.region_filter(vcf, tree, within, True): + for _, location in truvari.region_filter(vcf, tree, inside=within, with_region=True): key = (location[0], location[1].begin, location[1].end - 1) counts[counts_idx[key]] += 1 return counts From 967a5b66af08354fe95bb816a9fca41fbd1cad6f Mon Sep 17 00:00:00 2001 From: Adam English Date: Wed, 4 Sep 2024 11:02:33 -0400 Subject: [PATCH 09/16] raising warnings about --reference deprecation it will be removed at some point --- truvari/bench.py | 6 ++++-- truvari/collapse.py | 5 +++++ truvari/matching.py | 2 -- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/truvari/bench.py b/truvari/bench.py index 52e38799..9dc90d66 100644 --- a/truvari/bench.py +++ b/truvari/bench.py @@ -143,8 +143,10 @@ def check_params(args): if args.includebed and not os.path.exists(args.includebed): logging.error("Include bed %s does not exist", args.includebed) check_fail = True - if args.reference and not os.path.exists(args.reference): - logging.error("Reference %s does not exist", args.reference) + if args.reference: + logging.warning("`--reference` is no longer recommended and will be deprecated by v5") + not os.path.exists(args.reference): + logging.error("Reference %s does not exist", args.reference) check_fail = True return check_fail diff --git a/truvari/collapse.py b/truvari/collapse.py index 27ff4027..4e56c8b8 100644 --- a/truvari/collapse.py +++ 
b/truvari/collapse.py @@ -566,6 +566,11 @@ def check_params(args): if args.hap and args.keep != "first": check_fail = True logging.error("Using --hap must use --keep first") + if args.reference: + logging.warning("`--reference` is no longer recommended and will be deprecated by v5") + not os.path.exists(args.reference): + logging.error("Reference %s does not exist", args.reference) + check_fail = True return check_fail diff --git a/truvari/matching.py b/truvari/matching.py index 95b225f7..5ff891e3 100644 --- a/truvari/matching.py +++ b/truvari/matching.py @@ -89,8 +89,6 @@ def __init__(self, args=None): self.reference = None if self.params.reference is not None: - #sys.stderr.write("WARNING `--reference` is no longer recommended for use with bench/collapse ") - #sys.stderr.write("results will be slower and less accurate.\n") self.reference = pysam.FastaFile(self.params.reference) @staticmethod From bce0b5b1fcf5f1dfda5740826bfc6a66213a98ec Mon Sep 17 00:00:00 2001 From: Adam English Date: Wed, 4 Sep 2024 11:12:23 -0400 Subject: [PATCH 10/16] syntax error --- truvari/bench.py | 2 +- truvari/collapse.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/truvari/bench.py b/truvari/bench.py index 9dc90d66..83aed73a 100644 --- a/truvari/bench.py +++ b/truvari/bench.py @@ -145,7 +145,7 @@ def check_params(args): check_fail = True if args.reference: logging.warning("`--reference` is no longer recommended and will be deprecated by v5") - not os.path.exists(args.reference): + if not os.path.exists(args.reference): logging.error("Reference %s does not exist", args.reference) check_fail = True return check_fail diff --git a/truvari/collapse.py b/truvari/collapse.py index 4e56c8b8..a1388912 100644 --- a/truvari/collapse.py +++ b/truvari/collapse.py @@ -568,7 +568,7 @@ def check_params(args): logging.error("Using --hap must use --keep first") if args.reference: logging.warning("`--reference` is no longer recommended and will be deprecated by v5") - not os.path.exists(args.reference): + if not os.path.exists(args.reference): logging.error("Reference %s does not exist", args.reference) check_fail = True return check_fail From ee167cc91feb76b3023fefff82fff346ed721223 Mon Sep 17 00:00:00 2001 From: Adam English Date: Wed, 4 Sep 2024 11:16:25 -0400 Subject: [PATCH 11/16] fixing reference warning --- truvari/bench.py | 2 +- truvari/collapse.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/truvari/bench.py b/truvari/bench.py index 83aed73a..2b880852 100644 --- a/truvari/bench.py +++ b/truvari/bench.py @@ -147,7 +147,7 @@ def check_params(args): logging.warning("`--reference` is no longer recommended and will be deprecated by v5") if not os.path.exists(args.reference): logging.error("Reference %s does not exist", args.reference) - check_fail = True + check_fail = True return check_fail diff --git a/truvari/collapse.py b/truvari/collapse.py index a1388912..e715d924 100644 --- a/truvari/collapse.py +++ b/truvari/collapse.py @@ -570,7 +570,7 @@ def check_params(args): logging.warning("`--reference` is no longer recommended and will be deprecated by v5") if not os.path.exists(args.reference): logging.error("Reference %s does not exist", args.reference) - check_fail = True + check_fail = True return check_fail From 82330d43c1b89158dd441776017755d92e1a6ea3 Mon Sep 17 00:00:00 2001 From: Adam English Date: Sat, 7 Sep 2024 11:06:20 -0400 Subject: [PATCH 12/16] new anno chunks command --- repo_utils/answer_key/anno/chunks.bed | 11 ++++ repo_utils/sub_tests/anno.sh | 10 ++- 
truvari/anno.py | 1 + truvari/annotations/__init__.py | 2 + truvari/annotations/chunks.py | 77 +++++++++++++++++++++++ truvari/collapse.py | 88 ++++++++++++++++++++++++--- 6 files changed, 179 insertions(+), 10 deletions(-) create mode 100644 repo_utils/answer_key/anno/chunks.bed create mode 100644 truvari/annotations/chunks.py diff --git a/repo_utils/answer_key/anno/chunks.bed b/repo_utils/answer_key/anno/chunks.bed new file mode 100644 index 00000000..3a6ee6d1 --- /dev/null +++ b/repo_utils/answer_key/anno/chunks.bed @@ -0,0 +1,11 @@ +chr20 149012 149095 3 3 3 +chr20 278929 279098 7 7 7 +chr20 280210 280275 1 1 1 +chr20 306267 306268 2 2 2 +chr20 380877 380878 1 1 1 +chr20 420664 420665 2 2 2 +chr20 613782 613837 1 1 1 +chr20 641905 642391 11 11 11 +chr20 709758 709852 2 2 2 +chr20 764441 764537 3 3 3 +chr20 949515 949619 1 1 1 diff --git a/repo_utils/sub_tests/anno.sh b/repo_utils/sub_tests/anno.sh index 961c3c4d..91325985 100644 --- a/repo_utils/sub_tests/anno.sh +++ b/repo_utils/sub_tests/anno.sh @@ -171,10 +171,18 @@ if [ $test_anno_grpaf_subset ]; then assert_equal $(fn_md5 $ANSDIR/anno/anno_grpaf.subtags.vcf) $(fn_md5 $OD/anno_grpaf.subtags.vcf) fi -# +# add id run test_anno_addid \ $truv anno addid $INDIR/variants/multi.vcf.gz -o $OD/addid.vcf if [ $test_anno_addid ]; then assert_exit_code 0 assert_equal $(fn_md5 $ANSDIR/anno/addid.vcf) $(fn_md5 $OD/addid.vcf) fi + + +# chunks +run test_anno_chunks $truv anno chunks $INDIR/variants/multi.vcf.gz -o $OD/chunks.bed +if [ $test_anno_chunks ]; then + assert_exit_code 0 + assert_equal $(fn_md5 $ANSDIR/anno/chunks.bed) $(fn_md5 $OD/chunks.bed) +fi diff --git a/truvari/anno.py b/truvari/anno.py index 62ce7592..29189eda 100644 --- a/truvari/anno.py +++ b/truvari/anno.py @@ -8,6 +8,7 @@ ANNOS = { "addid": ("Set ID field", tannos.addid_main), "bpovl": ("Annotation Intersection", tannos.bpovl_main), + "chunks": ("Chunk Boundaries and Variant Counts", tannos.chunks_main), "density": ("Variant Density", tannos.density_main), "dpcnt": ("Call Depth Counts", tannos.dpcnt_main), "gcpct": ("GC Percent", tannos.gcpct_main), diff --git a/truvari/annotations/__init__.py b/truvari/annotations/__init__.py index ad9e04f2..1a7de4a5 100644 --- a/truvari/annotations/__init__.py +++ b/truvari/annotations/__init__.py @@ -1,6 +1,7 @@ """ annotation modules """ from truvari.annotations.addid import addid_main from truvari.annotations.bpovl import bpovl_main +from truvari.annotations.chunks import chunks_main from truvari.annotations.density import density_main from truvari.annotations.dpcnt import dpcnt_main from truvari.annotations.gccontent import gcpct_main @@ -18,6 +19,7 @@ __all__ = [ 'addid_main', 'bpovl_main', + 'chunks_main', 'density_main', 'dpcnt_main', 'gcpct_main', diff --git a/truvari/annotations/chunks.py b/truvari/annotations/chunks.py new file mode 100644 index 00000000..a303ca5c --- /dev/null +++ b/truvari/annotations/chunks.py @@ -0,0 +1,77 @@ +""" +Count the number of variants in each chunk +Column 3: total number of variants +Column 4: comma-deliminted number of sub-chunks after accounting for size +Column 5: comma-deliminted number of sub-chunks after accounting for size and distance again +""" +import sys +import argparse +import pysam + +import truvari +from truvari.collapse import tree_size_chunker, tree_dist_chunker + +def parse_args(args): + """ + Parse arguments + """ + parser = argparse.ArgumentParser(prog="chunks", description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument("input", type=str, + 
help="Input VCF") + parser.add_argument("-o", "--output", type=str, default="/dev/stdout", + help="Output name") + parser.add_argument("-c", "--chunksize", type=int, default=500, + help="Distance between variants to split chunks (%(default)s)") + parser.add_argument("-s", "--sizemin", type=int, default=50, + help="Minimum SV length") + parser.add_argument("-S", "--sizemax", type=int, default=50000, + help="Maximum SV length") + args = parser.parse_args(args) + truvari.setup_logging(show_version=True) + return args + +def get_bounds(cnk): + """ + Min start and max end of variants + """ + mstart = sys.maxsize + mend = 0 + for i in cnk: + mstart = min(mstart, i.start) + mend = max(mend, i.stop) + return mstart, mend + +def chunks_main(args): + """ + Main + """ + args = parse_args(args) + v = pysam.VariantFile(args.input) + m = truvari.Matcher() + m.params.pctseq = 0 + m.params.sizemin = args.sizemin + m.params.sizefilt = args.sizemin + m.params.sizemax = args.sizemax + m.params.chunksize = args.chunksize + m.params.refdist = args.chunksize + c = truvari.chunker(m, ('base', v)) + + with open(args.output, 'w') as fout: + for chunk, _ in c: + if not chunk['base']: + continue + s, e = get_bounds(chunk['base']) + chrom = chunk['base'][0].chrom + num = len(chunk['base']) + fout.write(f"{chrom}\t{s}\t{e}\t{num}") + s_cnts = [] + d_cnts = [] + for i, _ in tree_size_chunker(m, [(chunk, 0)]): + if i['base']: + s_cnts.append(len(i['base'])) + for j, _ in tree_dist_chunker(m, [(i, 0)]): + d_cnts.append(len(j['base'])) + s_cnts = ",".join(map(str, s_cnts)) + d_cnts = ",".join(map(str, d_cnts)) + fout.write(f"\t{s_cnts}\t{d_cnts}\n") diff --git a/truvari/collapse.py b/truvari/collapse.py index e715d924..068054ee 100644 --- a/truvari/collapse.py +++ b/truvari/collapse.py @@ -16,7 +16,6 @@ import pysam import numpy as np -from intervaltree import IntervalTree import truvari import truvari.bench as trubench @@ -721,6 +720,76 @@ def dump_log(self): logging.info("%d samples' FORMAT fields consolidated", self["stats_box"]["consol_cnt"]) +class LinkedList: + """ + Simple linked list which should(?) be faster than concatenating a bunch + regular lists + """ + def __init__(self, data=None): + """ + init + """ + self.head = None + self.tail = None + if data is not None: + self.append(data) + + def append(self, data): + """ + Put data onto end of list + """ + new_node = (data, None) + if not self.head: + self.head = new_node + self.tail = new_node + return + self.tail[1] = new_node + self.tail = new_node + + + def to_list(self): + """ + Turn into a regular list + """ + cur_node = self.head + ret = [] + while cur_node: + ret.append(cur_node[0]) + cur_node = cur_node[1] + return ret + + def concatenate(self, other): + """ + Combine two linked lists + """ + if not self.head: # If the first list is empty + return other + self.tail[1] = other.head + self.tail = other.tail + return self + +def merge_intervals(intervals): + """ + Merge sorted list of tuples + """ + if not intervals: + return [] + merged = [] + + current_start, current_end, current_data = intervals[0] + for i in range(1, len(intervals)): + next_start, next_end, next_data = intervals[i] + + # Should be <=, maybe. 
but this replicates intervaltree + if next_start < current_end: + current_end = max(current_end, next_end) + current_data.concatenate(next_data) + else: + # No overlap + merged.append((current_start, current_end, current_data)) + current_start, current_end, current_data = next_start, next_end, next_data + merged.append((current_start, current_end, current_data)) + return merged def tree_size_chunker(matcher, chunks): """ @@ -733,8 +802,8 @@ def tree_size_chunker(matcher, chunks): chunk_count += 1 yield chunk, chunk_count continue - tree = IntervalTree() yield {'__filtered': chunk['__filtered'], 'base': []}, chunk_count + to_add = [] for entry in chunk['base']: # How much smaller/larger would be in sizesim? sz = truvari.entry_size(entry) @@ -743,17 +812,18 @@ def tree_size_chunker(matcher, chunks): sz *= - \ 1 if truvari.entry_variant_type( entry) == truvari.SV.DEL else 1 - tree.addi(sz - diff, sz + diff, data=[entry]) - tree.merge_overlaps(data_reducer=lambda x, y: x + y) + to_add.append((sz - diff, sz + diff, LinkedList(entry))) + tree = merge_intervals(to_add) for intv in tree: chunk_count += 1 - yield {'base': intv.data, '__filtered': []}, chunk_count + yield {'base': intv[2].to_list(), '__filtered': []}, chunk_count def tree_dist_chunker(matcher, chunks): """ To reduce the number of variants in a chunk try to sub-chunk by reference distance before hand Needs to return the same thing as a chunker + This does nothing """ chunk_count = 0 for chunk, _ in chunks: @@ -761,17 +831,17 @@ def tree_dist_chunker(matcher, chunks): chunk_count += 1 yield chunk, chunk_count continue - tree = IntervalTree() yield {'__filtered': chunk['__filtered'], 'base': []}, chunk_count + to_add = [] for entry in chunk['base']: st, ed = truvari.entry_boundaries(entry) st -= matcher.params.refdist ed += matcher.params.refdist - tree.addi(st, ed, data=[entry]) - tree.merge_overlaps(data_reducer=lambda x, y: x + y) + to_add.append((st, ed, LinkedList(entry))) + tree = merge_intervals(to_add) for intv in tree: chunk_count += 1 - yield {'base': intv.data, '__filtered': []}, chunk_count + yield {'base': intv[2].to_list(), '__filtered': []}, chunk_count def collapse_main(args): From 6aac99d25657ee0691817748603489ae6bd1d4c4 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Sat, 7 Sep 2024 15:08:52 +0000 Subject: [PATCH 13/16] Update coverage score --- imgs/coverage.svg | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/imgs/coverage.svg b/imgs/coverage.svg index ea71b13a..a9915353 100644 --- a/imgs/coverage.svg +++ b/imgs/coverage.svg @@ -17,7 +17,7 @@ coverage - 90% - 90% + 89% + 89% From feda14b5fa8577a24d3f9c8fd9d7004e311ec494 Mon Sep 17 00:00:00 2001 From: Adam English Date: Sat, 7 Sep 2024 11:42:41 -0400 Subject: [PATCH 14/16] update func tests --- imgs/coverage.svg | 4 ++-- .../answer_key/collapse/input1_median_collapsed.vcf | 5 +++-- repo_utils/answer_key/collapse/input1_median_removed.vcf | 3 +-- repo_utils/sub_tests/collapse.sh | 3 +++ truvari/collapse.py | 8 +++++--- 5 files changed, 14 insertions(+), 9 deletions(-) diff --git a/imgs/coverage.svg b/imgs/coverage.svg index ea71b13a..6a91c8d4 100644 --- a/imgs/coverage.svg +++ b/imgs/coverage.svg @@ -17,7 +17,7 @@ coverage - 90% - 90% + 91% + 91% diff --git a/repo_utils/answer_key/collapse/input1_median_collapsed.vcf b/repo_utils/answer_key/collapse/input1_median_collapsed.vcf index a5af9093..f3b9734b 100644 --- a/repo_utils/answer_key/collapse/input1_median_collapsed.vcf +++ 
b/repo_utils/answer_key/collapse/input1_median_collapsed.vcf @@ -888,7 +888,7 @@ chr20 419860 . AGTGACCCTGCACCTGGCT A 60 . QNAME=HG002-S9-H2-000001F;QSTART=37411 chr20 420228 . A C 60 . QNAME=HG002-S9-H2-000001F;QSTART=374466;QSTRAND=+ GT:PL:DP 0|1:2,3,6:24,29 chr20 420465 . A AG 60 . QNAME=HG002-S9-H2-000001F;QSTART=374704;QSTRAND=+ GT:PL:DP 0|1:6,10,1:32,35 chr20 420561 . A T 60 . QNAME=HG002-S9-H1-000001F;QSTART=399939;QSTRAND=+ GT:PL:DP 1|1:5,6,7:36,12 -chr20 420665 . G GCCCACCCCATCCCCCGTCCCCATCCCCCATCCCCCGTCCCCCGTCCCCATCCCCCGTCCCCCATCTCCTGTCCCCCGTCCCCATCCCCCGTCCCCCGTCCCCCATCCCATCCCCCACCCCCATCCCCCGTCCCCCGTCCCCATCCCCCATCCCCCATCCCCCATCCCCCGTCCGCCGTCCCCCATCTCCTGTCCCCCGTCCCCCATCCCCCGTCCCCATCCCCCACC 61 . QNAME=HG002-S9-H2-000001F;QSTART=374905;QSTRAND=+;SVTYPE=INS;SVLEN=227;NumCollapsed=1;NumConsolidated=0;CollapseId=4.0;CollapseStart=420664;CollapseEnd=420665;CollapseSize=226 GT:PL 0/1:. +chr20 420665 . G GCCCACCCCATCCCCCGTCCCCATCCCCCATCCCCCGTCCCCCGTCCCCATCCCCCGTCCCCCATCTCCTGTCCCCCGTCCCCATCCCCCGTCCCCCGTCCCCCATCCCATCCCCCACCCCCATCCCCCGTCCCCCGTCCCCATCCCCCATCCCCCATCCCCCATCCCCCGTCCGCCGTCCCCCATCTCCTGTCCCCCGTCCCCCATCCCCCGTCCCCATCCCCCACC 61 . QNAME=HG002-S9-H2-000001F;QSTART=374905;QSTRAND=+;SVTYPE=INS;SVLEN=227;NumCollapsed=1;NumConsolidated=0;CollapseId=9.0;CollapseStart=420664;CollapseEnd=420665;CollapseSize=226 GT:PL 0/1:. chr20 421409 . G A 60 . QNAME=HG002-S9-H1-000001F;QSTART=401013;QSTRAND=+ GT:PL:DP 1|1:2,3,7:14,41 chr20 421527 . T C 60 . QNAME=HG002-S9-H2-000001F;QSTART=375993;QSTRAND=+ GT:PL:DP 0|1:3,8,4:49,42 chr20 422066 . A G 60 . QNAME=HG002-S9-H1-000001F;QSTART=401670;QSTRAND=+ GT:PL:DP 1|1:3,7,8:9,39 @@ -1260,7 +1260,7 @@ chr20 639104 . A AT 60 . QNAME=HG002-S9-H1-000001F;QSTART=618555;QSTRAND=+ GT:PL chr20 640046 . C T 60 . QNAME=HG002-S9-H1-000001F;QSTART=619497;QSTRAND=+ GT:PL:DP 1|0:10,10,9:11,48 chr20 640049 . C T 60 . QNAME=HG002-S9-H1-000001F;QSTART=619500;QSTRAND=+ GT:PL:DP 1|1:2,3,4:13,44 chr20 641878 . C G 60 . QNAME=HG002-S9-H1-000001F;QSTART=621329;QSTRAND=+ GT:PL:DP 1|0:8,1,7:7,13 -chr20 641913 . G GGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGA 60 . QNAME=HG002-S9-H1-000001F;QSTART=621365;QSTRAND=+;SVTYPE=INS;SVLEN=66;NumCollapsed=1;NumConsolidated=0;CollapseId=6.0;CollapseStart=642120;CollapseEnd=642121;CollapseSize=66 GT:PL:DP 1/0:7,5,7:34,13 +chr20 641913 . G GGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGA 60 . QNAME=HG002-S9-H1-000001F;QSTART=621365;QSTRAND=+;SVTYPE=INS;SVLEN=66 GT:PL:DP 1/0:7,5,7:34,13 chr20 641944 . GGA G 60 . QNAME=HG002-S9-H1-000001F;QSTART=621462;QSTRAND=+ GT:PL:DP 1|0:6,6,6:17,7 chr20 642012 . GGT G 60 . QNAME=HG002-S9-H1-000001F;QSTART=621528;QSTRAND=+ GT:PL:DP 1|0:5,7,1:6,23 chr20 642037 . T TG 60 . QNAME=HG002-S9-H1-000001F;QSTART=621551;QSTRAND=+ GT:PL:DP 1|0:4,10,10:14,47 @@ -1280,6 +1280,7 @@ chr20 642284 . A G 60 . QNAME=HG002-S9-H1-000001F;QSTART=621795;QSTRAND=+ GT:PL: chr20 642300 . G GCCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGCCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGGC 60 . QNAME=HG002-S9-H1-000001F;QSTART=621812;QSTRAND=+;SVTYPE=INS;SVLEN=408 GT:PL:DP 1/0:5,10,5:44,36 chr20 642300 . G GGC 60 . 
QNAME=HG002-S9-H2-000001F;QSTART=597225;QSTRAND=+ GT:PL:DP 0|1:4,10,8:25,7 chr20 642330 . G C 60 . QNAME=HG002-S9-H1-000001F;QSTART=622249;QSTRAND=+ GT:PL:DP 1|0:2,9,8:16,10 +chr20 642330 . G GGCCCAGCGGGGGTGGAGTTGCCTGTGGTGGGGGGCCCAGCGGGGGTGGAGTTGCCTGTGGGGGGGC 60 . QNAME=HG002-S9-H2-000001F;QSTART=597257;QSTRAND=+;SVTYPE=INS;SVLEN=66 GT:PL:DP 0/1:6,1,9:20,25 chr20 642362 . G GC 60 . QNAME=HG002-S9-H1-000001F;QSTART=622282;QSTRAND=+ GT:PL:DP 1|1:10,5,1:17,12 chr20 642391 . G GC 60 . QNAME=HG002-S9-H1-000001F;QSTART=622312;QSTRAND=+ GT:PL:DP 1|1:2,9,2:10,50 chr20 642420 . G GC 60 . QNAME=HG002-S9-H2-000001F;QSTART=597415;QSTRAND=+ GT:PL:DP 0|1:7,1,2:41,16 diff --git a/repo_utils/answer_key/collapse/input1_median_removed.vcf b/repo_utils/answer_key/collapse/input1_median_removed.vcf index 2905ef00..cccc217d 100644 --- a/repo_utils/answer_key/collapse/input1_median_removed.vcf +++ b/repo_utils/answer_key/collapse/input1_median_removed.vcf @@ -47,5 +47,4 @@ ##INFO= ##INFO= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA24385 -chr20 420665 . G GCCCACCCCATCCCCCGTCCCCATCCCCCATCCCCCGTCCCCCGTCCCCATCCCCCGTCCCCCATCTCCTGTCCCCCGTCCCCATCCCCCGTCCCCCGTCCCCCATCCCATCCCCCACCCCCATCCCCCGTCCCCCGTCCCCATCCCCCATCCCCCATCCCCATCCCCCGTCCCCCGTCCCCCATCTCCTGTCCCCCGTCCCCCATCCCCCGTCCCCATCCCCCACC 60 . QNAME=HG002-S9-H1-000001F;QSTART=400044;QSTRAND=+;SVTYPE=INS;SVLEN=226;PctSeqSimilarity=0.9956;PctSizeSimilarity=0.9956;PctRecOverlap=1;SizeDiff=1;StartDistance=0;EndDistance=0;GTMatch=.;TruScore=99;MatchId=4.0 GT:PL:DP 1/0:4,8,6:32,9 -chr20 642330 . G GGCCCAGCGGGGGTGGAGTTGCCTGTGGTGGGGGGCCCAGCGGGGGTGGAGTTGCCTGTGGGGGGGC 60 . QNAME=HG002-S9-H2-000001F;QSTART=597257;QSTRAND=+;SVTYPE=INS;SVLEN=66;PctSeqSimilarity=0.9576;PctSizeSimilarity=1;PctRecOverlap=0;SizeDiff=0;StartDistance=-417;EndDistance=-417;GTMatch=.;TruScore=65;MatchId=6.0 GT:PL:DP 0/1:6,1,9:20,25 +chr20 420665 . G GCCCACCCCATCCCCCGTCCCCATCCCCCATCCCCCGTCCCCCGTCCCCATCCCCCGTCCCCCATCTCCTGTCCCCCGTCCCCATCCCCCGTCCCCCGTCCCCCATCCCATCCCCCACCCCCATCCCCCGTCCCCCGTCCCCATCCCCCATCCCCCATCCCCATCCCCCGTCCCCCGTCCCCCATCTCCTGTCCCCCGTCCCCCATCCCCCGTCCCCATCCCCCACC 60 . 
QNAME=HG002-S9-H1-000001F;QSTART=400044;QSTRAND=+;SVTYPE=INS;SVLEN=226;PctSeqSimilarity=0.9956;PctSizeSimilarity=0.9956;PctRecOverlap=1;SizeDiff=1;StartDistance=0;EndDistance=0;GTMatch=.;TruScore=99;MatchId=9.0 GT:PL:DP 1/0:4,8,6:32,9 diff --git a/repo_utils/sub_tests/collapse.sh b/repo_utils/sub_tests/collapse.sh index 93220119..70c5d66b 100644 --- a/repo_utils/sub_tests/collapse.sh +++ b/repo_utils/sub_tests/collapse.sh @@ -74,6 +74,8 @@ if [ $test_collapse_badparams ]; then assert_exit_code 100 fi +# Lower collapse sub-chunk threshold +export COLLAP_SUB=1 run test_collapse_median $truv collapse -f $INDIR/references/reference.fa \ -i $INDIR/variants/input1.vcf.gz \ -o $OD/input1_median_collapsed.vcf \ @@ -82,6 +84,7 @@ run test_collapse_median $truv collapse -f $INDIR/references/reference.fa \ if [ $test_collapse_median ]; then collapse_assert 1_median fi +unset COLLAP_SUB run test_collapse_intragt $truv collapse -i $INDIR/variants/bcftools_merged.vcf.gz \ -o $OD/inputintragt_collapsed.vcf \ diff --git a/truvari/collapse.py b/truvari/collapse.py index 068054ee..7cd44c79 100644 --- a/truvari/collapse.py +++ b/truvari/collapse.py @@ -738,7 +738,7 @@ def append(self, data): """ Put data onto end of list """ - new_node = (data, None) + new_node = [data, None] if not self.head: self.head = new_node self.tail = new_node @@ -797,8 +797,9 @@ def tree_size_chunker(matcher, chunks): Needs to return the same thing as a chunker """ chunk_count = 0 + thresh = 1 if "COLLAP_SUB" in os.environ and os.environ["COLLAP_SUB"] == "1" else 100 for chunk, _ in chunks: - if len(chunk['base']) < 100: # fewer than 100 is fine + if len(chunk['base']) < thresh: # fewer than 100 is fine chunk_count += 1 yield chunk, chunk_count continue @@ -826,8 +827,9 @@ def tree_dist_chunker(matcher, chunks): This does nothing """ chunk_count = 0 + thresh = 1 if "COLLAP_SUB" in os.environ and os.environ["COLLAP_SUB"] == "1" else 100 for chunk, _ in chunks: - if len(chunk['base']) < 100: # fewer than 100 is fine + if len(chunk['base']) < thresh: # fewer than 100 is fine chunk_count += 1 yield chunk, chunk_count continue From a5391dd42a9d797b19db29b0dc3956857812b142 Mon Sep 17 00:00:00 2001 From: Adam English Date: Mon, 9 Sep 2024 09:39:57 -0400 Subject: [PATCH 15/16] version bump --- repo_utils/answer_key/help.txt | 2 +- truvari/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/repo_utils/answer_key/help.txt b/repo_utils/answer_key/help.txt index 07955c2f..d282d1af 100644 --- a/repo_utils/answer_key/help.txt +++ b/repo_utils/answer_key/help.txt @@ -1,6 +1,6 @@ usage: truvari [-h] CMD ... 
-Truvari v4.3.0 Structural Variant Benchmarking and Annotation +Truvari v4.3.1 Structural Variant Benchmarking and Annotation Available commands: bench Performance metrics from comparison of two VCFs diff --git a/truvari/__init__.py b/truvari/__init__.py index 1c508655..4a94883e 100644 --- a/truvari/__init__.py +++ b/truvari/__init__.py @@ -88,7 +88,7 @@ :data:`truvari.SZBINTYPE` """ -__version__ = '4.3.0' +__version__ = '4.3.1' from truvari.annotations.af_calc import ( From 0f023376acd2a3d17c100e113e01cf1ef7945d66 Mon Sep 17 00:00:00 2001 From: Adam English Date: Mon, 9 Sep 2024 09:41:43 -0400 Subject: [PATCH 16/16] freezing wiki --- docs/v4.3.1/Citations.md | 30 ++ ...-with-a-High\342\200\220Density-of-SVs.md" | 23 + docs/v4.3.1/Development.md | 90 ++++ docs/v4.3.1/Home.md | 36 ++ docs/v4.3.1/Installation.md | 72 +++ docs/v4.3.1/MatchIds.md | 74 +++ docs/v4.3.1/Multi-allelic-VCFs.md | 11 + docs/v4.3.1/Updates.md | 297 +++++++++++ docs/v4.3.1/anno.md | 494 ++++++++++++++++++ docs/v4.3.1/bench.md | 295 +++++++++++ docs/v4.3.1/collapse.md | 163 ++++++ docs/v4.3.1/consistency.md | 179 +++++++ docs/v4.3.1/divide.md | 58 ++ docs/v4.3.1/ga4gh.md | 3 + docs/v4.3.1/phab.md | 157 ++++++ docs/v4.3.1/refine.md | 142 +++++ docs/v4.3.1/segment.md | 18 + docs/v4.3.1/stratify.md | 58 ++ docs/v4.3.1/vcf2df.md | 81 +++ 19 files changed, 2281 insertions(+) create mode 100644 docs/v4.3.1/Citations.md create mode 100644 "docs/v4.3.1/Collapse-on-Regions-with-a-High\342\200\220Density-of-SVs.md" create mode 100644 docs/v4.3.1/Development.md create mode 100644 docs/v4.3.1/Home.md create mode 100644 docs/v4.3.1/Installation.md create mode 100644 docs/v4.3.1/MatchIds.md create mode 100644 docs/v4.3.1/Multi-allelic-VCFs.md create mode 100644 docs/v4.3.1/Updates.md create mode 100644 docs/v4.3.1/anno.md create mode 100644 docs/v4.3.1/bench.md create mode 100644 docs/v4.3.1/collapse.md create mode 100644 docs/v4.3.1/consistency.md create mode 100644 docs/v4.3.1/divide.md create mode 100644 docs/v4.3.1/ga4gh.md create mode 100644 docs/v4.3.1/phab.md create mode 100644 docs/v4.3.1/refine.md create mode 100644 docs/v4.3.1/segment.md create mode 100644 docs/v4.3.1/stratify.md create mode 100644 docs/v4.3.1/vcf2df.md diff --git a/docs/v4.3.1/Citations.md b/docs/v4.3.1/Citations.md new file mode 100644 index 00000000..d600b860 --- /dev/null +++ b/docs/v4.3.1/Citations.md @@ -0,0 +1,30 @@ +# Citing Truvari + +English, A.C., Menon, V.K., Gibbs, R.A. et al. Truvari: refined structural variant comparison preserves allelic diversity. Genome Biol 23, 271 (2022). https://doi.org/10.1186/s13059-022-02840-6 + +# Citations + +List of publications using Truvari. Most of these are just pulled from a [Google Scholar Search](https://scholar.google.com/scholar?q=truvari). Please post in the [show-and-tell](https://github.com/spiralgenetics/truvari/discussions/categories/show-and-tell) to have your publication added to the list. 
+* [A robust benchmark for detection of germline large deletions and insertions](https://www.nature.com/articles/s41587-020-0538-8) +* [Leveraging a WGS compression and indexing format with dynamic graph references to call structural variants](https://www.biorxiv.org/content/10.1101/2020.04.24.060202v1.abstract) +* [Duphold: scalable, depth-based annotation and curation of high-confidence structural variant calls](https://academic.oup.com/gigascience/article/8/4/giz040/5477467?login=true) +* [Parliament2: Accurate structural variant calling at scale](https://academic.oup.com/gigascience/article/9/12/giaa145/6042728) +* [Learning What a Good Structural Variant Looks Like](https://www.biorxiv.org/content/10.1101/2020.05.22.111260v1.full) +* [Long-read trio sequencing of individuals with unsolved intellectual disability](https://www.nature.com/articles/s41431-020-00770-0) +* [lra: A long read aligner for sequences and contigs](https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1009078) +* [Samplot: a platform for structural variant visual validation and automated filtering](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-021-02380-5) +* [AsmMix: A pipeline for high quality diploid de novo assembly](https://www.biorxiv.org/content/10.1101/2021.01.15.426893v1.abstract) +* [Accurate chromosome-scale haplotype-resolved assembly of human genomes](https://www.nature.com/articles/s41587-020-0711-0) +* [Accurate circular consensus long-read sequencing improves variant detection and assembly of a human genome](https://www.nature.com/articles/s41587-019-0217-9) +* [NPSV: A simulation-driven approach to genotyping structural variants in whole-genome sequencing data](https://academic.oup.com/bioinformatics/article-abstract/37/11/1497/5466452) +* [SVIM-asm: structural variant detection from haploid and diploid genome assemblies](https://academic.oup.com/bioinformatics/article/36/22-23/5519/6042701?login=true) +* [Readfish enables targeted nanopore sequencing of gigabase-sized genomes](https://www.nature.com/articles/s41587-020-00746-x) +* [stLFRsv: A Germline Structural Variant Analysis Pipeline Using Co-barcoded Reads](https://internal-journal.frontiersin.org/articles/10.3389/fgene.2021.636239/full) +* [Long-read-based human genomic structural variation detection with cuteSV](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02107-y) +* [An international virtual hackathon to build tools for the analysis of structural variants within species ranging from coronaviruses to vertebrates](https://f1000research.com/articles/10-246) +* [Paragraph: a graph-based structural variant genotyper for short-read sequence data](https://link.springer.com/article/10.1186/s13059-019-1909-7) +* [Genome-wide investigation identifies a rare copy-number variant burden associated with human spina bifida](https://www.nature.com/articles/s41436-021-01126-9) +* [TT-Mars: Structural Variants Assessment Based on Haplotype-resolved Assemblies](https://www.biorxiv.org/content/10.1101/2021.09.27.462044v1.abstract) +* [An ensemble deep learning framework to refine large deletions in linked-reads](https://www.biorxiv.org/content/10.1101/2021.09.27.462057v1.abstract) +* [MAMnet: detecting and genotyping deletions and insertions based on long reads and a deep learning approach](https://academic.oup.com/bib/advance-article-abstract/doi/10.1093/bib/bbac195/6587170)](https://academic.oup.com/bib/advance-article-abstract/doi/10.1093/bib/bbac195/6587170) +* [Automated filtering of genome-wide 
large deletions through an ensemble deep learning framework](https://www.sciencedirect.com/science/article/pii/S1046202322001712#b0110)
diff --git "a/docs/v4.3.1/Collapse-on-Regions-with-a-High\342\200\220Density-of-SVs.md" "b/docs/v4.3.1/Collapse-on-Regions-with-a-High\342\200\220Density-of-SVs.md"
new file mode 100644
index 00000000..1ea24210
--- /dev/null
+++ "b/docs/v4.3.1/Collapse-on-Regions-with-a-High\342\200\220Density-of-SVs.md"
@@ -0,0 +1,23 @@
+Some regions of the reference genome can give rise to a huge number of SVs. These regions are typically in telomeres or centromeres, especially on chm13. Furthermore, some multi-sample VCFs might contain samples with a high genetic distance from the reference, which can also create an unreasonable number of SVs. These 'high-density' regions can cause difficulties for `truvari collapse` when there are too many comparisons that need to be made.
+
+If you find `truvari collapse` 'freezing' during processing (it is no longer writing variants to the output VCFs), you should investigate whether there are regions which should be excluded from analysis. To do this, first run:
+
+```
+truvari anno chunks input.vcf.gz > counts.bed
+```
+
+The `counts.bed` will have the chromosome, start, and end position of each chunk in the VCF as well as three additional columns. The 4th column has the number of SVs inside the chunk while the 5th and 6th have comma-separated counts of the number of variants in 'sub-chunks'. These sub-chunk counts correspond to advanced separation of the variants beyond just their window. The 5th is after separating by svtype and svlen; the 6th is after re-chunking the svtype/svlen separation by distance.
+
+If you find spans in the `counts.bed` with a huge number of SVs, these are prime candidates for exclusion to speed up `truvari collapse`. To exclude them, you first subset the regions of interest and then use `bedtools` to create a bed file that will skip them.
+
+```
+# exclude regions with 30k or more SVs. You should test this threshold.
+awk '$4 >= 30000' counts.bed > to_exclude.bed
+bedtools complement -g genome.bed -i to_exclude.bed > to_analyze.bed
+
+truvari collapse --bed to_analyze.bed -i input.vcf.gz
+```
+
+When considering what threshold you would like to use, just looking at the 4th column may not be sufficient as the 'sub-chunks' may be smaller and will therefore run faster. There also is no guarantee that a region with high SV density will be slow. For example, if there are 100k SVs in a window, but they all could collapse, it would only take O(N - 1) comparisons to perform the collapse. The problems arise when the 100k SVs have few redundant SVs and therefore require O(N**2) comparisons.
+
+A conservative workflow for figuring out which regions to exclude would run `truvari collapse` without a `--bed`, wait for regions to 'freeze', and then check whether the last variant written out by `truvari collapse` is just before a high-density chunk in the `counts.bed`. That region could then be excluded and the collapsing repeated until success.
\ No newline at end of file
diff --git a/docs/v4.3.1/Development.md b/docs/v4.3.1/Development.md
new file mode 100644
index 00000000..ccb38493
--- /dev/null
+++ b/docs/v4.3.1/Development.md
@@ -0,0 +1,90 @@
+# Truvari API
+Many of the helper methods/objects are documented such that developers can reuse truvari in their own code. To see developer documentation, visit [readthedocs](https://truvari.readthedocs.io/en/latest/).
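As a small illustration of that reuse (a sketch, not an excerpt from the API docs; the input filename and the 50bp cutoff are placeholders), the same helpers Truvari's own commands rely on, such as `truvari.entry_size` and `truvari.entry_variant_type`, can be called directly on `pysam` records:

```python
import pysam
import truvari

vcf = pysam.VariantFile("example.vcf.gz")  # hypothetical input
for entry in vcf:
    size = truvari.entry_size(entry)            # SV length of the record
    svtype = truvari.entry_variant_type(entry)  # returns a truvari.SV enum
    if size >= 50:
        print(entry.chrom, entry.start, svtype, size)
```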
+ +Documentation can also be seen using +```python +import truvari +help(truvari) +``` + +# docker + +A Dockerfile exists to build an image of Truvari. To make a Docker image, clone the repository and run +```bash +docker build -t truvari . +``` + +You can then run Truvari through docker using +```bash +docker run -v `pwd`:/data -it truvari +``` +Where `pwd` can be whatever directory you'd like to mount in the docker to the path `/data/`, which is the working directory for the Truvari run. You can provide parameters directly to the entry point. +```bash +docker run -v `pwd`:/data -it truvari anno svinfo -i example.vcf.gz +``` + +If you'd like to interact within the docker container for things like running the CI/CD scripts +```bash +docker run -v `pwd`:/data --entrypoint /bin/bash -it truvari +``` +You'll now be inside the container and can run FuncTests or run Truvari directly +```bash +bash repo_utils/truvari_ssshtests.sh +truvari anno svinfo -i example.vcf.gz +``` + +# CI/CD + +Scripts that help ensure the tool's quality. Extra dependencies need to be installed in order to run Truvari's CI/CD scripts. + +```bash +pip install pylint anybadge coverage +``` + +Check code formatting with +```bash +python repo_utils/pylint_maker.py +``` +We use [autopep8](https://pypi.org/project/autopep8/) (via [vim-autopep8](https://github.com/tell-k/vim-autopep8)) for formatting. + +Test the code and generate a coverage report with +```bash +bash repo_utils/truvari_ssshtests.sh +``` + +Truvari leverages github actions to perform these checks when new code is pushed to the repository. We've noticed that the actions sometimes hangs through no fault of the code. If this happens, cancel and resubmit the job. Once FuncTests are successful, it uploads an artifact of the `coverage html` report which you can download to see a line-by-line accounting of test coverage. + +# git flow + +To organize the commits for the repository, we use [git-flow](https://danielkummer.github.io/git-flow-cheatsheet/). Therefore, `develop` is the default branch, the latest tagged release is on `master`, and new, in-development features are within `feature/` + +When contributing to the code, be sure you're working off of develop and have run `git flow init`. + +# versioning + +Truvari uses [Semantic Versioning](https://semver.org/) and tries to stay compliant to [PEP440](https://peps.python.org/pep-0440/). As of v3.0.0, a single version is kept in the code under `truvari/__init__.__version__`. We try to keep the suffix `-dev` on the version in the develop branch. When cutting a new release, we may replace the suffix with `-rc` if we've built a release candidate that may need more testing/development. Once we've committed to a full release that will be pushed to PyPi, no suffix is placed on the version. If you install Truvari from the develop branch, the git repo hash is appended to the installed version as well as '.uc' if there are un-staged commits in the repo. + +# docs + +The github wiki serves the documentation most relevant to the `develop/` branch. When cutting a new release, we freeze and version the wiki's documentation with the helper utility `docs/freeze_wiki.sh`. 
+ +# Creating a release +Follow these steps to create a release + +0) Bump release version +1) Run tests locally +2) Update API Docs +3) Change Updates Wiki +4) Freeze the Wiki +5) Ensure all code is checked in +6) Do a [git-flow release](https://danielkummer.github.io/git-flow-cheatsheet/) +7) Use github action to make a testpypi release +8) Check test release +```bash +python3 -m venv test_truvari +python3 -m pip install --index-url https://test.pypi.org/simple --extra-index-url https://pypi.org/simple/ truvari +``` +9) Use GitHub action to make a pypi release +10) Download release-tarball.zip from step #9’s action +11) Create release (include #9) from the tag +12) Checkout develop and Bump to dev version and README ‘commits since’ badge \ No newline at end of file diff --git a/docs/v4.3.1/Home.md b/docs/v4.3.1/Home.md new file mode 100644 index 00000000..e76a3cb3 --- /dev/null +++ b/docs/v4.3.1/Home.md @@ -0,0 +1,36 @@ +The wiki holds documentation most relevant for develop. For information on a specific version of Truvari, see [`docs/`](https://github.com/spiralgenetics/truvari/tree/develop/docs) + +Citation: +English, A.C., Menon, V.K., Gibbs, R.A. et al. Truvari: refined structural variant comparison preserves allelic diversity. Genome Biol 23, 271 (2022). https://doi.org/10.1186/s13059-022-02840-6 + +# Before you start +VCFs aren't always created with a strong adherence to the format's specification. + +Truvari expects input VCFs to be valid so that it will only output valid VCFs. + +We've developed a separate tool that runs multiple validation programs and standard VCF parsing libraries in order to validate a VCF. + +Run [this program](https://github.com/acenglish/usable_vcf) over any VCFs that are giving Truvari trouble. + +Furthermore, Truvari expects 'resolved' SVs (e.g. DEL/INS) and will not interpret BND signals across SVTYPEs (e.g. combining two BND lines to match a DEL call). A brief description of Truvari bench methodology is linked below. + +Finally, Truvari does not handle multi-allelic VCF entries and as of v4.0 will throw an error if multi-allelics are encountered. Please use `bcftools norm` to split multi-allelic entries. + +# Index + +- [[Updates|Updates]] +- [[Installation|Installation]] +- Truvari Commands: + - [[anno|anno]] + - [[bench|bench]] + - [[collapse|collapse]] + - [[consistency|consistency]] + - [[divide|divide]] + - [[ga4gh|ga4gh]] + - [[phab|phab]] + - [[refine|refine]] + - [[segment|segment]] + - [[stratify|stratify]] + - [[vcf2df|vcf2df]] +- [[Development|Development]] +- [[Citations|Citations]] \ No newline at end of file diff --git a/docs/v4.3.1/Installation.md b/docs/v4.3.1/Installation.md new file mode 100644 index 00000000..a8aae743 --- /dev/null +++ b/docs/v4.3.1/Installation.md @@ -0,0 +1,72 @@ +Recommended +=========== +For stable versions of Truvari, use pip +``` +python3 -m pip install truvari +``` +Specific versions can be installed via +``` +python3 -m pip install truvari==3.2.0 +``` +See [pypi](https://pypi.org/project/Truvari/#history) for a history of all distributed releases. + +Manual Installation +=================== +To build Truvari directly, clone the repository and switch to a specific tag. +``` +git clone https://github.com/ACEnglish/truvari.git +git checkout tags/v3.0.0 +python3 -m pip install . 
+``` + +To see a list of all available tags, run: +``` +git tag -l +``` + +If you have an older clone of the repository and don't see the version you're looking for in tags, make sure to pull the latest changes: +``` +git pull +git fetch --all --tags +``` + +Mamba / Conda +============= +NOTE!! There is a very old version of Truvari on bioconda that - for unknown reasons - supersedes the newer, supported versions. Users may need to specify to conda which release to build. See [this ticket](https://github.com/ACEnglish/truvari/issues/130#issuecomment-1196607866) for details. + +Truvari releases are automatically deployed to bioconda. +Users can follow instructions here (https://mamba.readthedocs.io/en/latest/installation.html) to install mamba. (A faster alternative conda compatible package manager.) + +Creating an environment with Truvari and its dependencies. +``` +mamba create -c conda-forge -c bioconda -n truvari truvari +``` + +Alternatively, see the [conda page](https://anaconda.org/bioconda/truvari) for details +``` +conda install -c bioconda truvari +``` + +Building from develop +===================== +The default branch is `develop`, which holds in-development changes. This is for developers or those wishing to try experimental features and is not recommended for production. Development is versioned higher than the most recent stable release with an added suffix (e.g. Current stable release is `3.0.0`, develop holds `3.1.0-dev`). If you'd like to install develop, repeat the steps above but without `git checkout tags/v3.0.0`. See [wiki](https://github.com/spiralgenetics/truvari/wiki/Development#git-flow) for details on how branching is handled. + +Docker +====== +See [Development](https://github.com/spiralgenetics/truvari/wiki/Development#docker) for details on building a docker container. + +edlib error +=========== +Some environments have a hard time installing edlib due to a possible cython bug ([source](https://bugs.launchpad.net/ubuntu/+source/cython/+bug/2006404)). If you seen an error such as the following: +``` +edlib.bycython.cpp:198:12: fatal error: longintrepr.h: No such file or directory + 198 | #include "longintrepr.h" +``` + +One method to prevent the error message is to run the following commands ([source](https://github.com/Martinsos/edlib/issues/212)) +```bash +python3 -m venv my_env +source my_env/bin/activate +python3 -m pip install --no-cache-dir cython setuptools wheel +EDLIB_USE_CYTHON=1 python3 -m pip install truvari --no-cache +``` \ No newline at end of file diff --git a/docs/v4.3.1/MatchIds.md b/docs/v4.3.1/MatchIds.md new file mode 100644 index 00000000..ca52076f --- /dev/null +++ b/docs/v4.3.1/MatchIds.md @@ -0,0 +1,74 @@ +MatchIds are used to tie base/comparison calls together in post-processing for debugging or other exploring. MatchIds have a structure of `{chunkid}.{callid}`. The chunkid is unique id per-chunk of calls. All calls sharing chunkid were within `--chunksize` distance and were compared. The callid is unique to a call in a chunk for each VCF. Because `bench` processes two VCFs (the base and comparison VCFs), the `MatchId` has two values: the first is the base variant's MatchId and the second the comparison variant's MatchId. + +For `--pick single`, the two MatchIds will be identical in the e.g. tp-base.vcf.gz and tp-comp.vcf.gz. However, for `--pick ac|multi`, it's possible to have cases such as one base variant matching to multiple comparison variants. 
That would give us MatchIds like: + +``` +# tp-base.vcf +MatchId=4.0,4.1 + +# tp-comp.vcf +MatchId=4.0,4.1 +MatchId=4.0,4.2 +``` + +This example tells us that the tp-comp variants are both pointing to `4.0` in tp-base. The tp-base variant has a higher match to the tp-comp `4.1` variant. + +One easy way to combine matched variants is to use `truvari vcf2df` to convert a benchmarking result to a pandas DataFrame and leverage pandas' merge operation. First, we convert the `truvari bench` result. + +```bash +truvari vcf2df --info --bench-dir bench_result/ data.jl +``` + +Next, we combine rows of matched variants: +```python +import joblib +import pandas as pd + +# Load the data +data = joblib.load("data.jl") + +# Separate out the variants from the base VCF and add new columns of the base/comp ids +base = data[data['state'].isin(['tpbase', 'fn'])].copy() +base['base_id'] = base['MatchId'].apply(lambda x: x[0]) +base['comp_id'] = base['MatchId'].apply(lambda x: x[1]) + +# Separate out the variants from the comparison VCF and add new columns of the base/comp ids +comp = data[data['state'].isin(['tp', 'fp'])].copy() +comp['base_id'] = comp['MatchId'].apply(lambda x: x[0]) +comp['comp_id'] = comp['MatchId'].apply(lambda x: x[1]) + +# Merge the base/comparison variants +combined = pd.merge(base, comp, left_on='base_id', right_on='comp_id', suffixes=('_base', '_comp')) + +# How many comp variants matched to multiple base variants? +counts1 = combined['base_id_comp'].value_counts() +print('multi-matched comp count', (counts1 != 1).sum()) + +# How many base variants matched to multiple comp variants? +counts2 = combined['comp_id_base'].value_counts() +print('multi-matched base count', (counts2 != 1).sum()) +``` + +The `MatchId` is also used by `truvari collapse`. However there are two differences. First, in the main `collapse` output, the relevant INFO field is named `CollapsedId`. Second, because collapse only has a single input VCF, it is much easier to merge DataFrames. To merge collapse results kept variants with those that were removed, we again need to convert the VCFs to DataFrames: + +```bash +truvari vcf2df -i kept.vcf.gz kept.jl +truvari vcf2df -i removed.vcf.gz remov.jl +``` + +Then we combine them: +```python +import joblib +import pandas as pd + +# Load the kept variants and set the index. +kept = joblib.load("kept.jl").set_index('CollapseId') + +# Load the removed variants and set the index. +remov = joblib.load("remov.jl") +remov['CollapseId'] = remov['MatchId'].apply(lambda x: x[0]) +remov.set_index('CollapseId', inplace=True) + +# Join the two sets of variants +result_df = kept.join(remov, how='right', rsuffix='_removed') +``` \ No newline at end of file diff --git a/docs/v4.3.1/Multi-allelic-VCFs.md b/docs/v4.3.1/Multi-allelic-VCFs.md new file mode 100644 index 00000000..fd0eb23e --- /dev/null +++ b/docs/v4.3.1/Multi-allelic-VCFs.md @@ -0,0 +1,11 @@ +Truvari only compares the first alternate allele in VCFs. If a VCF contains multi-allelic sites such as: + +``` +chr2 1948201 . T TACAACACGTACGATCAGTAGAC,TCAACACACAACACGTACGATCAGTAGAC .... 
+``` + +Then pre-process the VCFs with bcftools: + +```bash +bcftools norm -m-any base_calls.vcf.gz | bgzip > base_calls_split.vcf.gz +``` \ No newline at end of file diff --git a/docs/v4.3.1/Updates.md b/docs/v4.3.1/Updates.md new file mode 100644 index 00000000..1cb41cfb --- /dev/null +++ b/docs/v4.3.1/Updates.md @@ -0,0 +1,297 @@ +# Truvari 4.3.1 +*September 9, 2024* +* `bench` + * Correctly filtering `ALT=*` alleles ([details](https://github.com/ACEnglish/truvari/discussions/219)) and monomorphic reference + * including test coverage this time +* `stratify` + * Default behavior is to count variants within ([#221](https://github.com/ACEnglish/truvari/issues/221)) +* `collapse` + * Faster sub-chunking operations by dropping use of pyintervaltree +* `anno chunks` + * New command for identifying windows with a high number of SVs ([details](https://github.com/ACEnglish/truvari/wiki/Collapse-on-Regions-with-a-High%E2%80%90Density-of-SVs)) + + +# Truvari 4.3.0 +*July 31, 2024* +* `refine` & `stratify` + * Fixed variant and bed boundary overlapping issue +* general + * Definition of variants within a region now includes replacement style from TR callers having an anchor base 1bp upstream of catalog/includebed regions + * Propagating MAFFT errors ([#204](https://github.com/ACEnglish/truvari/issues/204)) + * FIPS compliance ([#205](https://github.com/ACEnglish/truvari/issues/205)) + * Allow csi-indexed vcf files ([#209](https://github.com/ACEnglish/truvari/issues/209)) + * bcftools sort tempfile ([#213](https://github.com/ACEnglish/truvari/issues/213)) + +# Truvari 4.2.2 +*March 28, 2024* + +* `collapse` + * Fewer comparisons needed per-chunk on average + * Fixed `--chain` functionality ([details](https://github.com/ACEnglish/truvari/issues/196)) + * Fixed `--gt` consolidation of format fields +* `bench` + * Faster result at the cost of less complete annotations with `--short` flag +* `refine` + * Assures variants are sequence resolved before incorporating into consensus + * `bench --passonly --sizemax` parameters are used when building consensus for a region. Useful for `refine --use-original-vcfs` + * When a refined region has more than 5k variants, it is skipped and a warning of the region is written to the log + * Flag `--use-region-coords` now expands `--region` coordinates by 100bp (`phab --buffer` default) to allow variants to harmonize out of regions. +* general + * Dynamic bed/vcf parsing tries to choose faster of streaming/fetching variants + +# Truvari 4.2.1 +*February 6, 2024* +* `collapse` + * Faster handling of genotype data for `--gt` and `--keep common` +* general + * Fix to bed end position bug for including variants ([details](https://github.com/ACEnglish/truvari/issues/193)) + * Fix to Dockerfile +* `refine` + * Changes to `--recount` that accompany the fix to bed end positions. 
+* New command `ga4gh` to convert Truvari results into GA4GH truth/query VCFs with intermediates tags + +# Truvari 4.2 +*January 12, 2024* +* `collapse` + * New parameter `--gt` disallows intra-sample events to collapse ([details](https://github.com/ACEnglish/truvari/wiki/collapse#--gt)) + * New parameter `--intra` for consolidating SAMPLE information during intra-sample collapsing ([details](https://github.com/ACEnglish/truvari/wiki/collapse#--intra)) + * Preserve phasing information when available + * Faster O(n-1) algorithm instead of O(n^2) + * Faster sub-chunking strategy makes smaller chunks of variants needing fewer comparisons + * Fixed rare non-determinism error in cases where multiple variants are at the same position and equal qual/ac could be ordered differently. +* `phab` + * Correct sample handling with `--bSamples` `--cSamples` parameters + * Faster generation of consensus sequence + * Resolved 'overlapping' variant issue causing variants to be dropped + * New `poa` approach to harmonization. Faster than mafft but less accurate. Slower than wfa but more accurate. +* `bench` + * New, easier `MatchId` field to track which baseline/comparison variants match up [details](https://github.com/ACEnglish/truvari/wiki/MatchIds) + * `entry_is_present` method now considers partial missing variants (e.g. `./1`) as present + * Removed the 'weighted' metrics from `summary.json` +* `consistency` + * Fixed issue with counting duplicate records + * Added flag to optionally ignore duplicate records +* `anno svinfo` now overwrites existing SVLEN/SVTYPE info fields +* general + * Reduced fn matches for unroll sequence similarity by reporting maximum of multiple manipulations of variant sequence (roll up/down/none). Comes at a small, but reasonable, expense of some more fp matches. + * Bump pysam version + * Fixed bug in `unroll` sequence similarity that sometimes rolled from the wrong end + * Fixed bug for handling of None in ALT field + * `truvari.compress_index_vcf` forces overwriting of tabix index to prevent annoying crashes + + +# Truvari 4.1 +*August 7, 2023* + +* `bench` + * Creates `candidate.refine.bed` which hooks into `refine` on whole-genome VCFs [details](https://github.com/ACEnglish/truvari/wiki/bench#refining-bench-output) + * `--recount` for correctly assessing whole-genome refinement results + * experimental 'weighted' summary metrics [details](https://github.com/ACEnglish/truvari/wiki/bench#weighted-performance) + * Unresolved SVs (e.g. `ALT == `) are filtered when `--pctseq != 0` +* `phab` + * ~2x faster via reduced IO from operating in stages instead of per-region + * Removed most external calls (e.g. 
samtools doesn't need to be in the environment anymore) + * new `--align wfa` allows much faster (but slightly less accurate) variant harmonization + * increased determinism of results [detals](https://github.com/ACEnglish/truvari/commit/81a9ab85b91b0c530f9faeedfa4e7e0d68a5e8c2) +* `refine` + * Faster bed file intersection of `--includebed` and `--regions` + * Refine pre-flight check + * Correct refine.regions.txt end position from IntervalTree correction + * Better refine region selection with `--use-original` + * `--use-includebed` switched to `--use-region-coords` so that default behavior is to prefer the includebed's coordinates + * `--use-original-vcfs` to use the original pre-bench VCFs + * `refine.variant_summary.json` is cleaned of uninformative metrics +* `stratify` + * parallel parsing of truvari directory to make processing ~4x faster +* `msa2vcf` Fixed REPL decomposition bug to now preserve haplotypes +* `anno grpaf` - expanded annotation info fields +* `anno density` - new parameter `--stepsize` for sliding windows +* `collapse` + * New optional `--median-info` fields [#146](https://github.com/ACEnglish/truvari/issues/146) +* Minor updates + * Fix some `anno` threading on macOS [#154](https://github.com/ACEnglish/truvari/issues/154) + * Monomorphic/multiallelic check fix in `bench` + * `PHAB_WRITE_MAFFT` environment variable to facilitate updating functional test answer key + * Slightly slimmer docker container + +# Truvari 4.0 +*March 13, 2023* + +As part of the GIAB TR effort, we have made many changes to Truvari's tooling to enable comparison of variants in TR regions down to 5bp. Additionally, in order to keep Truvari user friendly we have made changes to the UI. Namely, we've updated some default parameters, some command-line arguments, and some outputs. There are also a few new tools and how a couple of tools work has changed. Therefore, we decided to bump to a new major release. If you're using Truvari in any kind of production capacity, be sure to test your pipeline before moving to v4.0. + +* New `refine` command for refining benchmarking results. [Details](refine) +* `bench` + * [Unroll](bench#unroll) is now the default sequence comparison approach. + * New `--pick` parameter to control the number of matches a variant can participate in [details](bench#controlling-the-number-of-matches) + * The `summary.txt` is now named `summary.json` + * Outputs parameters to `params.json` + * Output VCFs are sorted, compressed, and indexed + * Ambiguous use of 'call' in outputs corrected to 'comp' (e.g. `tp-call.vcf.gz` is now `tp-comp.vcf.gz`) + * Renamed `--pctsim` parameter to `--pctseq` + * Fixed bug where FP/FN weren't getting the correct, highest scoring match reported + * Fixed bug where `INFO/Multi` wasn't being properly applied + * Fixed bug where variants spanning exactly one `--includebed` region were erroneously being counted. + * Removed parameters: `--giabreport`, `--gtcomp`,`--multimatch`, `--use-lev`, `--prog`, `--unroll` +* `collapse` + * Renamed `--pctsim` parameter to `--pctseq` + * Runtime reduction by ~40% with short-circuiting during `Matcher.build_match` + * Better output sorting which may allow pipelines to be a little faster. +* `vcf2df` + * More granular sizebins for `[0,50)` including better handling of SNPs + * `--multisample` is removed. Now automatically add all samples with `--format` + * key index column removed and replaced by chrom, start, end. Makes rows easier to read and easier to work with e.g. 
pyranges +* `anno` + * Simplified ui. Commands that work on a single VCF and can stream (stdin/stdout) no longer use `--input` but a positional argument. + * Added `addid` +* `consistency` + * Slight speed improvement + * Better json output format +* `segment` + * Added `--passonly` flag + * Changed UI, including writing to stdout by default + * Fixed END and 1bp DEL bugs, now adds N to segmented variants' REF, and info fields SVTYPE/SVLEN +* API + * Began a focused effort on improving re-usability of Truvari code. + * Entry point to run benchmarking programmatically with [Bench object](https://truvari.readthedocs.io/en/latest/truvari.html#bench). + * Better development version tracking. [details](https://github.com/ACEnglish/truvari/commit/4bbf8d9a5be3b6a3f935afbd3a9b323811b676a0) + * Improved developer documentation. See [readthedocs](https://truvari.readthedocs.io/) +* general + * msa2vcf now left-trims and decomposes variants into indels + * Functional tests reorganization + * Fix for off-by-one errors when using pyintervaltree. See [ticket](https://github.com/ACEnglish/truvari/issues/137) + * Removed progressbar and Levenshtein dependencies as they are no longer used. + +# Truvari 3.5 +*August 27, 2022* + +* `bench` + * `--dup-to-ins` flag automatically treats SVTYPE==DUP as INS, which helps compare some programs/benchmarks + * New `--unroll` sequence comparison method for `bench` and `collapse` ([details](bench#unroll)) +* Major `anno trf` refactor (TODO write docs) including: + * annotation of DEL is fixed (was reporting the ALT copy numbers, not the sample's copy numbers after incorporating the ALT + * allow 'denovo' annotation by applying any TRF annotations found, not just those with corresponding annotations +* New `anno grpaf` annotates vcf with allele frequency info for groups of samples +* New `phab` for variant harmonization ([details](../phab)) +* backend + * `truvari.entry_size` returns the length of the event in the cases where len(REF) == len(ALT) (e.g. SNPs entry_size is 1) + * New key utility for `truvari.build_anno_trees` +* general + * Float metrics written to the VCF (e.g. PctSizeSimilarity) are rounded to precision of 4 + * Nice colors in some `--help` with [rich](https://github.com/Textualize/rich/) +* `divide` + * output shards are now more easily sorted (i.e. `ls divide_result/*.vcf.gz` will return the shards in the order they were made) + * compression/indexing of sub-VCFs in separate threads, reducing runtime +* user issues + * Monomorphic reference ALT alleles no longer throw an error in `bench` ([#131](https://github.com/ACEnglish/truvari/issues/131)) + * `SVLEN Number=A` fix ([#132](https://github.com/ACEnglish/truvari/issues/132)) + +# Truvari 3.4 +*July 7, 2022* + +* Improved performance of `consistency` (see [#127](https://github.com/ACEnglish/truvari/pull/127)) +* Added optional json output of `consistency` report +* Allow GT to be missing, which is allowed by VCF format specification +* TRF now uses `truvari.entry_variant_type` instead of trying to use `pysam.VariantRecord.info["SVLEN"]` +directly which allows greater flexibility. +* vcf2df now parses fields with `Number=\d` (e.g. 2+), which is a valid description +* `truvari.seqsim` is now case insensitive (see [#128](https://github.com/ACEnglish/truvari/issues/128)) +* Collapse option to skip consolidation of genotype information so kept alleles are unaltered +* `truvari anno dpcnt --present` will only count the depths of non ./. 
variants +* New collapse annotation `NumConsolidate` records how many FORMATs were consolidated +* Official [conda](https://anaconda.org/bioconda/truvari) support + +# Truvari 3.3 +*May 25, 2022* + +* New utilities `vcf_ranges` and `make_temp_filename` +* New annotations `dpcnt` and `lcr` +* Fixed a bug in `truvari collapse --keep` that prevented the `maxqual` or `common` options from working +* Increased determinism for `truvari collapse` so that in cases of tied variant position the longer allele is returned. If the alleles also have the same length, they are sorted alphabetically by the REF +* New `truvari bench --extend` functionality. See [discussion](https://github.com/ACEnglish/truvari/discussions/99) for details + +# Truvari 3.2 +*Apr 1, 2022* + +* Removed `truvari.copy_entry` for `pysam.VariantRecord.translate` a 10x faster operation +* Faster `truvari collapse` ([@c8b319b](https://github.com/ACEnglish/truvari/commit/c8b319b0e717a9e342f52e4a5e927f154eeb0e4a)) +* When building `MatchResult` between variants with shared start/end positions, we save processing work by skipping haplotype creation and just compare REFs/ALTs directly. +* Updated documentation to reference the paper https://doi.org/10.1101/2022.02.21.481353 +* New `truvari anno density` for identifying regions with 'sparse' and 'dense' overlapping SVs ([details](https://github.com/spiralgenetics/truvari/wiki/anno#truvari-anno-density)) +* Better `bench` genotype reporting with `summary.txt` having a `gt_matrix` of Base GT x Comp GT for all Base calls' best, TP match. +* New `truvari anno bpovl` for intersecting against tab-delimited files ([details](https://github.com/spiralgenetics/truvari/wiki/anno#truvari-anno-bpovl)) +* New `truvari divide` command to split VCFs into independent parts ([details](https://github.com/ACEnglish/truvari/wiki/divide)) +* Replaced `--buffer` parameter with `--minhaplen` for slightly better matching specificity +* Bugfix - `truvari anno trf` no longer duplicates entries spanning multple parallelization regions +* Bugfix - `collapse` MatchId/CollapseId annotation wasn't working +* Bugfixes - from [wwliao](https://github.com/wwliao) ([@4dd9968](https://github.com/ACEnglish/truvari/commit/4dd99683912236f433166889bb0b5667e9fa936d) [@ef2cfb3](https://github.com/ACEnglish/truvari/commit/ef2cfb366b60a5af4671d65d3ed987b08da72227)) +* Bugfixes - Issues [#107](https://github.com/ACEnglish/truvari/issues/107), [#108](https://github.com/ACEnglish/truvari/issues/108) + +# Truvari 3.1 +*Dec 22, 2021* + +* `bench` now annotates FPs by working a little differently. See [[bench|bench#methodology]] for details. +* Recalibrated TruScore and new reciprocal overlap measurement for sequence resolved `INS` ([details](https://github.com/spiralgenetics/truvari/discussions/92)) +* Match objects are now usable via the SDK. See [#94](https://github.com/spiralgenetics/truvari/discussions/94) for an example of using Truvari programmatically +* `file_zipper` VCF iteration strategy (`GenomeTree` -> `RegionVCFIterator`) that improves speed, particularly when using `--includebed` +* `collapse` refactored to use Match object and for prettier code, cleaner output. +* `anno remap` now optionally adds `INFO` field of the location of the top N hits. +* An experimental tool `truvari segment` added to help SV association analysis. +* `vcf2df` now supports pulling `FORMAT` fields from multiple samples. 
+* `vcf2df` now adds `('_ref', '_alt')`, or `('_ref', '_het', '_hom')` for `INFO,Number=[R|G]` fields, respectively. +* Improved documentation, including http://truvari.readthedocs.io/ for developers. +* Increasing/diversifying test coverage exposed minor bugs which were fixed. +* `bench --no-ref --cSample` bug fixes. +* Minor usability feature implemented in `help_unknown_cmd`. + +# Truvari 3.0 +*Sep 15, 2021* + +As Truvari's adoption and functionality grows, we decided to spend time working on sustainability and performance of the tool. Multiple [Actions](https://github.com/spiralgenetics/truvari/actions) for CI/CD have been added. Many components have been refactored for speed, and other 'cruft' code has been removed. Some of these changes (particularly the switch to using edlib for sequence similarity) affects the results. Therefore, we've bumped to a new major release version. + +* Working on speed improvements +* Added edlib as the default when calculating pctseq_sim, keeping Levenstein as an option (`--use-lev`). +* `truvari bench` summary's gt_precision/gt_recall are replaced by gt_concordance, which is just the percent of TP-comp calls with a concordant genotype. `--no-ref` has better functionality. `--giabreport` is different. +* Added `—keep common` to `truvari collapse`, which allows one to choose to keep the allele with the highest MAC. +* `truvari collapse --hap` wasn't working correctly. The assumptions about the calls being phased wasn't being +properly used (e.g. don't collapse 1|1) and the NumCollapsed was being populated before the single-best +match was chosen. The latter is a reporting problem, but the former had an effect on the results with +~3% of collapsed calls being mis-collapsed. +* `truvari anno trf` is now faster and simpler in its approach and whats reported.. and hopefully more useful. +* `truvari anno grm` has min_size and regions arguments added. +* truv2df has become `truvari vcf2df` where the default is vcf conversion with options to run on a `truvari bench` output directory. It also allows a specific sample to be parsed with `--format` and better Number=A handling. +* NeighId added to `truvari anno numneigh`, which works like bedtools cluster. +* The method af_calc now makes MAC/AC. +* Added 'partial' to `truvari anno remap`. +* Added `truvari anno svinfo`. +* Removed `truvari stats` as `truvari vcf2df` is better and began building [community-driven summaries](https://github.com/spiralgenetics/truvari/discussions/categories/vcf2df-recipes). +* Ubiquitous single version. +* Added a Dockerfile and instructions for making a Truvari docker container. +* Code and repository cleaning. +* Github actions for automated pylint, testing, and releases to pypi. +* Preserving per-version documentation from the wiki in `docs/`. + + +# Truvari 2.1 +*Jan 27, 2021* + +We've expanded and improved Truvari's [annotations](https://github.com/spiralgenetics/truvari/wiki/anno). We've added an [SV "collapsing" tool](https://github.com/spiralgenetics/truvari/wiki/collapse). And we've added a way to [turn VCFs into pandas DataFrames](https://github.com/spiralgenetics/truvari/wiki/truv2df) easily for downstream analysis/QC. + +# Truvari 2.0 +*May 14, 2020* + +After performing a drastic code refactor, we were able to create several helper methods from Truvari's core functionality around SV comparisons and VCF manipulations. This reusable code gave us an opportunity to create tools relevant for SV analysis. 
+ +Truvari now contains multiple subcommands. In addition to the original benchmarking functionality (`truvari bench`), Truvari can generate SV relevant summary statistics, compute consistency of calls within VCFs, and we've begun to develop annotations for SVs. Details on these tools are on the [WIKI](https://github.com/spiralgenetics/truvari/wiki). + +We are committed to continually improving Truvari with the hopes of advancing the study and analysis of structural variation. + +# Truvari 1.3 +*September 25th, 2019* + +Truvari has some big changes. In order to keep up with the o deement of Python 2.7 https://pythonclock.org/ +We're now only supporting Python 3. + +Additionally, we now package Truvari so it and its dependencies can be installed directly. See Installation +below. This will enable us to refactor the code for easier maintenance and reusability. + +Finally, we now automatically report genotype comparisons in the summary stats. \ No newline at end of file diff --git a/docs/v4.3.1/anno.md b/docs/v4.3.1/anno.md new file mode 100644 index 00000000..55835902 --- /dev/null +++ b/docs/v4.3.1/anno.md @@ -0,0 +1,494 @@ + +Truvari annotations: +* [gcpct](anno#truvari-anno-gcpct) - GC Percent +* [gtcnt](anno#truvari-anno-gtcnt) - Genotype Counts +* [trf](anno#truvari-anno-trf) - Tandem Repeats +* [grm](anno#truvari-anno-grm) - Mappability +* [repmask](anno#truvari-anno-repmask) - Repeats +* [remap](anno#truvari-anno-remap) - Allele Remapping +* [hompct](anno#truvari-anno-hompct) - Homozygous Percent +* [numneigh](anno#truvari-anno-numneigh) - Number of Neighbors +* [svinfo](anno#truvari-anno-svinfo) - SVINFO Fields +* [bpovl](anno#truvari-anno-bpovl) - Annotation Intersection +* [density](anno#truvari-anno-density) - Call Density +* [dpcnt](anno#truvari-anno-dpcnt) - Depth (DP) and Alt-Depth (AD) Counts +* [lcr](anno#truvari-anno-lcr) - Low-complexity Regions +* [grpaf](anno#truvari-anno-grpaf) - Sample Group Allele-Frequency Annotations + +# truvari anno gcpct + +This will add an INFO tag `GCPCT` to each element in a VCF of the GC percent of the call's sequence. + +For deletions, this is the GC percent of the reference range of the call. For insertions, the ALT sequence is analyzed. +``` +usage: gcpct [-h] [-o OUTPUT] -r REFERENCE [input] + +Annotates GC content of SVs + +positional arguments: + input VCF to annotate (stdin) + +options: + -h, --help show this help message and exit + -o OUTPUT, --output OUTPUT + Output filename (stdout) + -r REFERENCE, --reference REFERENCE + Reference fasta +``` + +# truvari anno gtcnt +This will add an INFO tag `GTCNT` to each element in a VCF with the count of genotypes found across all samples. The value is a list of Counts of genotypes for the allele across all samples (UNK, REF, HET, HOM). This is most useful for pVCFs. + +``` +usage: gtcnt [-h] [-o OUTPUT] [input] + +Annotates GTCounts of alleles + +positional arguments: + input VCF to annotate (stdin) + +options: + -h, --help show this help message and exit + -o OUTPUT, --output OUTPUT + Output filename (stdout) +``` + +# truvari anno trf +Adds a tandem repeat annotation to sequence resolved Insertion/Deletion variants a VCF. 
+
+### Annotations added
+| Field Name | Description |
+|------------|-------------------------------------------------------------|
+| TRF | Entry hits a tandem repeat region |
+| TRFdiff | ALT TR copy difference from reference |
+| TRFrepeat | Repeat motif |
+| TRFovl | Percent of ALT covered by TRF annotation |
+| TRFstart | Start position of discovered repeat |
+| TRFend | End position of discovered repeat |
+| TRFperiod | Period size of the repeat |
+| TRFcopies | Number of copies aligned with the consensus pattern |
+| TRFscore | Alignment score |
+| TRFentropy | Entropy measure |
+| TRFsim | Similarity of ALT sequence to generated motif faux sequence |
+
+### Details
+Given a set of tandem repeat regions and a VCF, this annotates the tandem repeat motif and copy number change of insertions and deletions as expansions/contractions. The expected input catalog of tandem repeats is from a subset of columns in the Adotto TR catalog ([link](https://github.com/ACEnglish/adotto/blob/main/regions/DataDescription.md)). This file can be formatted for `truvari anno trf` via:
+```bash
+zcat adotto_TRregions_v1.1.bed.gz | cut -f1-3,18 | bgzip > anno.trf.bed.gz
+tabix anno.trf.bed.gz
+```
+For deletions, the tool simply finds the motif annotation with the highest overlap over the variant's boundaries. It then removes that sequence from the reference and calculates how many copies of the motif are removed with the formula `round(-(ovl_pct * svlen) / anno["period"], 1)`. If a deletion overlaps multiple motifs, the highest scoring motif is chosen based on higher reciprocal overlap percent first and TRF score second (see [code](https://github.com/ACEnglish/truvari/blob/2219f52850252c18dcd8c679da6644bb1cee5b68/truvari/annotations/trf.py#L29)).
+
+For insertions, by default the tool first tries to estimate which motif is contained in the alternate sequence. For each overlapping annotation, the copy number difference of the motif is calculated via `copy_diff = len(entry.alts[0][1:]) / anno["period"]`. Next, a 'faux sequence' is made from `copy_diff` copies of the motif. If the similarity of that sequence to the insertion sequence is above the `--motif-similarity` threshold, that is considered the insertion's motif. If no estimate is above the `--motif-similarity`, the insertion is incorporated into the reference and TRF is run. If the discovered TRF hits match a motif in the tandem repeat regions file, that annotation is used. If the highest scoring TRF hit doesn't match the tandem repeats region file, the nominally de novo annotation is added to the insertion's vcf entry.
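The insertion heuristic described above is compact enough to sketch. The snippet below is illustrative only, not an excerpt from `trf.py`: the `anno` dictionary keys (`"period"`, `"repeat"`) and the use of `truvari.seqsim` as the similarity measure are assumptions made for the example.

```python
import truvari

def estimate_ins_motif(entry, annos, motif_similarity=0.9):
    """Guess which overlapping catalog motif an insertion expands, if any."""
    alt_seq = entry.alts[0][1:]  # drop the anchor base, as in the copy_diff formula
    for anno in annos:  # catalog rows overlapping this entry (assumed dicts)
        copy_diff = len(alt_seq) / anno["period"]
        # build a 'faux' sequence from roughly copy_diff copies of the motif
        faux = anno["repeat"] * max(1, round(copy_diff))
        if truvari.seqsim(alt_seq, faux) >= motif_similarity:
            return anno["repeat"], round(copy_diff, 1)
    # no estimate was similar enough; the tool would fall back to running TRF
    return None
```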
+ +``` +usage: trf [-h] -i INPUT [-o OUTPUT] [-e EXECUTABLE] [-T TRF_PARAMS] -r REPEATS -f REFERENCE [-s MOTIF_SIMILARITY] + [-m MIN_LENGTH] [-R] [--no-estimate] [-C CHUNK_SIZE] [-t THREADS] [--debug] + +Intersect vcf with reference tandem repeats and annotate +variants with the best fitting repeat motif and its copy number +relative to the reference + +options: + -h, --help show this help message and exit + -i INPUT, --input INPUT + VCF to annotate + -o OUTPUT, --output OUTPUT + Output filename (stdout) + -e EXECUTABLE, --executable EXECUTABLE + Path to tandem repeat finder (trf409.linux64) + -T TRF_PARAMS, --trf-params TRF_PARAMS + Default parameters to send to trf (3 7 7 80 5 40 500 -h -ngs) + -r REPEATS, --repeats REPEATS + Reference repeat annotations + -f REFERENCE, --reference REFERENCE + Reference fasta file + -s MOTIF_SIMILARITY, --motif-similarity MOTIF_SIMILARITY + Motif similarity threshold (0.9) + -m MIN_LENGTH, --min-length MIN_LENGTH + Minimum size of entry to annotate (50) + -R, --regions-only Only write variants within --repeats regions (False) + --no-estimate Skip INS estimation procedure and run everything through TRF. (False) + -C CHUNK_SIZE, --chunk-size CHUNK_SIZE + Size (in mbs) of reference chunks for parallelization (5) + -t THREADS, --threads THREADS + Number of threads to use (1) + --debug Verbose logging +``` + +# truvari anno grm + +For every SV, we create a kmer over the the upstream and downstream reference and alternate breakpoints. +We then remap that kmer to the reference genome and report alignment information. +This does not alter the VCF traditional annotations, but instead will create a pandas +DataFrame and save it to a joblib object. + +There are four queries made per-SV. For both reference (r), alternate (a) we create upstream (up) and downstream (dn) kmers. +So the columns are all prefixed with one of "rup_", "rdn_", "aup_", "adn_". + +In the alignment information per-query, there are three 'hit' counts: +- nhits : number of query hits +- dir_hits : direct strand hit count +- com_hits : compliment strand hit count + +The rest of the alignment information is reported by average (avg), maximum (max), and minimum (min) + +The suffixes are: +- q : mapping quality score of the hits +- ed : edit distance of the hits +- mat : number of matches +- mis : number of mismatches + +For example, "aup_avg_q", is the alternate's upstream breakend kmer's average mapping quality score. 
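Because `grm` writes a joblib-serialized pandas DataFrame instead of an annotated VCF, results are meant to be explored downstream. A minimal sketch of that step, assuming the default output name `results.jl` and an arbitrary mapping-quality cutoff of 20:

```python
import joblib

# load the DataFrame produced by `truvari anno grm`
df = joblib.load("results.jl")

# columns follow the prefix/suffix scheme described above, e.g. the
# reference-upstream hit count and the alt-downstream average mapping quality
print(df[["rup_nhits", "adn_avg_q"]].describe())

# flag SVs whose alternate breakend kmers remap with low mapping quality
poorly_mapped = df[(df["aup_avg_q"] < 20) & (df["adn_avg_q"] < 20)]
print(len(poorly_mapped), "SVs with low alt-breakend mapping quality")
```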
+ +``` +usage: grm [-h] -i INPUT -r REFERENCE [-R REGIONS] [-o OUTPUT] [-k KMERSIZE] [-m MIN_SIZE] [-t THREADS] [--debug] + +Maps graph edge kmers with BWA to assess Graph Reference Mappability + +options: + -h, --help show this help message and exit + -i INPUT, --input INPUT + Input VCF + -r REFERENCE, --reference REFERENCE + BWA indexed reference + -R REGIONS, --regions REGIONS + Bed file of regions to parse (None) + -o OUTPUT, --output OUTPUT + Output dataframe (results.jl) + -k KMERSIZE, --kmersize KMERSIZE + Size of kmer to map (50) + -m MIN_SIZE, --min-size MIN_SIZE + Minimum size of variants to map (25) + -t THREADS, --threads THREADS + Number of threads (1) + --debug Verbose logging +``` + +# truvari anno repmask + +``` +usage: repmask [-h] -i INPUT [-o OUTPUT] [-e EXECUTABLE] [-m MIN_LENGTH] [-M MAX_LENGTH] [-t THRESHOLD] [-p PARAMS] [-T THREADS] + [--debug] + + Wrapper around RepeatMasker to annotate insertion sequences in a VCF + +options: + -h, --help show this help message and exit + -i INPUT, --input INPUT + VCF to annotate (None) + -o OUTPUT, --output OUTPUT + Output filename (/dev/stdout) + -e EXECUTABLE, --executable EXECUTABLE + Path to RepeatMasker (RepeatMasker) + -m MIN_LENGTH, --min-length MIN_LENGTH + Minimum size of entry to annotate (50) + -M MAX_LENGTH, --max-length MAX_LENGTH + Maximum size of entry to annotate (50000) + -t THRESHOLD, --threshold THRESHOLD + Threshold for pct of allele covered (0.8) + -p PARAMS, --params PARAMS + Default parameter string to send to RepeatMasker (-pa {threads} -qq -e hmmer -species human -lcambig + -nocut -div 50 -no_id -s {fasta}) + -T THREADS, --threads THREADS + Number of threads to use (1) + --debug Verbose logging +``` + +# truvari anno remap + +Taking the Allele’s sequence, remap it to the reference and annotate based on the closest alignment. + +![](https://github.com/spiralgenetics/truvari/blob/develop/imgs/remap_example.png) + +``` +usage: remap [-h] -r REFERENCE [-o OUTPUT] [-m MINLENGTH] [-t THRESHOLD] [-d DIST] [-H HITS] [--debug] [input] + +Remap VCF'S alleles sequence to the reference to annotate REMAP + +- novel : Allele has no hits in reference +- tandem : Allele's closest hit is within len(allele) bp of the SV's position +- interspersed : Allele's closest hit is not tandem +- partial : Allele only has partial hit(s) less than --threshold + +Which alleles and alignments to consider can be altered with: +- --minlength : minimum SV length to considred (50) +- --dist : For deletion SVs, do not consider alignments that hit within Nbp of the SV's position +(a.k.a. 
alignments back to the source sequence) (10) +- --threshold : Minimum percent of allele's sequence used by alignment to be considered (.8) + +positional arguments: + input Input VCF (/dev/stdin) + +options: + -h, --help show this help message and exit + -r REFERENCE, --reference REFERENCE + BWA indexed reference + -o OUTPUT, --output OUTPUT + Output VCF (/dev/stdout) + -m MINLENGTH, --minlength MINLENGTH + Smallest length of allele to remap (50) + -t THRESHOLD, --threshold THRESHOLD + Threshold for pct of allele covered to consider hit (0.8) + -d DIST, --dist DIST Minimum distance an alignment must be from a DEL's position to be considered (10)) + -H HITS, --hits HITS Report top hits as chr:start-end.pct (max 0) + --debug Verbose logging +``` +# truvari anno hompct + +``` +usage: hompct [-h] -i INPUT [-o OUTPUT] [-b BUFFER] [-m MINANNO] [-M MAXGT] [-c MINCOUNT] [--debug] + +Calcluate the the Hom / (Het + Hom) of variants in the region of SVs +Requires the VCF to contain SVs beside SNPs/Indels + +options: + -h, --help show this help message and exit + -i INPUT, --input INPUT + Compressed, indexed VCF to annotate + -o OUTPUT, --output OUTPUT + Output filename (stdout) + -b BUFFER, --buffer BUFFER + Number of base-pairs up/dn-stream to query (5000) + -m MINANNO, --minanno MINANNO + Minimum size of event to annotate (50) + -M MAXGT, --maxgt MAXGT + Largest event size to count for genotyping (1) + -c MINCOUNT, --mincount MINCOUNT + Minimum number of genotyping events to report HOMPCT (0) + --debug Verbose logging +``` + +# truvari anno numneigh + +``` +usage: numneigh [-h] [-o OUTPUT] [-r REFDIST] [-s SIZEMIN] [--passonly] [--debug] [input] + +For every call within size boundaries, +Add NumNeighbors info field of how many calls are within the distance +Add NeighId clustering field in the same chained neighborhood +For example, +:: + -- is a call, refdist is 2 + - - - - - - + nn: 1 2 1 0 1 1 + id: 0 0 0 1 2 2 + +positional arguments: + input VCF to annotate + +options: + -h, --help show this help message and exit + -o OUTPUT, --output OUTPUT + Output vcf (stdout) + -r REFDIST, --refdist REFDIST + Max reference location distance (1000) + -s SIZEMIN, --sizemin SIZEMIN + Minimum variant size to consider for annotation (50) + --passonly Only count calls with FILTER == PASS + --debug Verbose logging +``` + +# truvari anno svinfo + +Uses `truvari.entry_size` and `truvari.entry_variant_type` on entries >= `args.minsize` to add 'SVLEN' and ‘SVTYPE’ annotations to a VCF’s INFO. + +How SVLEN is determined: +- Starts by trying to use INFO/SVLEN +- If SVLEN is unavailable and ALT field is an SV (e.g. \, \, etc), use abs(vcf.start - vcf.end). The INFO/END tag needs to be available, especially for INS. +- Otherwise, return the size difference of the sequence resolved call using abs(len(vcf.REF) - len(str(vcf.ALT[0]))) + +How SVTYPE is determined: +- Starts by trying to use INFO/SVTYPE +- If SVTYPE is unavailable, infer if entry is a insertion or deletion by looking at the REF/ALT sequence size differences +- If REF/ALT sequences are not available, try to parse the \, \, etc from the ALT column. 
+- Otherwise, assume 'UNK' + +``` +usage: svinfo [-h] [-o OUTPUT] [-m MINSIZE] [input] + +Adds SVTYPE and SVLEN INFO fields + +positional arguments: + input VCF to annotate (stdin) + +options: + -h, --help show this help message and exit + -o OUTPUT, --output OUTPUT + Output filename (stdout) + -m MINSIZE, --minsize MINSIZE + Minimum size of entry to annotate (50) +``` + +# truvari anno bpovl + +After turning a tab-delimited annotation file into an IntervalTree, intersect the start/end and overlap of SVs. +The output is a light-weight pandas DataFrame saved with joblib. The columns in the output are: + +- vcf_key : Variant key from `truvari.entry_to_key` +- intersection : Type of intersection between the SV and the annotation + - start_bnd - SV's start breakpoints hits the annotation + - end_bnd - SV's end breakpoint hits the annotation + - overlaps - Annotation's start/end boundaries are completely within the SV + - contains - Annotation's start/end boundaries span the entire SV +- anno_key : Annotation file's line index + +The idea with this tool is to annotate variants against tab-delimited files, especially when there's a 1-to-N variant to annotations. This tool is useful when used in conjunction with `truvari vcf2df` and pandas DataFrames. + +For example, if we have a VCF of SVs and a GTF of genes/gene features from Ensmbl. Any SV may intersect multiple features, which doesn't lend itself well to directly annotating the VCF's INFO. After using `bpovl`, we'll use Truvari to convert the SVs to a DataFrame. + +```bash +truvari anno bpovl -i variants.vcf.gz -a genes.gtf.gz -o anno.jl -p gff +truvari vcf2df variants.vcf.gz variants.jl +``` + +We can then treat the files similar to a database and do queries and joins to report which variants intersect which annotations. + +```python +import joblib +from gtfparse import read_gtf +variants = joblib.load("variants.jl") +genes = read_gtf("genes.gtf.gz") +annos = joblib.load("anno.jl") +to_check = annos.iloc[0] + +print(to_check) +# vcf_key chr20:958486-958487.A +# intersection start_bnd +# anno_key 11 + +print(variants.loc[to_check['vcf_key']]) +# id None +# svtype INS +# ... etc + +print(annos.loc[to_check['anno_key']]) +# seqname chr20 +# source ensembl_havana +# feature exon +# start 958452 +# ... etc +``` + +Similar to tabix, `bpovl` has presets for known file types like bed and gff. But any tab-delimited file with sequence/chromosome, start position, and end position can be parsed. Just set the "Annotation File Arguments" to the 0-based column indexes. For example, a bed file +has arguments `-s 0 -b 1 -e 2 -c #`. + +``` +usage: bpovl [-h] -a ANNO -o OUTPUT [--sizemin SIZEMIN] [--spanmax SPANMAX] [-p {bed,gff}] [-c COMMENT] [-s SEQUENCE] [-b BEGIN] + [-e END] [-1] + [input] + +Creates intersection of features in an annotation file with SVs' breakpoints and overlap + +positional arguments: + input VCF to annotate (stdin) + +options: + -h, --help show this help message and exit + -a ANNO, --anno ANNO Tab-delimited annotation file + -o OUTPUT, --output OUTPUT + Output joblib DataFrame + --sizemin SIZEMIN Minimum size of variant to annotate (50) + --spanmax SPANMAX Maximum span of SVs to annotate (50000) + +Annotation File Arguments: + -p {bed,gff}, --preset {bed,gff} + Annotation format. This option overwrites -s, -b, -e, -c and -1 (None) + -c COMMENT, --comment COMMENT + Skip lines started with character. (#) + -s SEQUENCE, --sequence SEQUENCE + Column of sequence/chromosome name. 
(0) + -b BEGIN, --begin BEGIN + Column of start chromosomal position. (1) + -e END, --end END Column of end chromosomal position. (2) + -1, --one-based The position in the anno file is 1-based rather than 0-based. (False) +``` +# truvari anno density +Partitions a `--genome` into `--windowsize` regions and count how many variants overlap. Annotate +regions with no variants as 'sparse' and with greater than or equal to (mean + `--threshold` * standard +deviation) number of variants as 'dense'. Outputs a joblib DataFrame with columns +`chrom, start, end, count, anno`. + +``` +usage: density [-h] -g GENOME -o OUTPUT [-m MASK] [-w WINDOWSIZE] [-s STEPSIZE] [-t THRESHOLD] [input] + +Identify 'dense' and 'sparse' variant windows of the genome + +positional arguments: + input Input VCF (/dev/stdin) + +optional arguments: + -h, --help show this help message and exit + -g GENOME, --genome GENOME + Genome bed file + -o OUTPUT, --output OUTPUT + Output joblib DataFrame + -m MASK, --mask MASK Mask bed file + -w WINDOWSIZE, --windowsize WINDOWSIZE + Window size (10000) + -s STEPSIZE, --stepsize STEPSIZE + Window step size (10000) + -t THRESHOLD, --threshold THRESHOLD + std for identifying 'dense' regions (3) +``` + +# truvari anno dpcnt + +For multi-sample VCFs, it is often useful to have summarized depth (DP) information across samples per-variant. This adds a `INFO/DPCNT` with counts of how many samples have `FORMAT/DP` for each of the user-defined bins. Bins are incremented using `bisect` e.g. `pos = bisect.bisect(bins, dp); bins[pos] += 1; + +``` +usage: dpcnt [-h] [-b BINS] [--no-ad] [-p] [-o OUTPUT] [input] + +Quick utility to count how many samples have >= Nx coverage per-variant + +positional arguments: + input VCF to annotate (stdin) + +options: + -h, --help show this help message and exit + -b BINS, --bins BINS Coverage bins to bisect left the counts (0,5,10,15) + --no-ad Skip adding ADCNT bins + -p, --present Only count sites with present (non ./.) genotypes + -o OUTPUT, --output OUTPUT + Output filename (stdout) +``` + +# truvari anno lcr + +``` +usage: lcr [-h] [-o OUTPUT] [input] + +Annotate low complexity region entropy score for variants +Credit: https://jszym.com/blog/dna_protein_complexity/ + +positional arguments: + input VCF to annotate (stdin) + +options: + -h, --help show this help message and exit + -o OUTPUT, --output OUTPUT + Output filename (stdout) +``` + +# truvari anno grpaf + +Add INFO tags of allele frequency annotations for groups of samples. For every group in `--labels` tab-delimited file, calculate the AF,MAF,ExcHet,HWE,MAC,AC for the samples in the group. Adds INFO tags with suffix of the group identifier (e.g. `AF_EAS`). `--strict` will hard fail if there are samples in the `--labels` not present in the vcf header. 
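+
+A minimal, hypothetical run might look like the following (the sample names and group labels are made up):
+```bash
+# labels.txt is tab-delimited: <sample><TAB><group>
+printf 'HG00096\tEUR\nHG00403\tEAS\nNA19017\tAFR\n' > labels.txt
+
+truvari anno grpaf -l labels.txt -t AF,AC input.vcf.gz | bgzip > anno.grpaf.vcf.gz
+# the output now carries per-group INFO tags such as AF_EUR, AC_EAS, etc.
+```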
+ +``` +usage: grpaf [-h] [-o OUTPUT] -l LABELS [-t TAGS] [--strict] [--debug] [input] + +Add allele frequency annotations for subsets of samples + +positional arguments: + input VCF to annotate + +options: + -h, --help show this help message and exit + -o OUTPUT, --output OUTPUT + Output filename (stdout) + -l LABELS, --labels LABELS + Tab-delimited file of sample and group + -t TAGS, --tags TAGS Comma-separated list of tags to add from AF,MAF,ExcHet,HWE,MAC,AC,AN (all) + --strict Exit if sample listed in labels is not present in VCF (False) + --debug Verbose logging +``` \ No newline at end of file diff --git a/docs/v4.3.1/bench.md b/docs/v4.3.1/bench.md new file mode 100644 index 00000000..145bf974 --- /dev/null +++ b/docs/v4.3.1/bench.md @@ -0,0 +1,295 @@ + +Quick start +=========== +Run this command where base is your 'truth set' SVs and comp is the comparison set of SVs. +```bash +truvari bench -b base_calls.vcf -c comp_calls.vcf -o output_dir/ +``` + +Matching Parameters +=================== +Picking matching parameters can be more of an art than a science. It really depends on the precision of your callers and the tolerance you wish to allow them such that it is a fair comparison. + +For example, depth of coverage callers (such as CNVnator) will have very 'fuzzy' boundaries, and don't report the exact deleted sequence but only varying regions. So thresholds of `pctseq=0`, `pctsize=.5`, `pctovl=.5`, `refdist=1000` may seem fair. + +[BioGraph](https://github.com/spiralgenetics/biograph) and many long-read callers report precise breakpoints and full alternate allele sequences. When benchmarking those results, we want to ensure our accuracy by using the stricter default thresholds. + +If you're still having trouble picking thresholds, it may be beneficial to do a few runs of Truvari bench over different values. Start with the strict defaults and gradually increase the leniency. From there, you can look at the performance metrics and manually inspect differences between the runs to find out what level you find acceptable. Truvari is meant to be flexible for comparison. More importantly, Truvari helps one clearly report the thresholds used for reproducibility. + +Here is a rundown of each matching parameter. +| Parameter | Default | Definition | +|------------|---------|------------| +| refdist | 500 | Maximum distance comparison calls must be within from base call's start/end | +| pctseq | 0.7 | Edit distance ratio between the REF/ALT haplotype sequences of base and
comparison call. See "Comparing Sequences of Variants" below. | +| pctsize | 0.7 | Ratio of min(base_size, comp_size)/max(base_size, comp_size) | +| pctovl | 0.0 | Ratio of two calls' (overlapping bases)/(longest span) | +| typeignore | False | Types don't need to match to compare calls. | + +Below are matching parameter diagrams to illustrate (approximately) how they work. + +``` + █ = Deletion ^ = Insertion + +--refdist REFDIST (500) + Max reference location distance + + ACTGATCATGAT + |--████--| + █████ + + Calls are within reference distance of 2 + +--pctsize PCTSIZE (0.7) + Min pct allele size similarity + + ACTGATCATGA sizes + █████ -> 5bp + ████ -> 4bp + + variants have 0.8 size similarity + + +--pctovl PCTOVL (0.0) + Min pct reciprocal overlap + + ACTGATCATGA ranges + █████ [2,7) + ████ [4,8) + + variants have 0.6 reciprocial overlap + + +--pctseq PCTSEQ (0.7) + Min percent allele sequence similarity + + A-CTG-ACTG + ^ ^ haplotypes + | └ACTG -> CTGACTGA + └CTGA -> CTGACTGA + + haplotypes have 100% sequence similarity +``` + +Outputs +======= +Truvari bench writes the following files to the `--output` directory. +| File | Description | +|----------------------|---------------------------------------------| +| tp-base.vcf.gz | True positive calls form the base VCF | +| tp-comp.vcf.gz | True positive calls from the comparison VCF | +| fp.vcf.gz | False positive calls from comparison | +| fn.vcf.gz | False negative calls from base | +| summary.json | Json output of performance stats | +| params.json | Json output of parameters used | +| candidate.refine.bed | Bed file of regions for `refine` | +| log.txt | Run's log | + +summary.json +------------ +Stats generated by benchmarking are written to `summary.json`. + +| Metric | Definition | +|----------------|------------------------------------------------------------| +| TP-base | Number of matching calls from the base vcf | +| TP-comp | Number of matching calls from the comp vcf | +| FP | Number of non-matching calls from the comp vcf | +| FN | Number of non-matching calls from the base vcf | +| precision | TP-comp / (TP-comp + FP) | +| recall | TP-base / (TP-base + FN) | +| f1 | 2 * ((recall * precision) / (recall + precision)) | +| base cnt | Number of calls in the base vcf | +| comp cnt | Number of calls in the comp vcf | +| TP-comp_TP-gt | TP-comp with genotype match | +| TP-comp_FP-gt | TP-comp without genotype match | +| TP-base_TP-gt | TP-base with genotype match | +| TP-base_FP-gt | TP-base without genotype match | +| gt_concordance | TP-comp_TP-gt / (TP-comp_TP-gt + TP-comp_FP-gt) | +| gt_matrix | Base GT x Comp GT Matrix of all Base calls' best, TP match | +| weighted | Metrics weighed by variant sequence/size similarity | + +The `gt_matrix` is a table. For example: +```json +"gt_matrix": { + "(0, 1)": { + "(0, 1)": 500, + "(1, 1)": 10 + }, + "(1, 1)": { + "(1, 1)": 800, + "(0, 1)": 20 + } +} +``` +Represents -> +``` +comp (0,1) (1,1) +base +(0,1) 500 10 +(1,1) 20 800 +``` + +Added annotations +----------------- +The output vcfs are annotated with INFO fields and then sorted, compressed, and indexed inside of the output directory. + +| Anno | Definition | +|-------------------|-----------------------------------------------------------------------------------------------------------------| +| TruScore | Truvari score for similarity of match. 
`((pctseq + pctsize + pctovl) / 3 * 100)` | +| PctSeqSimilarity | Pct sequence similarity between this variant and its closest match | +| PctSizeSimilarity | Pct size similarity between this variant and its closest match | +| PctRecOverlap | Percent reciprocal overlap percent of the two calls | +| StartDistance | Distance of the base call's start from comparison call's start | +| EndDistance | Distance of the base call's end from comparison call's end | +| SizeDiff | Difference in size of base and comp calls | +| GTMatch | Base/comp calls' Genotypes match | +| MatchId | Id to help tie base/comp calls together {chunkid}.{baseid}.{compid} See [[MatchIds wiki\|MatchIds]] for details. | + + +Refining bench output +===================== +As described in the [[refine wiki|refine]], a limitation of Truvari bench is 1-to-1 variant comparison. However, `truvari refine` can harmonize the variants to give them more consistent representations. A bed file named `candidate.refine.bed` is created by `truvari bench` and holds a set of regions which may benefit from refinement. To use it, simply run +```bash +truvari bench -b base.vcf.gz -c comp.vcf.gz -o result/ +truvari refine --regions result/candidate.refine.bed \ + --reference reference.fasta \ + --recount --use-region-coords \ + result/ +``` +See [[refine wiki|refine]] for details. + +Comparing Sequences of Variants +=============================== + +Truvari has implemented two approaches to compare variant sequences. The default comparison is called 'unroll'. Optionally, a `--reference` can be provided and Truvari will use the reference context of a pair of variants for comparison. + +## Unroll +The method of giving a pair of calls the same reference context can be achieved using an 'unroll' method. For a formal description, see [this gist](https://gist.github.com/ACEnglish/1e7421c46ee10c71bee4c03982e5df6c). + +The main idea is that in order to move variants upstream/downstream, the reference sequence flanking the variant will need to be moved downstream/upstream respectively. Or, to say this another way, we can think of the alternate sequences as being circular instead of linear. This means that in order to move the variant e.g. 1bp downstream for an INS, we could remove the first base from the ALT and append it to the end. So in the 'ab' example used to describe "Reference context" below, we only need to unroll the insertion at a position by the distance between it and another variant e.g. the INS `ab` at POS 2 becomes identical to the INS `ba` at POS 1 by rolling `2-1 = 1` bases from the start to the end. + +This unroll method has a number of benefits and a couple of considerations, including: +* not needing a `--reference` for comparison, which saves I/O time +* increasing the number of correctly matching SVs +* decreasing the number of 'suspect' matches in smaller size regimes +* providing a simpler pattern between PctSizeSimilarity and PctSeqSimilarity + +## Reference context +For the reference context method, consider a hypothetical tandem repeat expansion of the reference sequence 'AB'. Here, we'll represent the 'insertion' sequence as lower-case 'ab', though it should be considered equivalent to 'AB'. Three equally valid descriptions of this +variant would be: + +```text +#POS INS Haplotype + 0 ab abAB + 1 ba AbaB + 2 ab ABab +``` + +Therefore, to compare the sequence similarity, Truvari builds the haplotypes over the range of a pair of calls' +`min(starts):max(ends)` before making the the sequence change introduced by the variants. 
In python, this line
+looks like:
+
+``` python
+hap1_seq = ref.get_seq(a1_chrom, start + 1, a1_start).seq + a1_seq + ref.get_seq(a1_chrom, a1_end + 1, end).seq
+```
+
+Where `a1_seq` is the longer of the REF or ALT allele.
+
+## SVs without sequences
+
+If the base or comp vcfs do not have sequence resolved calls (e.g. `<DEL>`), simply set `--pctseq=0` to turn off
+sequence comparison. The `--reference` does not need to be provided when not using sequence comparison. If
+`--pctseq != 0` and an unresolved SV is encountered, a warning will be raised and the variant will not be compared.
+
+Controlling the number of matches
+=================================
+
+How many matches a variant is allowed to participate in is controlled by the `--pick` parameter. The available pickers are `single`, `ac`, and `multi`.
+
+* `single` (the default option) allows each variant to participate in up to one match.
+* `ac` uses the genotype allele count to control how many matches a variant can have. This means a homozygous alternate variant can participate in two matches (its GT is 1/1 so AC=2). A heterozygous variant can only participate in one match (GT 0/1, AC=1). And a homozygous reference variant cannot be matched. Note that missing genotypes are considered reference alleles and do not add to the AC (e.g. GT ./1, AC=1).
+* `multi` allows variants to participate in all available matches.
+
+As an example, imagine we have three variants in a pVCF with two samples we want to compare.
+
+```
+CHROM POS ID REF ALT base comp
+chr20 17785968 ID1 A ACGCGCGCGCG 1/1 1/0
+chr20 17785968 ID2 A ACGCGCGCGCGCG 0/0 0/1
+chr20 17785969 ID3 C CGCGCGCGCGCGC 0/0 1/1
+```
+
+To compare samples inside the same vcf, we would use the command:
+```bash
+truvari bench -b input.vcf.gz -c input.vcf.gz -o output/ --bSample base --cSample comp --no-ref a
+```
+
+This VCF produces different results depending on the `--pick` parameter:
+
+| Parameter | ID1 State | ID2 State | ID3 State |
+|-----------|-----------|-----------|-----------|
+| single | TP | FP | FP |
+| ac | TP | TP | FP |
+| multi | TP | TP | TP |
+
+--dup-to-ins
+============
+
+Most SV benchmarks only report DEL and INS SVTYPEs. The flag `--dup-to-ins` will interpret SVs with SVTYPE == DUP as SVTYPE == INS. Note that DUPs generally aren't sequence resolved (i.e. the ALT isn't a sequence) like INS. Therefore, `--dup-to-ins` typically should be used without sequence comparison via `--pctseq 0`.
+
+Size filtering
+==============
+
+`--sizemax` is the maximum size of a base or comparison call to be considered.
+
+`--sizemin` is the minimum size of a base call to be considered.
+
+`--sizefilt` is the minimum size of a comparison call that will be matched to base calls. It can
+be less than `sizemin` for edge case variants.
+
+For example: Imagine `sizemin` is set at 50 and `sizefilt` at 30, and a 50bp base call is 98% similar to a 49bp comparison
+call at the same position.
+
+These two calls could be considered matching. However, if we removed comparison calls less than `sizemin`,
+we'd incorrectly classify the 50bp base call as a false negative. Instead, we allow comparison calls between `[sizefilt,sizemin)` to find matches.
+
+This has the side effect of artificially inflating specificity. For example, if that same 49bp call described
+above were below the similarity threshold, it would not be classified as a FP since it is below the `sizemin`
+threshold. So we're giving the call a better chance to be useful and less chance to be detrimental
+to final statistics.
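+
+A small sketch of this size-gating logic (illustrative, not Truvari's actual code) may make the asymmetry clearer:
+```python
+def base_considered(size, sizemin=50, sizemax=50000):
+    # base calls must clear the full sizemin threshold
+    return sizemin <= size <= sizemax
+
+def comp_considered(size, sizefilt=30, sizemax=50000):
+    # comparison calls only need to clear the smaller sizefilt threshold,
+    # so a near-threshold base call can still find its match
+    return sizefilt <= size <= sizemax
+
+# A 49bp comparison call can still match a 50bp base call...
+print(base_considered(50), comp_considered(49))  # True True
+# ...but an unmatched 49bp comparison call is not counted as a FP,
+# because it is below sizemin.
+print(base_considered(49))                       # False
+```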
+ +Include Bed & VCF Header Contigs +================================ + +If an `--includebed` is provided, only base and comp calls contained within the defined regions are used +for comparison. This is similar to pre-filtering your base/comp calls using: + +```bash +(zgrep "#" my_calls.vcf.gz && bedtools intersect -u -a my_calls.vcf.gz -b include.bed) | bgzip > filtered.vcf.gz +``` + +with the exception that Truvari requires the start and the end to be contained in the same includebed region +whereas `bedtools intersect` does not. + +If an `--includebed` is not provided, the comparison is restricted to only the contigs present in the base VCF +header. Therefore, any comparison calls on contigs not in the base calls will not be counted toward summary +statistics and will not be present in any output vcfs. + +Extending an Include Bed +------------------------ +The option `--extend` extends the regions of interest (set in `--includebed` argument) by the given number of bases on each side, allowing base variants to match comparison variants that are just outside of the original region. If a comparison variant is in the extended regions it can potentially match a base variant that is in the original regions turning it to TP. Comparison variants in the extended regions that don't have a match are not counted as FP. This strategy is similar to the one implemented for size matching where only the base variants longer than sizemin (equal to 50 by default) are considered, but they are allowed to match shorter comparison variants sizefilt (30bp by default) or longer. + +See this [discussion](https://github.com/ACEnglish/truvari/discussions/99)for details. + +Methodology +=========== +Here is a high-level pseudocode description of the steps Truvari bench conducts to compare the two VCFs. +``` +* zip the Base and Comp calls together in sorted order +* create chunks of all calls overlapping within ±`--chunksize` basepairs +* make a |BaseCall| x |CompCall| match matrix for each chunk +* build a Match for each call pair in the chunk - annotate as TP if >= all thresholds +* if the chunk has no Base or Comp calls +** return them all as FNs/FPs +* use `--pick` method to sort and annotate variants with their best match +``` +![](https://github.com/acenglish/truvari/blob/develop/imgs/TruvariBenchMethod.png) \ No newline at end of file diff --git a/docs/v4.3.1/collapse.md b/docs/v4.3.1/collapse.md new file mode 100644 index 00000000..7ac22adb --- /dev/null +++ b/docs/v4.3.1/collapse.md @@ -0,0 +1,163 @@ +`collapse` is Truvari's approach to SV merging. After leveraging `bcftools` to merge VCFs, `truvari collapse` can then iterate over the calls and create clusters of SVs that match over the [provided thresholds](https://github.com/spiralgenetics/truvari/wiki/bench#matching-parameters). This is also useful when removing potentially redundant calls within a single sample. + +Example +======= +To start, we merge multiple VCFs (each with their own sample) and ensure there are no multi-allelic entries via: +```bash +bcftools merge -m none one.vcf.gz two.vcf.gz | bgzip > merge.vcf.gz +``` + +This will `paste` SAMPLE information between vcfs when calls have the exact same chrom, pos, ref, and alt. +For example, consider two vcfs: + + >> one.vcf: + chr1 1 ... GT 0/1 + chr1 5 ... GT 1/1 + >> two.vcf: + chr1 1 ... GT 1/1 + chr1 7 ... GT 0/1 + +`bcftools merge` creates: + + >> merge.vcf: + chr1 1 ... GT 0/1 1/1 + chr1 5 ... GT 1/1 ./. + chr1 7 ... GT ./. 
0/1 + +This VCF can then be collapsed to allow 'fuzzier' matching than the exact merge just performed. + +```bash +truvari collapse -i merge.vcf.gz -o truvari_merge.vcf -c truvari_collapsed.vcf +``` + +For example, if we collapsed our example merge.vcf by matching any calls within 3bp, we'd create: + + >> truvari_merge.vcf + chr1 1 ... GT 0/1 1/1 + chr1 5 ... GT 1/1 0/1 + >> truvari_collapsed.vcf + chr1 7 ... GT ./. 0/1 + +--choose behavior +================= +When collapsing, the default `--choose` behavior is to take the `first` variant by position from a cluster to +be written to the output while the others will be placed in the collapsed output. +Other choosing options are `maxqual` (the call with the highest quality score) or `common` (the call with the highest minor allele count). + +Samples with no genotype information in the kept variant will be filled by the first +collapsed variant containing genotype information. + +--gt +==== +For some results, one may not want to collapse variants with conflicting genotypes from a single sample. With the `--gt all` parameter, variants which are present (non `0/0` or `./.`) in the same sample are not collapsed. With the `-gt het` parameter, only variants which are both heterozygous in a sample (e.g. `0/1` and `0/1`) are prevented from collapsing. The `--gt het` is useful for some SV callers which will redundantly call variants and typically genotype them all as `1/1`. + +--intra +======= +When a single sample is run through multiple SV callers, one may wish to consolidate those results. After the `bcftools merge` of the VCFs, there will be one SAMPLE column per-input. With `--intra`, collapse will consolidate the sample information so that only a single sample column is present in the output. Since the multiple callers may have different genotypes or other FORMAT fields with conflicting information, `--intra` takes the first column from the VCF, then second, etc. For example, if we have an entry with: +``` +FORMAT RESULT1 RESULT2 +GT:GQ:AD ./.:.:3,0 1/1:20:0,30 +``` +The `--intra` output would be: +``` +FORMAT RESULT1 +GT:GQ:AD 1/1:20:3,0 +``` +As you can see in this example, 1) The first sample name is the only one preserved. 2) conflicting FORMAT fields can be consolidated in a non-useful way (here the AD of `3,0` isn't informative to a `1/1` genotype). We're working to provide an API to help users write custom intra-sample consolidation scripts. + +--hap mode +========== +When using `--hap`, we assume phased variants from a single individual. Only the +single best matching call from the other haplotype will be collapsed, +and the consolidated genotype will become 1/1 + +For example, if we collapse anything at the same position: + + chr1 1 .. GT 0|1 + chr1 1 .. GT 1|0 + chr1 2 .. GT 1|0 + +will become: + + chr1 1 .. GT 1/1 + chr1 2 .. GT 1|0 + +--chain mode +============ +Normally, every variant in a set of variants that are collapsed together matches every other variant in the set. However, when using `--chain` mode, we allow 'transitive matching'. This means that all variants match to only at least one other variant in the set. In situations where a 'middle' variant has two matches that don't match each other, without `--chain` the locus will produce two variants whereas using `--chain` will produce one. +For example, if we have + + chr1 5 .. + chr1 7 .. + chr1 9 .. + +When we collapse anything within 2bp of each other, without `--chain`, we output: + + chr1 5 .. + chr1 9 .. 
+ +With `--chain`, we would collapse `chr1 9` as well, producing + + chr1 5 .. + +Annotations +=========== +`collapse` produces two files. The output file has kept variants along with unanalyzed (< sizemin) variants. The collapsed file contains the variants that were collapsed into the kept variants. + +The output file has only two annotations added to the `INFO`. +- `CollapseId` - Identifier of the variant when comparing to the collapse outputs. +- `NumCollapsed` - Number of variants collapsed into this variant +- `NumConsolidated` - Number of samples' genotypes consolidated into this call's genotypes + +The collapsed file has all of the annotations added by [[bench|bench#definition-of-annotations-added-to-tp-vcfs]]. Note that `MatchId` is tied to the output file's `CollapseId`. See [MatchIds](https://github.com/spiralgenetics/truvari/wiki/MatchIds) for details. + +``` +usage: collapse [-h] -i INPUT [-o OUTPUT] [-c COLLAPSED_OUTPUT] [-f REFERENCE] [-k {first,maxqual,common}] [--debug] + [-r REFDIST] [-p PCTSIM] [-B MINHAPLEN] [-P PCTSIZE] [-O PCTOVL] [-t] [--use-lev] [--hap] [--chain] + [--no-consolidate] [--null-consolidate NULL_CONSOLIDATE] [-s SIZEMIN] [-S SIZEMAX] [--passonly] + +Structural variant collapser + +Will collapse all variants within sizemin/max that match over thresholds + +options: + -h, --help show this help message and exit + -i INPUT, --input INPUT + Comparison set of calls + -o OUTPUT, --output OUTPUT + Output vcf (stdout) + -c COLLAPSED_OUTPUT, --collapsed-output COLLAPSED_OUTPUT + Where collapsed variants are written (collapsed.vcf) + -f REFERENCE, --reference REFERENCE + Indexed fasta used to call variants + -k {first,maxqual,common}, --keep {first,maxqual,common} + When collapsing calls, which one to keep (first) + --debug Verbose logging + --hap Collapsing a single individual's haplotype resolved calls (False) + --chain Chain comparisons to extend possible collapsing (False) + --no-consolidate Skip consolidation of sample genotype fields (True) + --null-consolidate NULL_CONSOLIDATE + Comma separated list of FORMAT fields to consolidate into the kept entry by taking the first non-null + from all neighbors (None) + +Comparison Threshold Arguments: + -r REFDIST, --refdist REFDIST + Max reference location distance (500) + -p PCTSIM, --pctsim PCTSIM + Min percent allele sequence similarity. Set to 0 to ignore. (0.95) + -B MINHAPLEN, --minhaplen MINHAPLEN + Minimum haplotype sequence length to create (50) + -P PCTSIZE, --pctsize PCTSIZE + Min pct allele size similarity (minvarsize/maxvarsize) (0.95) + -O PCTOVL, --pctovl PCTOVL + Min pct reciprocal overlap (0.0) for DEL events + -t, --typeignore Variant types don't need to match to compare (False) + --use-lev Use the Levenshtein distance ratio instead of edlib editDistance ratio (False) + +Filtering Arguments: + -s SIZEMIN, --sizemin SIZEMIN + Minimum variant size to consider for comparison (50) + -S SIZEMAX, --sizemax SIZEMAX + Maximum variant size to consider for comparison (50000) + --passonly Only consider calls with FILTER == PASS +``` \ No newline at end of file diff --git a/docs/v4.3.1/consistency.md b/docs/v4.3.1/consistency.md new file mode 100644 index 00000000..17ba23b7 --- /dev/null +++ b/docs/v4.3.1/consistency.md @@ -0,0 +1,179 @@ + +In addition to looking at performance of a single set of variation against a baseline, one may wish to measure the consistency between multiple sets of variation. The tool `truvari consistency` can automatically create that result. 
+ +Running +======= + +``` +usage: consistency [-h] [-d] [-j] [-o OUTPUT] VCFs [VCFs ...] + +Over multiple vcfs, calculate their intersection/consistency. + +Calls will match between VCFs if they have a matching key of: + CHROM POS ID REF ALT + +positional arguments: + VCFs VCFs to intersect + +optional arguments: + -h, --help show this help message and exit + -d, --no-dups Disallow duplicate SVs + -j, --json Output report in json format + -o OUTPUT, --output OUTPUT + Write tsv of variant keys and their flag +``` + +Example +======= + +```bash +truvari consistency fileA.vcf fileB.vcf fileC.vcf +``` + +Matching Entries +================ + +VCF entries will be considered matching if and only if they have an exact same key of `CHROM POS ID REF ALT`. Because of this stringency, it is recommend that you compare the tp-base.vcf or fn.vcf results from each individual VCF's Truvari output. The key and flags can be written with the `--output` option. + +Duplicates +========== + +If there are VCFs with duplicate keys, they are handled by appending a e.g. `.1` to the key. If you'd like to ignore duplicates, just add `--no-dups` + +Output Report +============= + +Below is an example report: + +```text +# +# Total 5534 calls across 3 VCFs +# +#File NumCalls +fileA.vcf 4706 +fileB.vcf 4827 +fileC.vcf 4882 +# +# Summary of consistency +# +#VCFs Calls Pct +3 3973 71.79% +2 935 16.90% +1 626 11.31% +# +# Breakdown of VCFs' consistency +# +#Group Total TotalPct PctOfFileCalls +111 3973 71.79% 84.42% 82.31% 81.38% +011 351 6.34% 7.27% 7.19% +101 308 5.57% 6.54% 6.31% +110 276 4.99% 5.86% 5.72% +001 250 4.52% 5.12% +010 227 4.10% 4.70% +100 149 2.69% 3.17% +``` + +At the top we see that we compared 5,534 unique variants between the 3 files, with fileC.vcf having the most calls at 4,882. + +The "Summary of consistency" shows us that 3,973 (71.79%) of all the calls are shared between the 3 VCFs, while 626 (11.31%) are only found in one of the VCFs. + +Reading the "Breakdown of VCFs' consistency", a `Group` is a unique key for presence (1) or absence (0) of a call within each of the listed `#Files`. For example: `Group 111` is calls present in all VCFs; `Group 011` is calls present in only the 2nd and 3rd VCFs (i.e. fileB.vcf and fileC.vcf). + +We see that `Group 101` has calls belonging to the 1st and 3rd `#Files` (i.e. fileA.vcf and fileC.vcf). This group has a total of 308 calls that intersect, or 5.57% of all calls in all VCFs. This 308 represents 6.54% of calls in fileA.vcf and 6.31% of calls in fileC.vcf. + +Finally, we see that fileA.vcf has the least amount of calls unique to it on the `Group 100` line. + +Json +==== +Below is a consistency report in json format. 
+```json +{ + "vcfs": [ + "repo_utils/test_files/variants/input1.vcf.gz", + "repo_utils/test_files/variants/input2.vcf.gz", + "repo_utils/test_files/variants/input3.vcf.gz" + ], + "total_calls": 3513, + "num_vcfs": 3, + "vcf_counts": { + "repo_utils/test_files/variants/input1.vcf.gz": 2151, + "repo_utils/test_files/variants/input2.vcf.gz": 1783, + "repo_utils/test_files/variants/input3.vcf.gz": 2065 + }, + "shared": [ + { + "vcf_count": 3, + "num_calls": 701, + "call_pct": 0.1995445488186735 + }, + { + "vcf_count": 2, + "num_calls": 1084, + "call_pct": 0.3085681753487048 + }, + { + "vcf_count": 1, + "num_calls": 1728, + "call_pct": 0.4918872758326217 + } + ], + "detailed": [ + { + "group": "111", + "total": 701, + "total_pct": 0.1995445488186735, + "repo_utils/test_files/variants/input1.vcf.gz": 0.32589493258949326, + "repo_utils/test_files/variants/input2.vcf.gz": 0.393157599551318, + "repo_utils/test_files/variants/input3.vcf.gz": 0.3394673123486683 + }, + { + "group": "001", + "total": 645, + "total_pct": 0.18360375747224594, + "repo_utils/test_files/variants/input1.vcf.gz": 0, + "repo_utils/test_files/variants/input2.vcf.gz": 0, + "repo_utils/test_files/variants/input3.vcf.gz": 0.31234866828087166 + }, + { + "group": "100", + "total": 598, + "total_pct": 0.17022487902077996, + "repo_utils/test_files/variants/input1.vcf.gz": 0.2780102278010228, + "repo_utils/test_files/variants/input2.vcf.gz": 0, + "repo_utils/test_files/variants/input3.vcf.gz": 0 + }, + { + "group": "101", + "total": 487, + "total_pct": 0.1386279533162539, + "repo_utils/test_files/variants/input1.vcf.gz": 0.22640632264063226, + "repo_utils/test_files/variants/input2.vcf.gz": 0, + "repo_utils/test_files/variants/input3.vcf.gz": 0.2358353510895884 + }, + { + "group": "010", + "total": 485, + "total_pct": 0.13805863933959578, + "repo_utils/test_files/variants/input1.vcf.gz": 0, + "repo_utils/test_files/variants/input2.vcf.gz": 0.27201346045989905, + "repo_utils/test_files/variants/input3.vcf.gz": 0 + }, + { + "group": "110", + "total": 365, + "total_pct": 0.10389980074010817, + "repo_utils/test_files/variants/input1.vcf.gz": 0.1696885169688517, + "repo_utils/test_files/variants/input2.vcf.gz": 0.2047111609646663, + "repo_utils/test_files/variants/input3.vcf.gz": 0 + }, + { + "group": "011", + "total": 232, + "total_pct": 0.06604042129234272, + "repo_utils/test_files/variants/input1.vcf.gz": 0, + "repo_utils/test_files/variants/input2.vcf.gz": 0.13011777902411667, + "repo_utils/test_files/variants/input3.vcf.gz": 0.11234866828087167 + } + ] +} +``` \ No newline at end of file diff --git a/docs/v4.3.1/divide.md b/docs/v4.3.1/divide.md new file mode 100644 index 00000000..3bbad3cb --- /dev/null +++ b/docs/v4.3.1/divide.md @@ -0,0 +1,58 @@ +Divide a VCF into independent shards. + +Unfortunately, `pysam.VariantRecord` objects aren't pickle-able. This means that if we wanted to have Truvari leverage python's multiprocessing we'd need to make a custom VCF parser. However, the command `truvari divide` allows us to take an input VCF and divide it into multiple independent parts (or shards) which can the be processed over multiple processes. + +`truvari divide` works by parsing a VCF and splitting it into multiple, smaller sub-VCFs. If any variants are within `--buffer` base-pairs, they're output to the same sub-VCF. This allows variants in the same region which would need to be compared to one-another (see `--refdist`) to stay in the same sub-VCF. 
The `--min` parameter allows us to control the minimum number of variants per sub-VCF so that we don't make too many tiny VCFs. Once the sub-VCFs are created, we can process each independently through whatever truvari command.
+
+For example, let's say we want to run `truvari collapse` on a very large VCF with many variants and many samples. First, we divide the VCF:
+
+```bash
+truvari divide big_input.vcf.gz sub_vcfs_directory/
+```
+
+Inside of `sub_vcfs_directory/` we'll have multiple VCFs, which we can process with a simple bash script:
+
+```bash
+NJOBS=$(nproc) # use all processors by default
+mkdir -p output_vcfs/
+mkdir -p collap_vcfs/
+mkdir -p logs/
+
+for in_vcf in sub_vcfs_directory/*.vcf.gz
+do
+    # Setup file names
+    base_name=$(basename $in_vcf)
+    base_name=${base_name%.vcf.gz}
+    output_name=output_vcfs/${base_name}.vcf
+    collap_name=collap_vcfs/${base_name}.vcf
+    log_name=logs/${base_name}.log
+    # Run the command and send it to the background
+    truvari collapse -i $in_vcf -o $output_name -c $collap_name -f reference.fa &> ${log_name} &
+    # If too many jobs are running, wait for one to finish
+    while [ $( jobs | wc -l ) -ge ${NJOBS} ]
+    do
+        sleep 5
+    done
+done
+```
+
+Obviously the logs and `while` loop are tricks for running on a single machine. If you have access to a cluster, I'm sure you can imagine how to create/submit the commands.
+
+```
+usage: divide [-h] [-b BUFFER] [-m MIN] [--no-compress] [-T THREADS] VCF DIR
+
+Divide a VCF into independent parts
+
+positional arguments:
+  VCF                   VCF to split
+  DIR                   Output directory to save parts
+
+options:
+  -h, --help            show this help message and exit
+  -b BUFFER, --buffer BUFFER
+                        Buffer to make mini-clusters (1000)
+  -m MIN, --min MIN     Minimum number of entries per-vcf (100)
+  --no-compress         Don't attempt to compress/index sub-VCFs
+  -T THREADS, --threads THREADS
+                        Number of threads for compressing/indexing sub-VCFs (1)
+```
\ No newline at end of file
diff --git a/docs/v4.3.1/ga4gh.md b/docs/v4.3.1/ga4gh.md
new file mode 100644
index 00000000..864c24cd
--- /dev/null
+++ b/docs/v4.3.1/ga4gh.md
@@ -0,0 +1,3 @@
+The ga4gh command will consolidate tp-base/fn and tp-comp/fp VCF files into a single truth and query vcf, respectively. Additional tags are added that are compliant with the [ga4gh intermediates](https://github.com/ga4gh/benchmarking-tools/blob/master/doc/ref-impl/intermediate.md).
+
+When consolidating a result which has `truvari refine`, the `truvari ga4gh --with-refine` flag will extract original variants from unharmonized regions and altered variants from harmonized regions inside the `phab_bench/` sub-directory.
\ No newline at end of file
diff --git a/docs/v4.3.1/phab.md b/docs/v4.3.1/phab.md
new file mode 100644
index 00000000..bdcaec9e
--- /dev/null
+++ b/docs/v4.3.1/phab.md
@@ -0,0 +1,157 @@
+Introduction
+------------
+
+Truvari's comparison engine can match variants using a wide range of thresholds. However, some alleles can produce radically different variant representations. We could dramatically lower our thresholds to identify the match, but this would cause variants from unidentical alleles to be falsely matched.
+
+This problem is easiest to conceptualize in the case of 'split' variants: imagine a pipeline calls a single 100bp DEL that can also be represented as two 50bp DELs. To match these variants, we would need to loosen our thresholds to `--pick multi --pctsim 0.50 --pctsize 0.50`. Plus, these thresholds leave no margin for error.
If the variant caller erroneously deleted an extra base to make a 101bp DEL we would have to lower our thresholds even further. These thresholds are already too low because there's plenty of distinct alleles with >= 50% homology. + +So how do we deal with inconsistent representations? In an ideal world, we would simply get rid of them by harmonizing the variants. This is the aim of `truvari phab` + +`truvari phab` is designed to remove variant representation inconsistencies through harmonization. By reconstructing haplotypes from variants, running multiple-sequence alignment of the haplotypes along with the reference, and then recalling variants, we expect to remove discordance between variant representations and simplify the work required to perform variant comparison. + +Requirements +------------ +Since `truvari phab` uses mafft v7.505 via a command-line call, it expects it to be in the environment path. Download mafft and have its executable available in the `$PATH` [mafft](https://mafft.cbrc.jp/alignment/software/) + +Alternatively, you can use the Truvari [Docker container](Development#docker) which already has mafft ready for use. + +Also, you can use wave front aligner (pyWFA) or partial order alignment (pyabpoa). While wfa is the fastest approach, it will independently align haplotypes and therefore may produce less parsimonous aligments. And while poa is more accurate than wfa and faster than mafft, it is less accurate than mafft. + +Example +------- +As an example, we'll use Truvari's test files in `repo_utils/test_files/phab*` which were created from real data over a tandem repeat at GRCh38 chr1:26399065-26401053 and translated to a small test genome with coordinates chr1:1-1988. + +* `phab_base.vcf.gz` - an 86 sample squared-off pVCF +* `phab_comp.vcf.gz` - a single sample's VCF +* `phab_ref.fa` - a subset of the GRCh38 reference + +This dataset is interesting because the `HG002` sample in `phab_base.vcf.gz` uses the same sequencing experiment ([HPRC](https://github.com/human-pangenomics/HPP_Year1_Assemblies)) as the sample `syndip` in `phab_comp.vcf.gz`, but processed with a different pipeline. And as we will see, the pipeline can make all the difference. + +To start, let's use `truvari bench` to see how similar the variant calls are in this region. +```bash +truvari bench --base phab_base.vcf.gz \ + --comp phab_comp.vcf.gz \ + --sizemin 1 --sizefilt 1 \ + --bSample HG002 \ + --cSample syndip \ + --no-ref a \ + --output initial_bench +``` +This will compare all variants greater than 1bp ( `-S 1 -s 1` which includes SNPs) from the `HG002` sample to the `syndip` sample. We're also excluding any uncalled or reference homozygous sites with `--no-ref a`. The report in `initial_bench/summary.txt` shows: +```json +{ + "TP-base": 5, + "TP-comp": 5, + "FP": 2, + "FN": 22, + "precision": 0.7142857142857143, + "recall": 0.18518518518518517, + "f1": 0.2941176470588235, +} +``` + +These variants are pretty poorly matched, especially considering the `HG002` and `syndip` samples are using the same sequencing experiment. We can also inspect the `initial_bench/fn.vcf.gz` and see a lot of these discordant calls are concentrated in a 200bp window. Let's use `truvari phab` to harmonize the variants in this region. +```bash +truvari phab --base phab_base.vcf.gz \ + --comp phab_comp.vcf.gz \ + --bSample HG002 \ + --cSample syndip \ + --reference phab_ref.fa \ + --region chr1:700-900 \ + -o harmonized.vcf.gz +``` + +In our `harmonized.vcf.gz` we can see there are now only 9 variants. 
Let's run `truvari bench` again on the output to see how well the variants match after being harmonized. + +```bash +truvari bench -b harmonized.vcf.gz \ + -c harmonized.vcf.gz \ + -S 1 -s 1 \ + --no-ref a \ + --bSample HG002 \ + --cSample syndip \ + -o harmonized_bench/ +``` +Looking at `harmonized_bench/summary.txt` shows: +```json +{ + "TP-base": 8, + "TP-comp": 8, + "FP": 0, + "FN": 0, + "precision": 1.0, + "recall": 1.0, + "f1": 1.0, +} +``` +Now there is no difference between our two sets of variants in this region. + +For this variant call-set, `truvri phab` makes `truvari bench` overkill since the variants create identical haplotypes. In fact, we can benchmark simply by counting the genotypes. +```bash +$ bcftools query -f "[%GT ]\n" phab_result/output.vcf.gz | sort | uniq -c + 1 0/1 1/0 + 1 1/0 0/1 + 6 1/1 1/1 +``` +(We can ignore the phasing differences (`0/1` vs. `1/0`). These pipelines reported the parental alleles in a different order) + +MSA +--- + +If you read the `truvari phab --help` , you may have noticed that the `--comp` VCF is optional. This is by design so that we can also harmonize the variants inside a single VCF. By performing a multiple-sequence alignment across samples, we can better represent variation across a population. To see this in action, let's run `phab` on all 86 samples in the `repo_utils/test_files/phab_base.vcf.gz` +```bash +truvari phab -b phab_base.vcf.gz \ + -f phab_ref.fa \ + -r chr1:700-900 \ + -o msa_example.vcf.gz +``` + +As a simple check, we can count the number of variants before/after `phab`: +```bash +bcftools view -r chr1:700-900 phab_base.vcf.gz | grep -vc "#" +bcftools view -r chr1:700-900 msa_example.vcf.gz | grep -vc "#" +``` +The `160` original variants given to `phab` became just `60`. + +Better yet, these fewer variants occur on fewer positions: +```bash + +bcftools query -r chr1:700-900 -f "%POS\n" phab_base.vcf.gz | sort | uniq | wc -l +bcftools query -r chr1:700-900 -f "%POS\n" msa_example.vcf.gz | sort | uniq | wc -l +``` +This returns that the variants were over `98` positions but now sit at just `16` + +We can also observe changes in the allele frequency after running `phab`: +```bash +bcftools +fill-tags -r chr1:700-900 phab_base.vcf.gz | bcftools query -f "%AC\n" | sort -n | uniq -c +bcftools +fill-tags -r chr1:700-900 msa_example.vcf.gz | bcftools query -f "%AC\n" | sort -n | uniq -c +``` +The allele-count (AC) shows a 15% reduction in singletons and removal of all variants with an AF > 0.50 which would have suggested the reference holds a minor allele. +```txt + original phab + # AC # AC + 39 1 33 1 + 18 2 4 2 + 3 3 2 3 + 3 4 2 4 + 2 5 1 5 + ... + 3 69 1 35 + 1 89 1 40 + 8 109 1 53 + 1 132 1 56 + 1 150 1 81 +``` + +(TODO: pull the adotto TR region annotations and run this example through `truvari anno trf`. I bet we'll get a nice spectrum of copy-diff of the same motif in the `phab` calls.) + +`--align` +========= +By default, `phab` will make the haplotypes and use an external call `mafft` to perform a multiple sequence alignment between them and the reference to harmonize the variants. While this is the most accurate alignment technique, it isn't fast. If you're willing to sacrifice some accuracy for a huge speed increase, you can use `--align wfa`, which also doesn't require an external tool. Another option is `--align poa` which performs a partial order alignment which is faster than mafft but less accurate and slower than wfa but more accurate. 
However, `poa` appears to be non-deterministic, which is not ideal for some benchmarking purposes.
+
+Limitations
+-----------
+* Creating and aligning haplotypes is impractical for very long sequences and may be practically impossible for entire human chromosomes. Therefore, `truvari phab` is recommended to be run only on sub-regions.
+* By giving the variants new representations, variant counts will likely change.
+* Early testing of `phab` has been on phased variants. While it can run on unphased variants, we can't yet recommend it. If regions contain unphased Hets or overlapping variants, it becomes more difficult to build a consensus sequence. So you can try out unphased variants, but proceed with caution.
+
diff --git a/docs/v4.3.1/refine.md b/docs/v4.3.1/refine.md
new file mode 100644
index 00000000..eb2ef07a
--- /dev/null
+++ b/docs/v4.3.1/refine.md
@@ -0,0 +1,142 @@
+As described in the [[phab|phab]] documentation, a constraint on Truvari `bench` finding matches is that there needs to be some consistency in how the variants are represented. To help automate the process of running Truvari `phab` on a benchmarking result and recomputing benchmarking performance on harmonized variants, we present the tool `refine`.
+
+Quick Start
+===========
+
+Basic
+-----
+After making a `bench` result:
+```bash
+truvari bench -b base.vcf.gz -c comp.vcf.gz -o result/
+```
+Use `refine` on the `result/`:
+```bash
+truvari refine -r subset.bed -f ref.fa result/
+```
+
+The regions spanned by `subset.bed` should be small and focused around the breakpoints of putative FNs/FPs. Haplotypes from these boundaries are fed into a realignment procedure which can take an extremely long time on e.g. entire chromosomes. Also, the genotypes within these regions must be phased.
+
+Whole genome
+------------
+After making a `bench` result:
+```bash
+truvari bench -b base.vcf.gz -c comp.vcf.gz --includebed hc_regions.bed -o result/
+```
+Use `refine` on the `result/`, analyzing only the regions with putative FP/FN that would benefit from harmonization:
+```bash
+truvari refine -R -U -f ref.fa --regions result/candidate.refine.bed result/
+```
+
+Tandem Repeats
+--------------
+For benchmarks such as the [GIAB TR](https://www.biorxiv.org/content/10.1101/2023.10.29.564632v1), a TR caller may analyze a different subset of regions. In order to avoid unnecessarily penalizing the performance with FNs from unanalyzed regions:
+
+```bash
+truvari bench -b base.vcf.gz -c comp.vcf.gz --includebed hc_regions.bed -o result/
+```
+
+Use `refine` on the `result/`, analyzing only the `hc_regions.bed` covered by the TR caller's `tool_regions.bed`:
+```bash
+truvari refine -f ref.fa --regions tool_regions.bed result/
+```
+
+Output
+======
+* `refine.variant_summary.json` - result of re-evaluating calls within the specified regions. Same structure as [[summary.json|bench#summaryjson]]
+* `refine.regions.txt` - Tab-delimited file with per-region variant counts
+* `refine.region_summary.json` - Per-region performance metrics
+* `phab_bench/` - Bench results for the subset of variants that were harmonized
+
+To see an example output, look at [test data](https://github.com/ACEnglish/truvari/tree/develop/answer_key/refine/refine_output_one)
+
+Using `refine.regions.txt`
+==========================
+| Column | Description |
+| ----------------- | --------------------------------------- |
+| chrom | Region's chromosome |
+| start | Region's start |
+| end | Region's end |
+| in_tpbase | Input's true positive base count |
+| in_tp | Input's true positive comparison count |
+| in_fp | Input's false positive count |
+| in_fn | Input's false negative count |
+| refined | Boolean of whether the region was re-evaluated |
+| out_tpbase | Output's true positive base count |
+| out_tp | Output's true positive comparison count |
+| out_fn | Output's false negative count |
+| out_fp | Output's false positive count |
+| state | True/False state of the region |
+
+
+Performance by Regions
+======================
+
+Because `truvari phab` can alter variant counts during harmonization, one may wish to assess the performance on a per-region basis rather than a per-variant basis. In the `refine.regions.txt`, a column `state` will have a TP/FN/FP value as defined by the following rules:
+
+```python
+false_pos = (data['out_fp'] != 0)
+false_neg = (data['out_fn'] != 0)
+any_false = false_pos | false_neg
+
+true_positives = (data['out_tp'] != 0) & (data['out_tpbase'] != 0) & ~any_false
+
+true_negatives = (data[['out_tpbase', 'out_tp', 'out_fn', 'out_fp']] == 0).all(axis=1)
+
+baseP = (data['out_tpbase'] != 0) | (data['out_fn'] != 0)
+compP = (data['out_tp'] != 0) | (data['out_fp'] != 0)
+```
+
+This logic has two edge cases to consider. 1) A region with at least one false-positive and one false-negative will be counted as both a false-positive and a false-negative. 2) Regions within `--refdist` may experience 'variant bleed' where they e.g. have an out_tp, but no other variants because a neighboring region actually contains the corresponding `out_tpbase`. For the first case, we simply count the region twice and set its state in `refine.regions.txt` to "FP,FN". For the second case, we set the state to 'UNK' and ignore it when calculating the region summary. Future versions may figure out exactly how to handle (prevent?) 'UNK' regions.
+
+These by-region state counts are summarized and written to `refine.region_summary.json`. The definitions of the metrics inside this json are:
+| Key | Definition | Formula |
+|--------|----------------------------------------------|---------------------------------|
+| TP | True Positive region count | |
+| TN | True Negative region count | |
+| FP | False Positive region count | |
+| FN | False Negative region count | |
+| base P | Regions with base variant(s) | |
+| base N | Regions without base variant(s) | |
+| comp P | Regions with comparison variant(s) | |
+| comp N | Regions without comparison variant(s) | |
+| PPV | Positive Predictive Value (a.k.a. precision) | TP / comp P |
+| TPR | True Positive Rate (a.k.a. recall) | TP / base P |
+| TNR | True Negative Rate (a.k.a. specificity) | TN / base N |
+| NPV | Negative Predictive Value | TN / comp N |
+| ACC | Accuracy | (TP + TN) / (base P + base N) |
+| BA | Balanced Accuracy | (TPR + TNR) / 2 |
+| F1 | F1 score | 2 * ((PPV * TPR) / (PPV + TPR)) |
+| UND | Regions without an undetermined state | |
+
+Even though PPV is synonymous with precision, we use these abbreviated names when dealing with per-region performance in order to help users differentiate from the by-variant performance reports.
+
+`--align`
+=========
+By default, Truvari will make the haplotypes and use an external call to `mafft` to perform a multiple sequence alignment between them and the reference to harmonize the variants. While this is the most accurate alignment technique, it isn't fast. If you're willing to sacrifice some accuracy for a huge speed increase, you can use `--align wfa`, which also doesn't require an external tool. Another option is `--align poa`, which performs a partial order alignment; it is faster than mafft but less accurate, and slower than wfa but more accurate. However, `poa` appears to be non-deterministic, which is not ideal for some benchmarking purposes.
+
+`--use-original-vcfs`
+=====================
+
+By default, `refine` will use the base/comparison variants from the `bench` result's `tp-base.vcf.gz`, `fn.vcf.gz`, `tp-comp.vcf.gz`, and `fp.vcf.gz` as input for `phab`. However, this contains a filtered subset of the variants originally provided to `bench`, since it removes variants e.g. below `--sizemin` or not `--passonly`.
+
+With the `--use-original-vcfs` parameter, all of the original calls from the input vcfs are fetched. This parameter is useful for recovering matches in situations where variants in one call set are split into two variants which are smaller than the minimum size analyzed by `bench`. For example, imagine a base VCF with a 20bp DEL, a comp VCF with two 10bp DELs, and `bench --sizemin 20` was used. `--use-original-vcfs` will consider the two 10bp comp variants during phab harmonization with the 20bp base DEL.
+
+
+`--regions`
+===========
+
+This parameter specifies which regions to re-evaluate. If this is not provided, the original `bench` result's `--includebed` is used. If both `--regions` and `--includebed` are provided, the `--includebed` is subset to only those regions intersecting `--regions`.
+
+This parameter is helpful for cases when the `--includebed` is not the same set of regions that a caller analyzes. For example, if a TR caller only discovers short tandem repeats (STRs), but a benchmark has TRs of all lengths, it isn't useful to benchmark against the non-STR variants. Therefore, you can run `bench` on the full benchmark's regions (`--includebed`), and automatically subset to only the regions analyzed by the caller with `refine --regions`.
+
+Note that the larger these regions are, the slower MAFFT (used by `phab`) will run. Also, when performing the intersection as described above, there may be edge effects in the reported `refine.variant_summary.json`. For example, if a `--regions` entry partially overlaps an `--includebed` region, you may not be analyzing a subset of calls looked at during the original `bench` run. Therefore, the `*summary.json` should be compared with caution.
+
+`--use-region-coords`
+=====================
+
+When intersecting `--includebed` with `--regions`, use the `--regions` coordinates. By default, `refine` will prefer the `--includebed` coordinates. However, the `--regions` coordinates should be used when using the `candidate.refine.bed` to limit analysis to only the regions with putative FP/FN that would benefit from harmonization, for example when performing whole-genome benchmarking.
+
+`--reference`
+=============
+
+By default, the reference is pulled from the original `bench` result's `params.json`. If a reference wasn't used with `bench`, it must be specified for `refine`, as it's used by `phab` to realign variants.
\ No newline at end of file
diff --git a/docs/v4.3.1/segment.md b/docs/v4.3.1/segment.md
new file mode 100644
index 00000000..953e5822
--- /dev/null
+++ b/docs/v4.3.1/segment.md
@@ -0,0 +1,18 @@
+Segmentation: Normalization of SVs into disjoint genomic regions
+
+For SVs with a span in the genome (currently only DELs), split the overlaps into disjoint regions. This is an experimental tool that explores the possibility of assisting SV association analysis.
+
+This tool adds an INFO field `SEGCNT`, which holds the number of original SVs that overlap the newly reported region. It also adds a FORMAT field `SEG`, which is the 'allele coverage' per-sample. For example, if a sample has two overlapping heterozygous deletions, the shared region will have `SEG=2`. If the two deletions were homozygous, then `SEG=4`.
+
+In the below example, we have three deletions found across three samples.
+
+![](https://github.com/spiralgenetics/truvari/blob/develop/imgs/segment_example.png)
+
+The annotations `segment` adds for the regions would then be:
+| Region | INFO/SEGCNT | S1/SEG | S2/SEG | S3/SEG |
+|--------|-------------|--------|--------|--------|
+| A | 1 | 2 | 0 | 0 |
+| B | 2 | 2 | 1 | 0 |
+| C | 3 | 2 | 2 | 2 |
+| D | 2 | 2 | 1 | 0 |
+| E | 1 | 0 | 1 | 0 |
\ No newline at end of file
diff --git a/docs/v4.3.1/stratify.md b/docs/v4.3.1/stratify.md
new file mode 100644
index 00000000..353dcab5
--- /dev/null
+++ b/docs/v4.3.1/stratify.md
@@ -0,0 +1,58 @@
+`stratify` is a helper utility for counting variants within bed regions, which is essentially the same as running `bedtools intersect -c`. When working with benchmarking results, there are four VCFs to count (tp-base, tp-comp, fn, fp). Instead of running bedtools four times and collating the results, `stratify` can be given a single `bench` result directory to generate the counts.
+
+For example:
+```bash
+$ truvari stratify input.bed bench/
+chrom  start   end     tpbase  tp  fn  fp
+chr20  280300  280900  0       0   0   0
+chr20  100000  200000  1       1   0   0
+chr20  642000  642350  1       1   2   1
+```
+
+The output from this can then be parsed to generate more details:
+
+```python
+import pandas as pd
+import truvari
+
+df = pd.read_csv("stratify.output.txt", sep='\t')
+
+# If the input.bed didn't have a header (so we couldn't use the `--header` parameter), we need to name the columns
+df.columns = ['chrom', 'start', 'end', 'tpbase', 'tp', 'fn', 'fp']
+
+# Create the precision, recall, and f1 for each row
+metrics = df[["tpbase", "tp", "fn", "fp"]].apply((lambda x: truvari.performance_metrics(*x)), axis=1)
+
+# metrics is now a Series of tuples; let's separate them into columns
+metrics = pd.DataFrame(metrics.to_list(), columns=["precision", "recall", "f1"])
+
+# Extend the dataframe's columns
+df = df.join(metrics)
+df.head()
+```
+Which gives the result:
+```
+   chrom   start     end  tpbase  tp  fn  fp  precision  recall        f1
+0  chr20  135221  239308       1   1   0   0       1.00    1.00  1.000000
+1  chr20  260797  465632       3   3   3   1       0.75    0.50  0.600000
+2  chr20  465866  622410       1   1   0   0       1.00    1.00  1.000000
+3  chr20  623134  655257       1   1   3   1       0.50    0.25  0.333333
+4  chr20  708338  732041       1   1   1   0       1.00    0.50  0.666667
+```
+
+```
+usage: stratify [-h] [-o OUT] [--header] [-w] [--debug] BED VCF
+
+Count variants per-region in vcf
+
+positional arguments:
+  BED                   Regions to process
+  VCF                   Truvari bench result directory or a single VCF
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -o OUT, --output OUT  Output bed-like file
+  --header              Input regions have header to preserve in output
+  -w, --within          Only count variants contained completely within region boundaries
+  --debug               Verbose logging
+```
\ No newline at end of file
diff --git a/docs/v4.3.1/vcf2df.md b/docs/v4.3.1/vcf2df.md
new file mode 100644
index 00000000..14d9f06c
--- /dev/null
+++ b/docs/v4.3.1/vcf2df.md
@@ -0,0 +1,81 @@
+We enjoy using [pandas](https://pandas.pydata.org/)/[seaborn](https://seaborn.pydata.org/) for python plotting, so we've made the command `truvari vcf2df`. This will turn a VCF into a pandas DataFrame and save it to a file using joblib. The resulting DataFrame will always have the columns:
+* chrom : variant chromosome
+* start : 0-based start from pysam.VariantRecord.start
+* end : 0-based end from pysam.VariantRecord.stop
+* id : VCF column ID
+* svtype : SVTYPE as determined by `truvari.entry_variant_type`
+* svlen : SVLEN as determined by `truvari.entry_size`
+* szbin : SVLEN's size bin as determined by `truvari.get_sizebin`
+* qual : VCF column QUAL
+* filter : VCF column FILTER
+* is_pass : boolean of whether the filter is empty or PASS
+
+Optionally, `vcf2df` can attempt to pull `INFO` and `FORMAT` fields from the VCF and put each field into the DataFrame as a new column. For FORMAT fields, the VCF header definition's `Number` is considered and multiple columns may be added. For example, the `AD` field, typically holding Allele Depth, has `Number=A`, indicating that there will be one value for each allele. Truvari assumes that all VCFs hold one allele per-line, so there are only 2 alleles described per-line, the reference and alternate allele. Therefore, two columns are added to the DataFrame, `AD_ref` and `AD_alt`, corresponding to the 0th and 1st values from the AD field's list of values. Similarly, for PL (genotype likelihood) with `Number=G`, there are three values, and the columns created are named `PL_ref`, `PL_het`, `PL_hom`.
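+
+As a quick illustration of this column expansion, the following is a minimal sketch of inspecting a DataFrame made with `--info --format`; the file name `example.jl` and the presence of `AD`/`PL` fields in the input VCF are assumptions for illustration, and the expanded column names may carry a sample-name prefix unless `--no-prefix` is used:
+```python
+import joblib
+
+# Load a DataFrame previously written by e.g. `truvari vcf2df --info --format input.vcf.gz example.jl`
+data = joblib.load("example.jl")
+
+# Number=A / Number=G fields are split into per-allele / per-genotype columns
+print([c for c in data.columns if "AD_" in c or "PL_" in c])
+
+# The core columns listed above are always present
+print(data[["chrom", "start", "end", "svtype", "szbin", "is_pass"]].head())
+```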
+
+After you've created your benchmarking results with `truvari bench`, you'll often want to plot different views of your results. `vcf2df --bench-dir` can parse a truvari output directory's multiple VCF files and add a 'state' column:
+* state : The truvari state assigned to the variant
+  * tpbase : Parsed from the tp-base.vcf
+  * tp : Parsed from the tp-comp.vcf
+  * fp : Parsed from the fp.vcf
+  * fn : Parsed from the fn.vcf
+
+The created DataFrame is saved into a joblib file, which can then be plotted as simply as:
+```python
+import joblib
+import seaborn as sb
+import matplotlib.pyplot as plt
+
+data = joblib.load("test.jl")
+p = sb.countplot(data=data[data["state"] == 'tp'], x="szbin", hue="svtype", hue_order=["DEL", "INS"])
+plt.xticks(rotation=45, ha='right')
+p.set(title="True Positives by svtype and szbin")
+```
+![](https://github.com/spiralgenetics/truvari/blob/develop/imgs/truv2df_example.png)
+
+This enables concatenation of Truvari results across multiple benchmarking experiments for advanced comparison. For example, imagine there are multiple parameters used for SV discovery over multiple samples. After running `truvari bench` on each of the results with the output directories named `params/sample/` and each converted to DataFrames with `truvari vcf2df`, we can expand/concatenate the saved joblib DataFrames with:
+
+```python
+import glob
+import joblib
+import pandas as pd
+
+files = glob.glob("*/*/data.jl")
+dfs = []
+for f in files:
+    params, sample, frame = f.split('/')
+    d = joblib.load(f)
+    d["params"] = params
+    d["sample"] = sample
+    dfs.append(d)
+df = pd.concat(dfs)
+joblib.dump(df, "results.jl")
+```
+
+To facilitate range queries, PyRanges is helpful. `vcf2df` results can be parsed quickly by pyranges with the command:
+```python
+import pyranges
+
+result = pyranges.PyRanges(df.rename(columns={"chrom": "Chromosome", "start": "Start", "end": "End"}))
+```
+
+```
+usage: vcf2df [-h] [-b] [-i] [-f] [-s SAMPLE] [-n] [-S] [-c LVL] [--debug] VCF JL
+
+Takes a vcf and creates a data frame. Can parse a bench output directory
+
+positional arguments:
+  VCF                   VCF to parse
+  JL                    Output joblib to save
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -b, --bench-dir       Input is a truvari bench directory
+  -i, --info            Attempt to put the INFO fields into the dataframe
+  -f, --format          Attempt to put the FORMAT fileds into the dataframe
+  -s SAMPLE, --sample SAMPLE
+                        SAMPLE name to parse when building columns for --format
+  -n, --no-prefix       Don't prepend sample name to format columns
+  -S, --skip-compression
+                        Skip the attempt to optimize the dataframe's size
+  -c LVL, --compress LVL
+                        Compression level for joblib 0-9 (3)
+  --debug               Verbose logging
+```
\ No newline at end of file
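+
+As a further sketch of what the 'state' column enables, per-size-bin recall can be derived directly from a `--bench-dir` DataFrame; the file name `bench.jl` is illustrative:
+```python
+import joblib
+
+# DataFrame previously written by e.g. `truvari vcf2df -b bench_result/ bench.jl`
+data = joblib.load("bench.jl")
+
+# Tally the truvari states per size bin
+counts = data.groupby(["szbin", "state"]).size().unstack(fill_value=0)
+
+# Recall per bin, using recall = TP-base / (TP-base + FN)
+counts["recall"] = counts.get("tpbase", 0) / (counts.get("tpbase", 0) + counts.get("fn", 0))
+print(counts)
+```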