From 03d94f34e8c3b19ac6acd44705093b54199061f5 Mon Sep 17 00:00:00 2001 From: Mark Bicknell Date: Mon, 3 Jun 2024 10:24:43 +0100 Subject: [PATCH 1/2] DOR-736 Add test for handling very short reads that are entirely trimmed by the ScalerNode and fix the crash they cause. --- dorado/read_pipeline/ScalerNode.cpp | 10 +++++++--- tests/data/pod5/degenerate/trimming_bomb.pod5 | Bin 0 -> 15472 bytes tests/test_simple_basecaller_execution.sh | 3 +++ 3 files changed, 10 insertions(+), 3 deletions(-) create mode 100644 tests/data/pod5/degenerate/trimming_bomb.pod5 diff --git a/dorado/read_pipeline/ScalerNode.cpp b/dorado/read_pipeline/ScalerNode.cpp index 7deca4505..2c4e64f94 100644 --- a/dorado/read_pipeline/ScalerNode.cpp +++ b/dorado/read_pipeline/ScalerNode.cpp @@ -141,7 +141,7 @@ void ScalerNode::input_thread_fn() { int trim_start = 0; if (is_rna_model) { trim_start = determine_rna_adapter_pos(*read, m_model_type); - if (m_trim_rna_adapter) { + if (m_trim_rna_adapter && trim_start < read->read_common.get_raw_data_samples()) { read->read_common.raw_data = read->read_common.raw_data.index({Slice(trim_start, at::indexing::None)}); read->read_common.rna_adapter_end_signal_pos = 0; @@ -241,8 +241,12 @@ void ScalerNode::input_thread_fn() { utils::DEFAULT_TRIM_MIN_ELEMENTS); } - read->read_common.raw_data = - read->read_common.raw_data.index({Slice(trim_start, at::indexing::None)}); + if (trim_start < read->read_common.get_raw_data_samples()) { + read->read_common.raw_data = + read->read_common.raw_data.index({Slice(trim_start, at::indexing::None)}); + } else { + trim_start = 0; + } } read->read_common.num_trimmed_samples = trim_start; diff --git a/tests/data/pod5/degenerate/trimming_bomb.pod5 b/tests/data/pod5/degenerate/trimming_bomb.pod5 new file mode 100644 index 0000000000000000000000000000000000000000..2ca66cb8e28141aeb55108d5f7b953dfd0531302 GIT binary patch literal 15472 zcmeHOU2I&%6}}EmFsTCuQknvyS54ZQg3aAuuY+5h;D&_e#|@!aRn?ojyLatX*1MbC zyNQF+VihVy0*MGAnj+L96}5Q)DV0(w50x6EREiJ}d59_=qAF@pix3Y;@Q@a0`h7EV zypwBh;xu_EmB%_ebLO1+J2P|U%$e(F$9IgaT6yQnyPp5~8;9@QzU-BYFTGiLq578% zJ9qAQBH=g}5l$|1oIZzPrE{~h!ddS0AcfG?J(u4LwAN^yW4j>vA$j_X8e6&qgk$1k&XHt#~VxyW)b)NTbH>TY4$!5hYe>PXt_c#luG4_1H~{iG>|R~ zX9jXXI5ZF>!%`}jE+k6XBG}Yey1N`_3~Bbg55GN-#f=U}-e=^{wTyjS1Ly`Y+z4m} z)AdT&bY4KeT;Sh_Od-76SN?NYA~(j_XW_V%hf&A+Wx_P`6to$6{i2Up)GrqtDW1x zi@x8&`d$OredkykVNisORo^Kq%Hi$*qVGMBE7ue2C%Ayije)ZuV`#O|0gRzXjYO{- zbm8I!QwJi{eg`~ ze0Wu3vgh9Vk9X_6MPAB}G1s0BHzQ3M_o}^cv&w`%Ku0L&vjFxvt^-bgdi6TsQ&Z+@ zbbx2+n(L(lDC0BSlnh4TqY8=ttW;}iNgpoh!~a!%cp4j1&Umig%bov+a42cLwYc;D z9O{$3e-P;rgCBtov30qe%a=R<yYIPU<}X$ zI13%|DqsT85BLDO;sjt8Fa}r+co+2?H*&fA6J-zF%DP>X2NtDkc{WFlps=T0o#eUN z3*H>`A+lFd7rOv-L7lgwh*<~EB^Wu4GHi|!@E_FmFXtSx4EY&oXYC6rEgAtu@Eq3l zDn8aDd>E3mCp_RB0?%oyF3ySYu%FEX%_yAqqXYHOIS#&$ugt4Vkn`YkRoh*vZM$Bs z7u(kB1>aVSx8Ldm=eQZ}ZH3iB*zoJMW;udsr28|5GS67$)>-9f)66f2IR*a-tBh-v z;dy7uaAx+DBgZ)k-uE=WvIFfllzfyO=r@9h62H_wadBSZP?aW9 zyOQ&1w{bi0dZQNA3bl&gjDkkwM`f6JM>M~hE6(NPyB>B9TkW~X>jp5~0+_3#(W-LI z&4Tx22mVzyzcDC}f%ij;-<~7d8>LEZzh4L|6(21XTZPCk)QS+vdnnTn8(HMwf>-)Q z`>tJ1iNaX}-iZ$U97i})4;v6bHS$%@HiP$&=2v@7WXHY6HODpzwQ2;b+K+XGaXWo7r2R**uUI+gNx{b^3x3l2UcL?rRwk&rDr{n(XK|eODzASvM2E~F88`wFc z``g9!F=O$&z(t%X>TZj9S8mr6$8|S{V>*q{&t3Oh;9omoch;nD>WXI>`jPpzF_^lWS<-$3a@USEn zh>c7{HV)S^17%_d!G%9(6Ok4i$AJo?CYDHWp>Gy~i3OMX$7A$PIAzN)qa){li7%K~ zBEf~DE7yjLbK4zGX#`pyn{$qwN_=y<5;6guX==xr1-MdVC zXJYle6RWQiK5NQPP%$AKH}l>{W5^1E&zii$hW?thj_w4_LKWw=;xc~I?3~`|gkNiW zpSSW~5kZ*vN;uj20U7x7h7!X=xnQ`MO=dIcVlkCZ=7zkXOxgWIHGH`H}6UD`7DoG{Zswn+Fy~ zsXXb^8A+?87}jB^(MYk(6`IhM8rXd)hMMZQ1q72|SCKeOjd56NXv0_ZJG!FQCf6u! zn+E_qqq2Yp0iys0uUHNI#_(WzFyW^&UcTP+r-H&@p)|R-P^h>3NkB7dgw3WOh0Q3D z$Axw}Jv2Bxm`i#Htb_8H%9EV-_m`t7Ki{hCA;AR7ZPSMB=|m=}_^HD;ZQU_|`(MIi zhfJWZHWnf5_6PRY8hezQZC316p96JAf58yza1`> z2J%5EF_cJUy;6qL_?%P(b z(ueIP(}P}eFp+ipH&Q~blFiH%-5>%sG`Z6VX1JCSj>?NPaoBWs@h`6A#Rz&8O80UiZ#J^l!=7w|K{i-6w% zjss2t{tWmVfJ?gzYyB?3T0j~w3K$0jfI8r5z>9#F0lx;k0iezdn&krS1}5^u5ED18 zFkAuaDq%eOdcC)B>rr#4-TZV_`^{RbQNRY+S1!;mZxUSl#WL1hIp0!6U}8qC<|SFC zYU~)lS!k5&+Q7HadaHr^U#N2CE<*PGp`EtxAd{_n{ebMOsW6C|aEoK%PuHhv&FGTr z$|V_1+%vFQtF7sL*dT>FyIe)jD-}I%j;h@Bo8{>ia>8N_R})a6AM~vmz0miz$wsYJ zhcxCg^d)Q#Z5!N@)szlv&bixStC=^-#(G=9l&kgMhxp3?t{v_p>IW__+Ns|LYywmO zPXT@f_$}affHwhe0nP%>0p11t9qBLsQrW?5c6d0O-m<|&F06jp+#kvVd%U!UO`JNpudt{GCcqt zksRKY@YZK?>r);SSt6NA-sgGiJx|nBTRp{hD>|Ia4rfw{M|O@4Ok_5CUP|_?#MOe= z%D1Xf%T0QNUV5HtT#U||*UNY?~^{$CEq8{gf#$JC$xkA-+iBlFv!Ke*YN>} zXRQv4d$a3wejoYg)t}#Qn=;(Xi#@-cf3@4WUgtOOMx*fCoWMeP2CxgT25=5W$}4~$ z0`3N!!$t21U;?lX@G%blQ-BJ<1-y%To&)Rxpt$n~*c!MwEvJlmzS}yBwvMvHtEC!} z&Mf@sv=30N3Va_xLiyNF;N6SwWI$I?PU`@ypW#+OSS|Xng`2qgmTJ5yA@2k5k7(b# zYJUuQb~)Nb?Q-;4^Z{s{==Ti_M}@SiL-%78Z&oAv-Rg#g`T;*gIdAT7O&}p^SfO>c z6Ll!ZH#NIltyF4;5$+bITpxC@d|AY|0k$hLjgN`vKphup{gTHqaWw5Q@x3?xez6+u z^y#+deUsIV()*!G5jk1(pS6v2;H7RUOa;|ySaI^;Wk07ZUVH9&w&MzD91Hl~3;d(V zGw*Low*A1_J_FY$ALwu^#EYdr-J0>))FkCw(re7&H*=7S8pHj_m$LNhuq~{QBCnP4 z3T{R@7d)$VJ@eY;9;-KS(JtUJic2)6n0L{saw&5DYT6#Zy0U6z+@sf~^IV6rYRr7k zp4TdTXr}Dxt0`;N<4dM&w~pt%zj8fNUgjGyKimh8TNCq-wnjl4(dFi?f4*KWpk zd+uMZA+|5y3CjCehc`AQ3%fnmQ-)K6wX8&~tZNkD;OoBjuB7h%e1GqE6eFVS7(FGU!?igKgp-;u=f(yMKqYK_+@@Han!DU^~#^{2}dOQ@P3ohjk$LNAf z|Bl4yf(!lC7+r9w|7eUZxb*K>j4rs8Cz;=<2*0o|;UWngU#mP8`kvgbQ*=r81%0~w z;+;EAvE??;$)|KrI>nmW^Q{LrFvZiE;k>Cp)|1ZV@9Zdl_azQI#CSfPxr_H}??O8l zz<(CNx;TmmFkR|CvU+!DhgQg76)IO`s<^0;mtah%~y_h2Wke4ciStFYo9S#WPh>03};Nf@XxAT*g2*?$@fft+&|t)o=bj)B|n4hFIe(3EcqEa_$ZeA z4A8A|zAgC~wEuGrWD@`X#)bYTa-shX;rSges~+c&-wU|h{}pcTxx8O~(~cdxzOz%G z_w+%~!@&QTW4*K``T^{_*ch~T4+3@pIJfeDu<`fqb)YTwe^-0ZLN{7OSDn*q@q_SF zZY~pg;lZY#n74WMi@jMxW`vjT(lLK~m3PSI>r#Cu?R{exyhj8f{{KYJn*THY4_3M; A!2kdN literal 0 HcmV?d00001 diff --git a/tests/test_simple_basecaller_execution.sh b/tests/test_simple_basecaller_execution.sh index e5f072d7a..4246755cc 100755 --- a/tests/test_simple_basecaller_execution.sh +++ b/tests/test_simple_basecaller_execution.sh @@ -78,6 +78,9 @@ if $dorado_bin basecaller $model_5k_v43 $data_dir/duplex/pod5 --modified-bases 5 fi set -e +# Check that dorado handles degenerate reads without crashing +$dorado_bin basecaller $model_5k_v43 $data_dir/pod5/degenerate --skip-model-compatibility-check > $output_dir/error_condition.fq + echo dorado summary test stage $dorado_bin summary $output_dir/calls.bam From 1d389d0a8609c03d2345fed93366387bdc787553 Mon Sep 17 00:00:00 2001 From: Mark Bicknell Date: Mon, 3 Jun 2024 11:09:14 +0100 Subject: [PATCH 2/2] Fixed sign comparison error --- dorado/read_pipeline/ScalerNode.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dorado/read_pipeline/ScalerNode.cpp b/dorado/read_pipeline/ScalerNode.cpp index 2c4e64f94..69922d4d5 100644 --- a/dorado/read_pipeline/ScalerNode.cpp +++ b/dorado/read_pipeline/ScalerNode.cpp @@ -141,7 +141,8 @@ void ScalerNode::input_thread_fn() { int trim_start = 0; if (is_rna_model) { trim_start = determine_rna_adapter_pos(*read, m_model_type); - if (m_trim_rna_adapter && trim_start < read->read_common.get_raw_data_samples()) { + if (m_trim_rna_adapter && + size_t(trim_start) < read->read_common.get_raw_data_samples()) { read->read_common.raw_data = read->read_common.raw_data.index({Slice(trim_start, at::indexing::None)}); read->read_common.rna_adapter_end_signal_pos = 0; @@ -241,7 +242,7 @@ void ScalerNode::input_thread_fn() { utils::DEFAULT_TRIM_MIN_ELEMENTS); } - if (trim_start < read->read_common.get_raw_data_samples()) { + if (size_t(trim_start) < read->read_common.get_raw_data_samples()) { read->read_common.raw_data = read->read_common.raw_data.index({Slice(trim_start, at::indexing::None)}); } else {