From 3433853010342d1b85622f38a4b5975b31f7ff57 Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Sun, 14 Jul 2024 15:55:27 -0700 Subject: [PATCH] Parses for `GenomicRangesList` (#44) - Add methods to parse RDS files containing `GenomicRangesList` - Fix bug in reading strand information; mostly RLE vectors defined in S4Vectors package. - Update tests and documentation --- CHANGELOG.md | 6 +++++ src/rds2py/granges.py | 52 ++++++++++++++++++++++++++++++++----- tests/data/grangeslist.rds | Bin 0 -> 7033 bytes tests/test_granges.py | 13 ++++++++-- 4 files changed, 62 insertions(+), 9 deletions(-) create mode 100644 tests/data/grangeslist.rds diff --git a/CHANGELOG.md b/CHANGELOG.md index 84daa5b..2200fc8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # Changelog +## Version 0.4.4 + +- Add methods to parse RDS files containing `GenomicRangesList` +- Fix bug in reading strand information; mostly RLE vectors. +- Update tests and documentation + ## Version 0.4.0 - 0.4.3 - Migrate to the new class implementations diff --git a/src/rds2py/granges.py b/src/rds2py/granges.py index 3347a4f..7e7081b 100644 --- a/src/rds2py/granges.py +++ b/src/rds2py/granges.py @@ -1,6 +1,6 @@ -from genomicranges import GenomicRanges, SeqInfo -from iranges import IRanges from biocframe import BiocFrame +from genomicranges import GenomicRanges, GenomicRangesList, SeqInfo +from iranges import IRanges from .parser import get_class from .pdf import as_pandas_from_dframe @@ -36,12 +36,16 @@ def as_granges(robj): _seqnames = _as_list(robj["attributes"]["seqnames"]) - _strand_obj = robj["attributes"]["strand"]["attributes"]["values"] - _strands = _strand_obj["data"] + _strands = robj["attributes"]["strand"] + _fstrand = None if "attributes" in _strands: - if "levels" in _strands["attributes"]: - _levels_data = _strands["attributes"]["levels"]["data"] - _strands = [_levels_data[x] for x in _strands] + _lengths = _strands["attributes"]["lengths"]["data"] + _factors = _strands["attributes"]["values"]["data"] + _levels = _strands["attributes"]["values"]["attributes"]["levels"]["data"] + _strds = [_levels[x - 1] for x in _factors] + _fstrand = [] + for i, x in enumerate(_lengths): + _fstrand.extend([_strds[i]] * x) _seqinfo_seqnames = robj["attributes"]["seqinfo"]["attributes"]["seqnames"]["data"] _seqinfo_seqlengths = robj["attributes"]["seqinfo"]["attributes"]["seqlengths"][ @@ -71,6 +75,7 @@ def as_granges(robj): return GenomicRanges( seqnames=_seqnames, ranges=_ranges, + strand=_fstrand, names=_gr_names, mcols=_mcols, seqinfo=_seqinfo, @@ -111,3 +116,36 @@ def _as_list(robj): _data = _final return _data + + +def as_granges_list(robj): + """Parse an R object as a :py:class:`~genomicranges.GenomicRangesList.GenomicRangesList`. + + Args: + robj: + Object parsed from the `RDS` file. + + Usually the result of :py:func:`~rds2py.parser.read_rds`. + + Returns: + A ``GenomicRangesList`` object. + """ + + _cls = get_class(robj) + + if _cls not in ["CompressedGRangesList", "GRangesList"]: + raise TypeError(f"obj is not genomic ranges list, but is `{_cls}`.") + + _gre = as_granges(robj["attributes"]["unlistData"]) + + _groups = robj["attributes"]["partitioning"]["attributes"]["NAMES"]["data"] + _partitionends = robj["attributes"]["partitioning"]["attributes"]["end"]["data"] + + _grelist = [] + + current = 0 + for _pend in _partitionends: + _grelist.append(_gre[current:_pend]) + current = _pend + + return GenomicRangesList(ranges=_grelist, names=_groups) diff --git a/tests/data/grangeslist.rds b/tests/data/grangeslist.rds new file mode 100644 index 0000000000000000000000000000000000000000..5bf7bcd326556a58926654cc761c654112c08bbb GIT binary patch literal 7033 zcma)gXE>Z|*EX?{5D^4}C_y5kjUMbEB6^7~+UP`yAfnD-i$2(T2}ZZ|-rFEXFByzZ zVxkYiU<`wA_TJC)=l$Lv?|q#2ajkQ;Ypp+P9rw){dz<8+hipH`KbvhJbubvLmPNK? z7v_HZ{%;xmpWKSl%=Ew1e!pz)HNn6UL*n%>jx5H8?AZEJ`+1)z-ipZh64KUBb7xD5 z{#NM2!R<#+v6I78|69Q7GHca>B3c*sJga~j(#CA zt#!YUR#I0^6V4QUMA^ScU8_9;v@T4P|AO-MX~Z#>Zp8XWNS!e1Y>)UdT>zN&Cw(jV zZn>(-v=BIglvVS4re3P%kpnqty!KH|UBlB|tm#nVRbTAt;UX*g9N+sYf6i+GCL z>1RPnv!Lu29a#7N_Yh)Y<>RR=P>CiDB^;<<5%cMs>D~NUCPr zutl(!LwDiOBi-0+<(|U4wuT%x?maIj{L|#A!XgftfD~k5BS%|6%D4RN(ySl1B@NM! z*C)Na<5hiTUAAwF*N?p=)nC%&Hu4XxFF&UDL{LVF%PSJ4vJuUqXCjH^ojfUXs8sAs zyS|ct6MSYwra`PstIvmV!=Uf3k$<$|#$)|d=BK6qxGMi0=c4wtjP7>vGfu9_Z@C-A zyIkNOo(7k%_+v}3A@dThzh$!9!GdPpUjS*2)0>c{K5RBv+HXGrc38IW-a zvA%YV*!m06@i^VUN_L-LsV$V1zezZH#yVMzrkVa2X(ctD4;QJ1lg!F!###NXk22s< zERU%E)5KjPdZuJ7>bYU5RCH3n&uu-81Y|%_ydHfSgQ|)|@iPy{jb;8je3|OrKI8^c zr7B!C<@w4!6Z##mLcH!sDnHie(cqRWsPA)_t*LyjY4nO)a-LT|O#7cBJV}jFgWKq7 zD1gSBRHK|#qsZlHA~C%D4SkuCdJ)y*#Ahl0B_{pfDtZOSc#-8lL?1m%Y*tKo>*2?( zY=kD)sPIuQLOo7&>Et7@E~f;h7}wtM%rcL3c+QlPS>2x@HejSAJ6b2xkW`(nC3>ok z$Yq`x%w?_(*zylB7I(krI4nE%O-w{DLDZcWuoc#tp(ZYWn`oeMgwiI7e$q%=A07K< z_I$Ll!(NLmnlUiu$WrXd1h3s5wQ1Ld>5H&W*1HVUn}*f*>=s|vN-$_9iSBc;Oy5fq z&5D+^&pLW3rXoHJ80WQz=2koVsxS`& zEWhrKR1E^gzwQn=Eb5A>=nezq9^1JM$B^6VxKq|yj1*lMn59*FbJ)e&R|;L5P7Gw) zLmiVukGRuRvQbI1atYDmy<%F~>WHtI_FCBvuLXDZPv`^Wx<0S!hz!qXGVH@=_*e{J z_FCXS9QJ0?9X=*PzrC*P64MI)6S_ld8=Y#uM-4E5F`qqRmfAFmyl7QoPvBw8n68s8*t z?Y2Y#bzcgZJ$hs^7-_-I>2(@peF3xPn7-k)y9ndE2wPPUDZ@=_?p|S&0&Y5s{-xe0 z*Zw-}HxI!bfy(J#)cpFeqs2U23lpeBb)y z!5rib*t%o<%Vs@!_JzEnN~ zBjH9!hST~gu6I}73&!_n^vrBNx^Ae(RPO4tXu>+pVC9}jgQ99ge4#V@_q_MTiIjH5 zsg!n>E2P&c?WqwQmOlF#J}GAaOP}F1pA-rfOYm&khqtr!>u$xp6h9s;+1hRomCQ3$ zSR2n!Iliv?UylBu*NW3I&O*xRpQ05X_h$x`N+i|d-)Tci)@-oeS?Cw?Ju?CeLE2r# zAK0@&pX7`#+>^#l2vo9A6bnFfz-Nr;3FV`7SQ!-S`oUHa_b*cf$GO#IKExJJ){F`Et2) zy|ZH?)lVQ6+Jx3D{J=k?+Ctl+Evxm}n#jy_TI>k ztML-NpYYA**e3Sx#?3EsGRJ-W!5TdxSUhX6Wn51+0%U@&sW_UeI%iv6=-}FMKyJVa zj9WZ@z99}?!2+GMXE#FXVMz~VBT&^Fv!7$Sw^JZw6SYIdx3xa_jDvQkB+X+4fQs2i zq+X|LlKpg$GU8Yz?x}Pm@$6LW*5hgWJmOHbuh)Wj2$x6s4s8~{K15QK2|ICu_spa&tKeHW2t)l_h>T*!FOlUZETe)b;St&4rx{1R%!1jRU7H1 z?|C`&&+q|EU<{3PTVXsZlz$KJ7Xt-P`!DVTQIT-pi0CGb@fgGU`d}?^(z_8s@#Zo3 zL;!RrI7!q~h^5d;c52#Y@ulFMR*1k8$ONUy`QSW%2}IcHH}v{zF%{QgK-Iv>b) zVhqsowlG#)jqO4y;;Jr1XzFGoLTF0WBD9xQXgTbdbe$uG_WosSv&Zk3ADcWVE-Tv- zTAjwj+FKiL8Ey$4+&TS@BRi9Mw|4hz$0gWdL|$p1=ZwyUvo7JyWmI6L%z2{X3jrbF z*gY;u5?))*H)l(+;F)>Y{kHa7G)DOrp6Y}0)O(ZN7@oH4KavA;pSKzs@n~GD!C#28 zW)H|e-kBSauiTNQ!p%Fp;kq2k3r;gj=MyxXOp0n#Ftp-4o{YxN-5R^9S-*(xZ~}>) z;p@K#MDS13HUFB_!|NY!;ZKhyS?~_Wldtg5;IyHdgl}5ah4?K90pvHo1^Gjg^#-Gf z|NS7>nJ#D<8=7=9xqaSn_XZldwsF4oJP4sM_``L4n}043g9Y>oemxhS0~4+mAiC9q z=YcI?#9GI`_%&ol44pfk1nmJWIuWjr^610#GnQc8j>+@DMX<3d^TAv0;oa@N7n-~P zoM55_Efz<9xyN&DWd}RLv0fharSFInN$~#2DhnpENj7zeJXh9Tz3AQy{T(*SL(GGB z`$IT#HaVl_K(ibVp-(vd-Q=u(JNEEu$$_R~hObt7by=?>7-gw)!mHN_N}_AVi_r3N z3%g#VKh{M$-DSJ-VVm4Y4Sm9062~SZR6)E_mCOq~$_FnQ&U(qlT$;|P!Eu5Ye)?Yd zGN!oWoMo7q})Jwrya>L$LKmKmoJLnh#2WbGrvIE{<=MT1m}X zNwXAxP&m&%T_&lLTqcQoO(bD@M1w2w-S?0n(q~lCifo_zXr|2iX{P)hybpQu_TRvf zTjNBnAJh8~MEwUc2^wl?MKsaGI!V+%AZq_>$PH)9I}#)&cOqu{o#^#HfIcQBCJ=}z z6cm<8#wTf|6|va=PbIJw$(BcsC#HZvh*iTl|Ggmp16d=A7)nmOghqvEZTe3uLdzuD zqr^&F{;ihvZ`ttgk{XEKX+-aLf43<~Y|fQNT5&q>-#m?3my*P*iN1~htyq#MR6!{` zuro6z&uykk95)MJ`TkDNzS!ksfHj~U$V$)o;5j}0D)(L1^Q~}%f!s=jYxRXgeiiaq7<4lxwW!MpMJ)!;xbX4V)R7B^Cdwza&V@?o? zy?q;?w0mUcxgd1z{U3EdT!GTiLT}&5BSJn9rv`7`_>fVzWQ{pusyy@PUA|Pzm=g+X zyXLDc$)Ujx()KNP({D-+!UHf6Q%Vw4`#k{8E!2N~82CO<3}$f)E1=rHeC(c48pN1} zXBwN{f-e)8SXZwY#%fmMhRFDO{XPj*2I_=)&S_<^ANH~~(4jm>@B9C}Cy_0|8fc}2 z8Pf*V$kQTbUfG-`gyN6*frqZ)N2MKQ*`WSLB(P>V{>N2?_1*07Oz%t4K*COj=&9D& z@7*j79<8k6j%(3vnKZT~hTH1DsP44>B%}Ag<>KhP2U`H5i%lxHYa>-Z}Htv~?`+xxHumaxpi zPQ+-a_c{RO_n38ZJLGD86sSrAmp#=$Zc+kOU{qM79$c>UQNZBy44jkonkI}k_%?PU z)`lbScI6MsegaP_zLT=>2e0Yn0Sz`j(gt?KwQ%+jDv%o~$wTY911h_+iflXe1;PL! zZHF7cbp5wZVCHY?x^O^;2(*+l(r4oboLW3YQ&_1VObhSE(*ijQ=Fg}5KOT(3efi@j z)+jD>ATC?!b%PGU@a_DBzSYkAiIyh)*Y=c^mxoFxozLC&+`cl!BmUq_t|9ZxxVd%( zfU?40O^Wm_3*FhPqz+B^WOE2&j~N0x^|{T{qS9|D(C0q2H^sDws6lJN1Y5$Ft-f|3 zIl7(Mdi7FD@L}9gBrSY-j|WQlCN|G$McMFy2NVCw+RdCAEBDFVdGez89!G)Cb;dSey3b`E~s%A((v^o3x^i>%~QB=ZZMBQPa+mBuwV z1|M5pdRVl!G}-4DKGCslQp>$(BeI>}jtj%HF5D36YRyWLUo6>-w*8CFx=6iqYvtwAVmQctGPx3J~7>Q1NzaNed$PFu$x)HCvGLk(|`F>mnPd)6# zly-L#^o|8?2}qJ6(j%umOG(VjK+D$x)=NwPr_S51zR1%AIh&2u5WIYxpE-z3xBvn% zV))Qc9d&Si|1!IUbhCjIp35F~ROo6m5tzE02f6ZqMdD8bLtV?NINJVL-xC9X6rcp) z(^aXypzM#bR}Wo0kUyGDoU>)3FNj4+OZ5g>^8t6L@RzH33REPg63>i3Cc~4p(AuFAu?j{3X6Q7R|)m>}3IDC{~kU6f1eWXZ`kUi@{ zZbp1ngO#8|5i%cxbUz%Z$$p}qst=7{czMWBc96b}MI%*Gz7CfRMIjHOPw<+yJ$aMh z?d09soov9uR3jA0JS~cw!u2Gs*vGQx9~Sxlbdf!i8Ig+zbny(rP17H)`|8@e&eWDPv0EiQfd$pr&hQV>w5-#DnJ;hiY$)}Fkk;3!kOE>GaHPS+7 z4(IN)iMOhFHk-y3WuNQNk&j2J9R0+lpAK90 z5ZRuIh?|wrxoW00j9mXZK;bRI?AVD>=M-`P-!HSNsXJpo$(|6>&nV^BNaG&q$&RGyHXsSDW z6K$jYF#7zL-TliW+NXkuUI>`Ek`flq(nE#Hk)VV~sUKZ)j~%Ihy~mmm5xP-p#lub| zZcV#Fg=JHO2#z;+D^OebaYGM5QQHeGeXMg@@@sb*Il`kF80zdeRr}qG2I(Z8{LHc$ z;|z0_1AC0|T%G1%X$sgzjVQQ{;R}wm;dBT|@CfVL@XH7;IfXYf!qW_F z*iW34by}f|$Kr5#C%q1RxqBzvvNDk+LXE3{uKVNPwtyZbWayuB_uJ|zV}17%w%^y- zoIAX#JIn%cHUIRQr*sRc!#vw%+Uozkn(>UlV*`l@y1t??-GcoQfzbw9A`b_Ao#kOr z=QGn?GN=9v1USk9M^O-&6F$GGqpY%~xdo;(1}R|fsB$6wIc(*2uyBsMa)+b9HLkAa zsYS=pswhc_B3k+GHyK^-fL|+$lTv`MM%oeZKU@tM{ zw=j(8ciKl%W78aA4j!Mu_?JM13lfx@*${>48AY% z{N(Ghra2iYiq}$xG|*^QeEO4PJSuLH6VYb!Ci5diJnnK*e{5eH`|Gtt4dH~zlyfrQ z2P8hm=Dd2zUB%Fim4KS0_jd+8z(j}w#4mJAFQ*8Nr^esMJ^EJBdVsPYwHyZ#IXM*Q zt$w;TADZ!Z4A!|?S7#DtSogr_5j;PJ;az-pG)l-)c&tMe8Kk_II8Mh};nZj5Ed}+^ zuVfrj-CZkDZRb$6^V@4{q{#11zqz~ck$+4uZcC&ozfNs0y{0oy+8OxgI@N_=_hfyyh2Ucb~~Q73hZmkp%a-{t6zeEt#5KVIlZZrrsD^ z?`IhbYent=*-QFAX55>$k^R3C%AI3&=0sAs+sloK*!lIoJNuRu9pGhUz^q?O<_4R% z<22p0@i&|#JQ>w$EtV}!HfZr`lyS-(M1HDuOL7rya z8GLNMrH95J+ytMm-aW=-`D%X9LX&h|hw2jlss8Ff(5>}*4W!+#udVBFbqF;7)l^4l z`uIH$x*9as9`jkK$;&*Mx$@6l4RSWk$kg9LOXv!x9Z%9+)|YBMJpj z&0LpBA{>QBe5?4Q;l?sW52g02*10T~p_Rv8Db6uTKj{=aP89D`g|Y|6#QeH3{@Obt zK}8Au?yWrj<18s|Bi<-STJVjB$|I1)ngEY>ZvDsI^|B^^2s7uhual4Or@_nP7yIGD z#dk%Lhi1%H%}uo>nO*9?XwD8WGa65;>4lwHm{hgzQX_qBegVfkmtG8e^-t^JOU0Gvfv207mKAUrl0qFmHHJ$IS9!#94 zoE@MJs+&)XQROll-a38ix@tvFo|mF~y<4 z)w!)pB;;rUxPD{oxHj{TE1H)NqrDwgSMTtpC&#JD7H@YjkqGTw54`RV@T%6@yVS51M_FDASZOy}H6E(yU2I`&Xk!aDGdTGWPY&auyFd zmWflmrpxra5gnoZrQUD1atF&N@}=AP2op~IY{hSKG0N49hp&4?FSU9$xl~GkQI}>= zE6g~*g67(p>7@hmS#jZ?3~ROe#U0;uW$UpmjK<6Xfz<8-moZ(%NJ_gv#%T7R(Y^Hp zEpIko=S`$1mZL+1FTTqVwx7K$am%vug0ZXGHMGgRlufTFl5 RT-fX(G1_ah6q_d@`G58sCM^H} literal 0 HcmV?d00001 diff --git a/tests/test_granges.py b/tests/test_granges.py index 4d6f94a..f64cf8a 100644 --- a/tests/test_granges.py +++ b/tests/test_granges.py @@ -1,9 +1,9 @@ import pytest -from rds2py.granges import as_granges +from rds2py.granges import as_granges, as_granges_list from rds2py.parser import read_rds -from genomicranges import GenomicRanges +from genomicranges import GenomicRanges, GenomicRangesList __author__ = "jkanche" __copyright__ = "jkanche" @@ -16,3 +16,12 @@ def test_granges(): gr = as_granges(robj=robj) assert isinstance(gr, GenomicRanges) + + +def test_granges_list(): + robj = read_rds("tests/data/grangeslist.rds") + + gr = as_granges_list(robj=robj) + + assert isinstance(gr, GenomicRangesList) + assert len(gr) == 5