From 25fcd87152f02cfb10cb0710851ece9cacfdfc32 Mon Sep 17 00:00:00 2001 From: Ray Plante Date: Tue, 6 Aug 2019 10:47:00 -0400 Subject: [PATCH 01/13] python: update distrib test data (pdr2210) for testing datachecker --- .../pdr/distrib/data/pdr2210.1_0.mbag0_3-0.zip | Bin 391 -> 2393 bytes .../pdr/distrib/data/pdr2210.1_0.mbag0_3-1.zip | Bin 0 -> 9767 bytes .../pdr/distrib/data/pdr2210.2.mbag0_3-1.zip | Bin 383 -> 0 bytes .../pdr/distrib/data/pdr2210.2.mbag0_3-2.zip | Bin 0 -> 9892 bytes .../distrib/data/pdr2210.3_1_3.mbag0_3-4.zip | Bin 399 -> 0 bytes .../distrib/data/pdr2210.3_1_3.mbag0_3-5.zip | Bin 0 -> 9887 bytes .../nistoar/pdr/distrib/test_bagclient.py | 13 +++++++------ 7 files changed, 7 insertions(+), 6 deletions(-) create mode 100644 python/tests/nistoar/pdr/distrib/data/pdr2210.1_0.mbag0_3-1.zip delete mode 100644 python/tests/nistoar/pdr/distrib/data/pdr2210.2.mbag0_3-1.zip create mode 100644 python/tests/nistoar/pdr/distrib/data/pdr2210.2.mbag0_3-2.zip delete mode 100644 python/tests/nistoar/pdr/distrib/data/pdr2210.3_1_3.mbag0_3-4.zip create mode 100644 python/tests/nistoar/pdr/distrib/data/pdr2210.3_1_3.mbag0_3-5.zip diff --git a/python/tests/nistoar/pdr/distrib/data/pdr2210.1_0.mbag0_3-0.zip b/python/tests/nistoar/pdr/distrib/data/pdr2210.1_0.mbag0_3-0.zip index dceab2b45dbc70712a7ad71c6303c00ddeb6aeea..82e0ea681f9ec98f601f2d4d16566c1f8996c1b5 100644 GIT binary patch literal 2393 zcmWIWW@h1H0D-rKto~pIln`T(VJJu`GBPqW&@+rT(92CqOgD%()-})%4dG;9wk`6A z&0p*hTUx=*z{v7~nSlXJ1i;N(4m5AVuZYv;K=VLY9;bOggELF?N-9dg2Jb5Lh<#g% zVX#wTx@U=QSZYymW`3TPf`Oi;Iaf$xx~^MhPO7eRMq*K7a!G2Du4`U$eoAIux|Kp` zh?}kj7s5Tq(EKNj(>*DPC5aGEegV4WUonPTK!%C{y~M!)^5isu^wp+7o+}Wm;WxCT zC^Ip~P%o=EKM(F#V@$uUeip>Qv~F!luMad~zfl!+YgVO-*_`dyMawPk-aCJMA%C}0 zs@vp?)!a3&XRLd7waJX<;Ho#zGL7~`oZGZWU_ocUec<

h<7AMWi%?b*|~`ObiUY zz}$vAQi0h?H#09SAMPg{IcjRq>Ac$p0yg3C8|4K~Es394x+-_)O1{qKQyKlNzUxvB z32A&xiQf6Vap{$<8)Iw4^(y(*vNd(B*e5VA7BM@h z#2*k*dQf`D^HsffmdW3H;mh=K_v2IZFYHR*;Oin{u=Hlu)STXLF%_F$C;9U!C~xMj z&0q6>&D77C$$BT>-CW%J%%jmS;wnFvg8iH+%@r0EC31ax7zINsrUT^ z^vLmm%#d>#r`$dy#{He4H`)E)B`Lj=|13+IpX$u*&C~wc>&iU0HydR>>12*9N9Cch_DM(s^hZ<;V0BG z9obK6zygPmpN#cUOL)R|VwTj%b{@fLC)RQsXdS5BCJ=3?r6;m=^MU0lb}yoqrMSGs T$_7kWz}5fl)~XhHx@4+q}t* zo(;sM72FJrEH9WD7{EjTih0jmCiwFM%>!X+oaPmml%}P@4FwsDW@utgadt{t7LZEG zPRrs7@MdI^W5(rb38?!882&nfn7I7F3h@J)Rp@4;`T$}!Bhb_>jR82#hWZDW1*~i! OXE6ccUXVc`4g&xn=uK4s diff --git a/python/tests/nistoar/pdr/distrib/data/pdr2210.1_0.mbag0_3-1.zip b/python/tests/nistoar/pdr/distrib/data/pdr2210.1_0.mbag0_3-1.zip new file mode 100644 index 0000000000000000000000000000000000000000..ce4fcd03f2685d21a136da65672f28e614b78789 GIT binary patch literal 9767 zcmbVR2RzjO|37x^cP(= zh0+0(h(Vhqz32p}g@Ir4z~TJ8P>cf-VI#n6?TEJh9oGk=>LA>V$`}2b99#q$VWeyv z$L_!tO-OX>>#N)JyQ+!r7z@bf)Mbcs$ds49NEp^ExWWtCX6o__(}OLKa- zX;g0J?`S&xy7dD$3?d*PAk-!pDB9MfqNpr-N3>ITP|=G^uqMBmN+Fg%Lh*&AwXLG^vL!V~U@2WdY=ivXa#?}={0ZK=*fMF~@6#grvQRXNUz?T5SIQ~?+V}DoR z+E>TZ)hDd$>*@%0sIloOSV60}Gk_Y|z&ZF*EX*;cR)2a9O{XA`qV_Jed{{G2y~4XD zKEJ=2ndV()X!;uyBw-}bLb{3IYG5&mL0DAcpB!YqEae<+^N-zlVjxH zd|9y?RH)a60R%-4=u{Fy$N8KmE|;T5j5*thkw>DSddHlOwHmfQK04{4!m|6d`M9#V zB;$ZUU_ub+_<`EmIhZ?|J2>;&pe=v*&@v(n(B4ow-^q1)%f%a|?x!65y!{(8eb;Yh zgr|Kb%!g>vdald#XOb1g8H7euwO&(e?YnX-*ezm?8#iL5V?Jl#m^#DZrX`5$93nRa zsXQ2o8mgaXB91~Ze7SicR5-2Hp0Sq}4W{nO^pZSr{+=(bX!`3I=RCp%cYeJUDs~bE zJzbvjuo#PKIozudifl}I0S9-gSTp!kE!s~uX*T~92kyOUTDmy2*lCuSl_{d6vF_I( zTGjeKd*!ZqDZjO;(?+C@Qcs}N36Ab*EGHdu6%ixT~E8gLgZAJKvp2Gw48(Q zDee`dSu_Ok)~-|Q^=!?sk`aGaA^F)BwkV^#ri;oBO-a_i;}Xqs9qtixqW#Ce9$Fx8NrVdm1p&F{#DO;tJX#*4tp)n0 zgC3AiBGsF0`=C@(I!0I)s>XDqP^Vl}c@BlYPovd60+j-@8XCydkW^JwL`WwjPKkgN zKdMAh@NSNB5pKgjBKi+>zD2$fmkJa`r*ypLVv+LBNtw}Cvd!>ut;MNRi0>5QQ<#4f z=wZOczd+wVi_PeGh&6C$;Jrr4c@@GVK@@_QMkn(81SMS8={eKmvD~2kOqPGfRdA;( zJ~Sq`Juxj(k)wlV^jJsQ1TE;h7JoKTqhMt#A@e}zZRADHB29T4zV)O9jl#TCb>C5v zvYt(d@NpG$#V`j~;!tb3Mo`J$lh269zP^3OMB`@r$>bL@Ii+~03liB{cfSz^Klcf^ zcvl35u^ZBk9i)N!*p0W%a!IL(R36u(ExpIY=={=Gk@ko}KlA7tmI{u*FvzhKm(E*B z5i}2V9-fP@wP|s#m|%FMPhV#fj&i~9n|`k2F|f$%<|Y*}!lrkiJGnY&zIsx^D#nw) zC(yVJ46h#PPnuPTYO$OpQIwyUK-Rr@1xISWwradEg)m%?;SIm|y&cTk@Le3^^N~9- z$d+q9;JiRPl5kALf#phJD%H#xPUB{<%2?N@in6VlN_0=q~>6nkk#RNrQf%M(6B z8E_<$z^yr=boOp`tbg-)&Fd5@%7v$n`Uof1hoCN>)~D9c`;Lc{#@Q}S)FXEnnFkvg zWsX5?7h?TIFoS=9vri$mRlWWynCrj*@l)`RVvPSsF}79z{03yJl$eenP-`Nn%u4~> zLkWmN!v77#*4)7iwO#yi0DR=IdpHHDtDyU!{Wt-B$@!-wB~0a@2*#U~x}4_GB`*5zXFV!Ku5#T!h81NnS@z7u?C zj2|0iShQ+SHcgIg#B2gqqs`nR{7o81)(Y)F(5HIbGOSo6y}#Y9+O?VIlPjRJ8PRjS z-fD+*JA_44%DH9KFKKHT$Y{}(eAFKf%N0LfgA4A4$yHuybJG^STD`2LAu8Np0ar+9 zLj`H{Cn<5CwqCAK5q5 zwvg4M3zU{79LL}KB2mq?Rx)bYgSH`{s}*p%7WypSl;wO$yruQNtggwi+wwY5b;^~4 z4|P1#8dHz3r9g`t{#l7`d{yVUwg{Ijeen6d8*PD`HLq`qP8WhEtFs~^=+^qt>~H6T zVHcHbHH_EAzs5T2K#C@g_G zPfas^@a_dRU)mwxq5EB;hVz~1`40Ull|r=p$qPaijKR9skLDz~hrkJyMEJ1_2@AEdi{<19 zqXfV8{FJoOFVI!8-I*#H-Dn&E6zqM#`P(q`uSXzkZPA$RFmynDq!p|{QOim$znE~s zB$0rCJmmp%Dub~d>nFNM%$p14*GWPpToFx^O&ixaRUmPG4DB9i$`xOwQ_g;Tl~Gh4 zrWoFmLj?x$g9Iz1o5k*dD|n%8Vv{}uLREj=z=$&Nu?o^kcc*#YKYqdIGI_6qI zsj+e|x83YRTvYhS`!{TJU1O3ytO%UR90E^V)UWSkm?7LY!BHTOTv-0r<_|mwG9Ux| zrf>OQ9{J@VxbFWD9%}Uk_lZz>yf0*p=o&Y9)syaagg2IY3`)IXNakUNhT$Ro*UbsbZt_zMK+ZkpZZE6&ecEWPt4194R&2E64UF`2 zs{S#PI#a_s8uG$(29veltmhg|U!ECUw%hnT)w$BGK{n2Mf7B8D?8Z|!Y=GBYc1Z9u z9$n~7mi4|@ovKrX<^-Ch<%=KeuGqN4QtL|Yi{8X~m7Z%zJp1r$ys0If2I|nOqXNfo zpT8?omA{x_e6`Y@Nc)|yER2?Cb(MyXD2K+uc z-LtESOq$7emNf5IWq=?JZ#9#A$6;gSX}BUo(mcfCpfnv4Y3v_zD=aZBj7@A!hEhgE z*DBl#r{U&WaP*2bkqRYz)HX6ig))(g#jJRGL)4r=_~Z;T#krd7i{9S7F7))~KGsR9 z8I(L?r_U^9@+m5{nT}!A_04dll$y-!k8u{(n7w)BV|L~^+_6NTH;h^RQS3nb(U&bw zQp~L=R!Z}{pp{!Ax6U~%(VYGYZmc2IPjR)<<2|pC(SF7$g`N^hN5U<5PIN#3<&)Uh z9|~qwrA9%GNAGam#4y7w_hlPCK^9ff(D zH={^wV36qY zbtfiH6=+VAag+A4?{-7@Z9jT#OZ< zM~w*R-qjbID;f}0KQD5$#I~bF>f#4*pnSm;Ep_ele(l>s?Nzino%LDhvL)GZap!K* z&tH8@*Qf(;uMKnXFL^$p^<(P~nqcuZjVnt2;xwFKi#nQLp3vMs1Xm5${#yFBGcgk>U{Bh^>JRJn&|IG+GcvfI6A&V^ zFkAAzWHCJFdZkOPqM=W;%G35MD1mcS+)L_=pBbz#($VDkj~wbOVODbXVi}+sVLE=B zUJ>k2Xr0d<-C6M!IT;fc<=OYybzUU-5W#o9OT{&zmTadBo=5TAL*Z*WwtL>k3x)G3VRO($Y8ggsylhxZ(Vw3nQlEC_x?( zyOjY6VSeq1CU-91Wk_{&)~luka4Ob670SwxCpcX8H46X3DE_0oyv|Yep6s#t_u!uU zueEIhh{{SeTHE~VwQY+pjPz-%@@=5*ySZE)Yk$1t=wdXQu<)q_qtlUO;d&$PmDfsA zcc7M6h1DrMpUO`a&I- z+;Mv(d{{0>&x>k9^$ID?9UOF3(?Fd|6DRV;@mz2sU7cJ<^YO!=E5*$Xa|XLd{-p}->;M>fY#0S^F83<90}vzhGBX4@r8{QmCVqBVf-g;IG= z8X>0p-7hufN@;=fUvxxTV{%K8l$_EuW&WrhLd6H)CnY6G*aH{$Y;R-a(TLuL5bM(Ws5N$+cB6 z=B%+Xyp3U<@OT9T(qfXG;c$u3Vvo|0^LN=2?@v6>)%9gGp1&fqE;!xj;jwttd`11F zUxr#0U4J(;`ty@vJ&jPW1|gj&B_m_rXy^07R9yb;)vgx)JU3y@^;Qaj9Z`1^>Nzga zZCG$H))1t;&^^uQ>wU-o4|0h)2#3s#1T05r){tVBe~SFMRh6!=(_--#2#b$P%R^b2 zI5xUlpjJ|$LYeMhx_OCm>WuU+t7S}O1x4DiLII2`haa7j3zKh_3z!&!`m;T|N+S*C zb}G`leXeaa6DM1JoYyHN)wCh1ul?kW*jC#HaOrbpzs`xJ$YB@x#27gk85ty*uwova zHQ9ln=!Se@yUqAL>s-ghuOUGrDIM1$ULYL!C79$F)0S)P*8C1{q<6V|j8>A`wxJW>~qsoI1qMA|?k5kt^T9Bem#%}Zu+;gRP8~ZwEkgtkul~+@Gtn*@9 zOAfR1Y5hjJ(HDf z*_%;q_a<*kHW1)IrC#s@Wdozw|Lyevkj;?x8<4&0xt#&E4E@F!wIgc4Bt_8FSBS4Zz%7t^Edb@8)bLrV-Eq?$2|_9_@EGw&fjr7hyX= z<_`ox?Y9;K;5OG{f8exxleH5r4gVlcyY^atX0>U$-D{JbFp0nlZGY44UZVUR&hFXy zP8<-)K{z`m>AwTnf5giI+5!QAHba~_@K5`b`PmixcvPE1-hD4-e-N%+AXR?`**#ST zSRDws<#>B%$~#TRM)pTmI{|lAkpT4Og!%u?Rq)W?LH`_P{+a9EQRYsrG5-zl7p|Lw z&3(CUNB`Y3vz^enl>dqA&I#J@{B{d^@2qMkbUxJ|p?_hzIl0=G>6Qm?zIh;9{`_$6 z^59rNr$0QCt*#bmo_|Me|JNe83v2nWv9=(8Hqw9gq`htQPMccL|1HT~z`xjev#H+K z&f6ZfyVu*Pf*ix&s_+Zyfo0Mb>fY9AC+bz;#~*HYyO9E{yxBW7AKGB8)X$&FqQ z#HAJ742&!C$rL;#9u&s-+>^8-x-VM!dO6_=EzrNNB^8H;9QVoq^(N?I0>O36;k z;tKF)WRhdXf z3s!JPxGBoqkz3x*6lrE{XDQC2qpiR#O1!Jh7@*BwZpRMM#te>v?+_#c2$BHLp#*;w z$?`jqC`W6!t-!A212|%or2%v#1YsnLOFA|Q18)kz!}S-EcL3l#Hcm*p@0P^qe*j9} zgeC|(qx!|5HajO?MmX`99Q%p8`bs85`fbg%9R@wMgm;bk<#QS{#Mq@P%U>jnY8PJS z0qq)6`?w&y`P zxqPP_dN7KzB9~$pFStdejjyYfr$Bu{ z8-?NnzDOlG9Xnh-+ZR#sm%|CWVWG)BuVGJ~R(GXF8hCJE?W%noP}`{9yM!H3S@v(# zMwlZ^fItNdG+&MsvY~VS} zF&5@1Q>#Cnx7M>@aB*jkdI7YJyIJvFE3Z<|9s_DY%Cw)^m0|&|=;JM#VwDs< z3V9l!Iz5l@XEl3BLfWL+!OCJ(x*S9A_97S@Zh3 z`s;zqw?o_`7rC$_*SnT-hE8bG9c^6&yDh?VL#^MHd&&yuA7<}8O1u~Vg<*s`Xi zG4U}Q-TqCjZ$ayypJz8yZ!;4(Hztr2iYc$<;ChL92WuCPfPHiu)%(3(XIM##J+G4Z zWCvZ5R@u}?WQV0B>)v&ZVZH(Lj6GI5)RS0j;kEY7nfonGPn@y?Jl?&Es#`Qa>&dv| zhAy$IiVin%bkd9~_#fKJi_XTDT&=0uMcUkZi@%Q;jB-2+)v(Ugsv=TC5mK;SXx1+7O-P86DSDI66x z9#bCJKl>e0C~rxC4FUxNh3v3I4jfu;YdZ_%kMKOCSVd{J+6_P`B=w9+m?>J)jl!IB z5tTV)0lqC(_wiH<&FZPZvm=rg$+t{9xKNVO^{B+qgO2Aq`RQw5 zZgGN8oHQEYx>MvZedjvHCzH9sgPF|#OsQheRgJYIcRV#MR+XcHWc1lY*#<8gxRrh~ zQ729}H7LeC?+?7C0qUg{x0q zp)SAANbl0*r$l{Bagb@^4RaNHP&oKRifi}nq)4hqdXHq|8*JNMs;2248_+h|Mj%{K ze5RioxeYDyd%1{(jY`tHklmbJRA0Qvp;c4K-x8?YhekJ!4JIupMz>oo5Glz|Pg^&> zcnP!Cer467FbluB70VNG@mnVY5Bi%J%J&0TVz3?OQs4#tPHX&0RY&H_MX40?=S~~9 zF{n=Vd_4K>PU+d|5?(PBa*1=8vD-@Hp6J53XR`Wfh1@gFy3;d3D$DX>mtyYg+#l^< zp-p=%mB5v;{dvJK)$z&cmbpSCrCNy>AuLWZ@}0MwWybbE1MklP=J06Vh!jXHA;5E3 zYKEI2o&VShAwpG^Ur3Ys%`9*_8jwFWG&D<$7(T_dTGDX`59x3k(a`;XpSohINTh(6 zm7jSvAhn%h`Ilvl9nd|3$G2D@+cK6|$K**FcSN%Dy1OSYwsOR-dB+}RIb zklpBoERDNN`XrAzOWDS342(A_c(%jd>9R+eJ<<*gSdQu_!$Ew#f2@d)W^H+Ae6{AX8N- zI(yt#D6u&daf!o#QtQe$9P9Jn3-M0xsu}fje|MOP5#}iPZjU!u3^ime`%RDct61B9mZ^@p?YU8*>>?HO+3N(7@>LU*vf zXJvl>mW{;d=8U=Wx$!(Mh(UpGLxY00o-ECt@`SF=aUzs?hy)StHsStfbYmhwVZ^ z*Q#JN?X+1uDQg81IICL&Sv@n8cjWb=8&#@@AL)6ewWJR1fputlscG?)-Ou&T5d$|lok*I&DRH9T6AkuY#d)>|T>FEO#_(&+O= z^gJ0sbYVofliGmL$=c)b%J3nNyk$8oZ*Qm|#Nb4US#tXV33e|rI04ofu5Qzzl!}`E z5Vjo0IX&Cx8KgH8mp50pnWae}(c_Y_$Ds7G`cvmboI z7O$Ug7LD2hZT5Y|H@X=tGWXb2_&g|(P%v-daq~w%(>r;zX65ynSA|-0(_8DAuV1M! z*P4^}F3$~2MO(Kk6Dy17%BgiDqbLg{VglJ;K9*^B;mZ1ej!$#?TG0fn`E+|Q6WMrS zQzFPobY%VRdiB~j?!L5=e^Q3;euAuhoLsr#=u;-g;UBE|IsyyF$JjLu_wCb_hr>K# z&O}|jeVq6!QZ<`i-qBW&>ov{I5?8Oxoym?|iX_7F4kqufAxOseTAfT0eApv$b*US< z)MYTMT7>jCqaavCAEJNbcutZ>C=6d&n6G3xVYxwOrIHkGl;FQrkdij>8M3jb?G1Op zQ;$Km7)JsP`T+3!T>|8 z6B=~N{i4=Pqjc!e5rbE432W~1vvfd}J>lUXqx5agXtSqY4DnWUqU#Nm_*1IE36nMAJ)o(LWe-Jem3WuSq7$VaNyVF5(r6)$ygbf-^3H{O!Zige z8OGPDJqUE)`N=?twwn?KRn93jIIlO{@G;8FZLnu&AU)rShd-gQk~if4!TG*@eN@tX zfs2Ispc)<6`s(dAqHkEN^xWvn(nM_|%#O-)p;5*Gp|`^m)52LrH>Js?h4rl>ys^-? zHbY|8Z3tA!VH0*yp{nGGoXlpWbK4^3ctU69naE`8voHGi^t;m1n)})$sb!FJi*lS> z&E!>5?l7G!(KIl_mQ-#vb2xFjsNU?&OJB2dCt*%y20Y0(>1@(uKZuhMPpf<$Ya842 z9F|K17PuiYeyXhWXqgq{wT&?C0MAjwgnVNcKfTMv`0u?S9r8kn6P2T=27h_jG^00l zmg5EIlf8ysUNp;YUz1XBq_lYp_2!lh%MIkwCXb-uVoVMzi4nmk^5;de#EP|pTvYA5 zkF%dLpI_Y;U9Sl`Qk?v@IUus&Ju$(x#*f7564Ziob8|jQT7DVDqC>+3mu@&So>qkf zdBm2kg@TXpe}z!C=9Xn@UseqIP$gt{Zrgd~8&!$Axx*8Q#C9X8FoBf^{IrO1e*Jp} zqKm~tBAOS3kC)kXwM$+cV+fKjoTa90s2kM1GtyZ@ozvZ%g{)YWnG$p9CI0lquY8j- z=+5RSJKw6;Q)++K!Qg3TAJe!pWyCW`7An|)zNE)NP~M|^_X53 zwqEV&?s4rvf0wv3j#EP1t8bH8Oc|}iKK9GJ-1ojEXOBBM>^IEMg*dm>3c_ayG`REP&-W{5sF!CN@0UYv7aHl5p}7yl?&*rS)N!tQ>`P(cJeDxY|ygUJr3XAR=@df&=|^ z%ZBOqqK%){vt?OR;By(?z8=n{X^+I_N_y!arthbB$zwBXfF$bForW?1StxY7(w zYaJP5w{u~^#UHNc@tCJrRXw0O#gN>!=-12KKCt+drgSz0ePP9w9@TXmFQ0(T%8&@Z zpkZ8_E0^~YxHcy1W$Q9SYDqvUgoQ3&V6@^(H1600?&JLY?g`Dl?8&9~41Et@>DmPn zRFrGI?g(hswJTK^AJA3f-9|igcfB^*`DE3})o3DN`C}PMuPe#I?PlCd@AagJhVkz~ zy5n0+kT%DNM?C|=)Cce{&tLlX7k6U9^?<4r467omhma0#(4NhQE%b_?oFh6xDSOhv z8a65ye8rn$TkSG2)mnk}_v@gMaaf4VZ# z1g*MxhJMJmFixA)dOj)0x9bcc`ymuwGe+>a zK@W|VTuB{Fe=QUg&es`}rI(!~ttC1!Ix86EVo)o1s(83eJ+&S=HzZIj@v{1~dD*wm*`Qj$@^{ z4SG#1SR~yWLbD`ZNtu!Ud82}{qOe%^fnXs0`q9U-a^doAa)Hw$kO0=_*Qm}ja5)!W zxg*=Lk%^V9ImP1~nre!U9_T!C^TBI7G(&lvihuX?YSgH!d}6E|l!OGFj9;~c%$n(f zll6i>v)-ZqmL=PD@k?m%cuLpx$QN)&K5<6*m9(`6`%V9&+vz>7A7V6F;dD~8Pj`C}{gwRapLebSDf#ki= zkS$y=i>L>StLO4U^p>xw)R+01nvP8=BnE$+-`~!OPWYU8Ic)hy|8JS30By-&ejdCE#M@Z~?%R^>O5VQ+ z+>s2VHc+{@+98sGVe9{|1px@mr1dutzpmnrf%YmtcnHM5b09!GW>@wf#IyV)o);i; zkZrz;->L0DRsS8zPTt?Y_1kNwfWJW5+erK#amVNTcX@lw#tx*`gLwT|!F`9oZ1xT` z-43b!d%eA+JpKf+Z_5Xm5wqp{4dVWl++IWh;EwlOQu`Kk03v2V_Z!6ho437)HHRSX z*~0-4hX(kL8TT*J_Cjdl9zyD%wHg41S*!hl&(C|Xyk z0_(Sf-1PGb=I>B`p1kixxkPj@%APs=?;sAI*q!((IaCAxbP$mr1LN<92s0!;&>;sy zfPaFZCiw%z&ogO&(V=iV!M1-Qz1L_i{{?PuPYM8I=F|T(&)Y}-hUbq_=%0D+AB67Z zIr(4Ue&LB3iyp{x$H4n1ZhL`O$o~V+z4JGKF{bJN4e+-)qgh{{{9J z3uCV22U>VHj()x=?p45_?(ca2f_bQ|cQE%~y!K+|124bz>h3iRh*r!sY#%*5AoYVl P$AG^Gz`Glan797}D!R`` literal 0 HcmV?d00001 diff --git a/python/tests/nistoar/pdr/distrib/data/pdr2210.3_1_3.mbag0_3-4.zip b/python/tests/nistoar/pdr/distrib/data/pdr2210.3_1_3.mbag0_3-4.zip deleted file mode 100644 index 3472d4f0a5d13ebf9814ad56e286c2fadbe6d6cd..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 399 zcmWIWW@h1H00FN$E?+PMN=PusFchQ|85tQG=o!Zw#vAM9CMBjD#2f3H=!b@IGBDS? z$&Ee>#HAJ742&!C$rL;#A3&s-+>^8-x;VOcyT7MGN!rNNB_8I5LaVoq^(N?I0> zO36;k;tKF)WRhdX<#GwA9|RcwI)a$E{lN1ys}h_y6dWMi7(`P)AO>R8mJzS_NVBkTGI}2qGXLF}ftByHQjS2~kEUBGNnp zB4N=X`Tro#PfLgF;<%vg1cPI)7)Lwl|W;OBK>v5OVzX z`%i-ID@{wYM;s4ahu!eR4>u7rY#VEXaO=cerg&7F>=LNeiSR)2alEvLZXl8$bTLTD>rlhWH30p)H1aICP7Jm#?%S<=RjsUL}RY@0;0VgQ#!vJ%@2qr@f_9I+;huu;X6gz`Z*1nrOv>< zxLC1WPFIIEr%6-OltfL|#K#oHId|CUVzF_vt?g@fPRX$imWO7NPsdqq{pVO! z452y{jT~-TU?z@%Gkm^vK3pWd!Je^?7R5x}o#iEUTt3u~RxIOXoO1!;qWkG9D^#2$ z3|I8|TjQ zLA2^k{r0Ne3)22;)7(a+j*s*t7B!`fNsn0T_N{4c1h0I4n$twP$x7tZm`GM6q4F&k z-&4X1qg^r#_SS9G=<|G)X(c1^v`XreEp$mnbxj|c6P}u)d&eb~^#;r%?nv2ScT$Ok z=eG?fzBhE;@hbN41h*=hF7blwhocUwx}+|uI((!t$x|-if9M}wa5A>!ZNbd;zaka6 zUh0-nYSmWiycsf3SmNrN$I@~af@I6)bLvlX;wdP+#=oi4%_&K^fPsiA*jEP!fh?-% z%16u##<;~G@L24e*8S&>RGJK_+?Ceu6YjXb+8$D*Xi0<%0%3rx#Cd@2ftSpOu(d$_ zaM67-Pqb!>Z9jxcTFteyruJuF>Q6BX74j#nWd z#g8eM5(>&yEy2b9BWjSZ_ci*pgmkbNDz)<^5396yZtARoifyKkYXeTBQbL!ofYQS2 zU=Kr{(~I;2bETP`_e%}k8ThYLp120)lOzhmOQ#d9J5CAHcdBE4IG%?Y$YT9xLJfDO z>V0!c`(x7*H3d3IX0LU$4QA26wd|9L1_c{OIhhBlV6z~44q?iV{o0$38H0MM>%XNW zWjmJ`<>M;uie?F^#-Y}6jiOQvRm@B%ed&A0MC(?^$&_a@xfOWG^O89_L0<_&>U;t( z1c^e?cEh@NhiD)^b`$M$Jkn~S)$CVjD?*tWouB(D(;iV8U>SSOTE!I{0X~-M(sesI zisruFeYu1Nn>OdFNrndo^o=%=NEh^J(@%|jh86`qyriN=r5T;5E}l-BFP@aps)>}1 zL>jljk<}vu$#Y6EZI*K+%8HYdh{k6xU!bGdDla8KyZrJkdN;grZh2^(2PHOGmx+ zQn1Y2>~9eG)yeGd&l-^f0VV`E2Zd?43Cih@wGA>%P34&kx!=?rkAnf_14Bczq{yM; zyvwESz62=y6Uc_HyMnZp6UAbMq@*mk*XL#;POgRsxzgRn8Kk~~%izOC~I|0>NikLQyvPM;{m&VR6Gak#u-3VQMEV zZHV|Q8kZa}NN_6-U()#6h}eMFL)!jPA^4XCxQY~Ufze$Ud|MX6@=uSMxW zYDSu);XjMEffA@;dHJpqa?65yi{?8ZO#BeNqj1}TaPCiB~P;ZSx%Z+G72^ z+N)L$Jw`0o8gGj73E+2}M>IBD37n1lhe<7d&gDYiQioO6g`3R9gM|YAev<+yv_A)B zM67ym4o$9Y)LdezR=c@H=oKU%*Q6&%2M%Y`T)Dz4QsP?Q!AiznX&VI-WrGW zJ4Hm*9`VX(Uewhwl+mFp|6ni@kte}kkIU2pRj9t)?xrhpt@fLamY4|E0;ZJMj>KpU zB&%@Ufiu5#Co_K{J}JTU!jo5Vxk$$?;hc~EybI%^aid}0itFlv%7c^RKC-W=ZNaNY z7bz`GxY*zLA&{*NRx%nnL$+a{YgI71Hu`M-)Nh4Sc+2bk+1*p)zKVJ=jjGi{_w_u} zo70YPq(aKD|E$C|zi9MaTY^bd88LcS6G|8P9P?{4syj<*6ho`1(mPCDRts{hIH{)PF? z@x=FYyziN@dX)$NLwsBruX2nA4$~-X9;XuvCdLh{D(iuA%68cH)oxymh|y#w4xEtl zl1l7NO6tBe@^l_MOF#ldo)1>KIopmsDR_;1r>%E94j?TX`3U%?I8sx z!a5=}tlO2-(32m+7vp&*ryD(j^`_$UXX@6nb;|q$h}s=E&@d8h=6MJ-k-9)bJ7Fm3 zJcl3cu^*5tq6MpjaXE(aTHL%_XFqegVDb}0k^e9p??OWDEmEwkxqybMFShT;!@&q zlOzHH^3;5mGzMckwvTkt=-1~T-5?2;oeQ#JlDA7_bY;DvWA%^FBmj+G0YPFRKigpr=0)xwLJiM6l6dy zIIxrXznt>hV{qO3G2GYb59t@B@_1Lw7S%mr@}f7x?J}qOMLgLj`Y7sqim>)ksLAng ze5KwQ!qur`7a3Ig#1Skb4A_xjgO{y|-`o_Z8GsCY%-vpAd1J@^z`Nvx6(kbAWm?| zQ$BsjE!Oq^7hUSp#pVRs6_1wQ+g-MCho&`_-xIr4>Qy0!O*(h~T!N`3ofh)Yi=%?< zzVbn$HHAx=#@DLdiFDuk$wEjrpC<{co>6XaT4}i9ZIqSQV8_Koezt>vaO~nz{-FN{ zr%=24=;YZ#XDRambp|lv>g`sN4IBgS&$rN1wMj zNwd5{vQe5BU{-FA-j;J%rs4j=)Lc($km_o6g66qv z5YEJ=PK|^ZkKN(9g=T?T#^(f2ATHV%MVV?v(PTFd)#V7^6==&yIomLqL!ROXaTMWi zQOxIP<(!X(FnT1 zUlBIR>_waHAn(N9W9aEgx9IvcITi1mwm^~I%%Wk1fg;xAJ~Tpt#eOL%5_7CzRxDeh zL_647&93Vx*KzaNm zh7&G3B7DnlQrJwH5#b;EWZ#@l=CppuwamFxOPZ@3osxCs zP^{+FN@hd`hArFSVez5%mBJArg9w9c0VDO&hc&RdJd{tQPH2r=^$O+BY`YcmfJpb_ z@1WV-uVtRlcB0*iasR216#454Pe|Z||vIV#>|O~y|Q))(k#3IawCb(OQIID4@UQjIdR`_d~j zITTwLa>jO5eL+mcMZ|dae{x+AO*urc;eWBLKHQRnyQnUPFBFNd?bzXY4=)5F#;_{f z-&eP2m=P3X{J5So8$m_LYk2#51h1wY3YRzeg}s7ShJ-I&y2J_Ls~wO~yvJrZ8hH3J zOwYfyXTHCk2g8(nxSr2%o@!Nfm*zNAO6R;^4{KZh{A0SZ=@6{^k_#ic^C&?95vP?Q z31MNws5Wn&z$I{PZ1#(mMW(dUfHVjjLxIpp<(C-z_ha}E3JSW$G<$Q#7v3@T-g~KQ z8%R`Hq4lagph?%Z?EGlIuDZY`@}8T^wegOJ%Z@HaV~L9&%h7tB$ri3R<6n5KBu6%k zes|dIhGENLn_?xSp8ziF4cJ)lKiKy-lkStAF|aC%dI^+BjSr;Ok; z1MU}_^Q3i{3ZHdGTch(T5R@m*(o_Z@dkK~EH>RYdNH~L)JaR=@ZjHXIdH3Smnw#9T z(8G!=1U^yZ`sb07sWD6?K5rzU<>bMSyhNh-6$=k@z3Lk>$@D5Zt1=d|IX|_wD$bHU zK909Jq8FK<1V>m*aWWh(H(Kgd8Ft=~E%*M&SEp$pqgD5U%({rX*~4S$n)!<6N&ifZ z8oGfVNbIM_BUiM-y|BW1F)BvJ{ISmRB2+v99ks3&0erWhtxZ-+!JRQdiA`J=={7C6 z80!gApXqZm`gtER!~F+ps^X@BKEK^0Ds(;tya`cFcVp5y}l#C3VLRhtc%AV?k zQ}lp8bNDiDWXpA4_!5R0P3^oM^$hNCT9R3DDg9f6-J1X5&5UlB53!mYaJ}pGj<%)s zHZ#Qf=;n6ukZbc`+oD9+^AVnU7cUp)nfkXz9&xQ%8C{`lhXu8qL+HlypxEl+K*}Cy z$T~ilP28Q$#ba?UX5Gj1+?Uy!#p)%|MZYWl7`(!ij*0rJx58JYn3IjG!s8dRKX`!$ zUoTOt6M2(95-Me`b*wlauV%^LLT|YqZqQ{mWkoR<({CfXI>*svo>76eAT)ruQZPM~ zC3q0VYXu`Z!{{|E@P*vPd{)OWy@BZ&4r8(xM2gK{JOh_nL<0Vxps$SwoA_0|qYk3} zTy_n7CsH)9$x$u;y_mMN`mP3Jxv{-hNv^#(z^M6=+>{>+AkIL4xg_;I4h(PwBfi zg4@!8FbAse+FC2@Cmk5n{@;EOfZ3YT9ssj@&9{@p=m9YQ&W`|yt)<_d5Zh6D_sVZ4 z6DfeueoVeY?3g3zXai_l3&1_mwym;zled#l#{pTghY?d1T}t-agfLlwnKk)i>^R*MO?Et`E_F#Wzwxz&d*DO1Mu)xx7KL!4} zboo1^UuW$*k@!jWN7^xYKL}>;fiOF08wLd03VqSQKkY~8N8j-Mk!_8H_cYMnFnqtj zO#cncj_3jOC;+p4A-A1y_Y8Wc0`HO8)q0wtL`hN8Xp--fu_bm$xH9|MzIypg$V#Kl{|~wtJ^$bN*N8 z-?Y8egzu^CpT6~LZ@BXkcNqSj|8Kba>U|q`cdNA%w-osCM=SqqyZ~KqHD0@D5&$U! R1Q?VN Date: Tue, 6 Aug 2019 10:48:59 -0400 Subject: [PATCH 02/13] distrib.client: add head() function; update tests and mock service to support --- python/nistoar/pdr/distrib/client.py | 41 +++++++++++++++++++ .../nistoar/pdr/distrib/sim_distrib_srv.py | 30 +++++++++++++- .../tests/nistoar/pdr/distrib/test_client.py | 16 +++++++- 3 files changed, 85 insertions(+), 2 deletions(-) diff --git a/python/nistoar/pdr/distrib/client.py b/python/nistoar/pdr/distrib/client.py index 04d0d19b7..fff7beeaa 100644 --- a/python/nistoar/pdr/distrib/client.py +++ b/python/nistoar/pdr/distrib/client.py @@ -141,6 +141,47 @@ def retrieve_file(self, relurl, filepath): if resp is not None: resp.close() + def head(self, relurl): + """ + send a HEAD request to the given relative URL to determine if the + resource it refers to is available. + + :rtype tuple: a 2-tuple including the integer response status (e.g. + 200, 404, etc) and the associated message. + :raises DistribServerError: if there is a failure while trying to + connect to the server. + """ + if not relurl.startswith('/'): + relurl = '/'+relurl + + resp = None + try: + resp = requests.get(self.base+relurl, allow_redirects=True) + return (resp.status_code, resp.reason) + + except requests.RequestException as ex: + raise DistribServerError(message="Trouble connecting to distribution" + +" service: "+ str(ex), cause=ex) + + finally: + if resp is not None: + resp.close() + + def is_available(self, relurl): + """ + return True if the resource pointed to by the given URL is retrievable + by sending a HEAD request to it and ensuring a response in the 200 range. + False is returned if any other status code is returned or if there is + an error connecting to the service. + """ + try: + stat = self.head(relurl)[0] + return stat >= 200 and stat < 300 + except DistribServerError as ex: + return False + + + class DistribServiceException(PDRServiceException): """ an exception indicating a problem using the distribution service. diff --git a/python/tests/nistoar/pdr/distrib/sim_distrib_srv.py b/python/tests/nistoar/pdr/distrib/sim_distrib_srv.py index c5a0d8b08..5eda66d9e 100644 --- a/python/tests/nistoar/pdr/distrib/sim_distrib_srv.py +++ b/python/tests/nistoar/pdr/distrib/sim_distrib_srv.py @@ -196,10 +196,15 @@ def handle(self, env, start_resp): return self.send_error(403, self._meth + " not supported on this resource") - def do_GET(self, path, params=None): + def do_HEAD(self, path, params=None, forhead=False): + return self.do_GET(path, params, True) + + def do_GET(self, path, params=None, forhead=False): aid = None vers = None path = path.strip('/') + if path.startswith("od/ds/"): + path = path[len("od/ds/"):] print("processing "+path) # refresh the archive @@ -213,7 +218,10 @@ def do_GET(self, path, params=None): self.set_response(200, "AIP Identifiers") self.add_header('Content-Type', 'application/json') + self.add_header('Content-Length', str(len(out))) self.end_headers() + if forhead: + return [] return [out] elif path.startswith("_aip/"): @@ -224,6 +232,8 @@ def do_GET(self, path, params=None): self.set_response(200, "Bag file found") self.add_header('Content-Type', "application/zip") self.end_headers() + if forhead: + return [] return self.iter_file(filepath) else: return self.send_error(404, "bag file does not exist") @@ -248,7 +258,10 @@ def do_GET(self, path, params=None): if not path: self.set_response(200, "AIP Identifier exists") self.add_header('Content-Type', 'application/json') + self.add_header('Content-Length', str(len(aid)+4)) self.end_headers() + if forhead: + return [] return ['["'+aid+'"]'] elif path == "_aip": @@ -259,7 +272,10 @@ def do_GET(self, path, params=None): self.set_response(200, "All bags for ID") self.add_header('Content-Type', 'application/json') + self.add_header('Content-Length', str(len(out))) self.end_headers() + if forhead: + return [] return [out] elif path == "_aip/_head": @@ -275,7 +291,10 @@ def do_GET(self, path, params=None): if out: self.set_response(200, "Head bags for ID/vers") self.add_header('Content-Type', 'application/json') + self.add_header('Content-Length', str(len(out))) self.end_headers() + if forhead: + return [] return [out] else: return self.send_error(404, "resource does not exist") @@ -288,7 +307,10 @@ def do_GET(self, path, params=None): self.set_response(200, "versions for ID") self.add_header('Content-Type', 'application/json') + self.add_header('Content-Length', str(len(out))) self.end_headers() + if forhead: + return [] return [out] elif path.startswith("_aip/_v/"): @@ -320,7 +342,10 @@ def do_GET(self, path, params=None): if out: self.set_response(200, "All bags for ID/vers") self.add_header('Content-Type', 'application/json') + self.add_header('Content-Length', str(len(out))) self.end_headers() + if forhead: + return [] return [out] else: return self.send_error(404, "resource does not exist") @@ -338,7 +363,10 @@ def do_GET(self, path, params=None): if out: self.set_response(200, "Head bags for ID/vers") self.add_header('Content-Type', 'application/json') + self.add_header('Content-Length', str(len(out))) self.end_headers() + if forhead: + return [] return [out] else: return self.send_error(404, "resource does not exist") diff --git a/python/tests/nistoar/pdr/distrib/test_client.py b/python/tests/nistoar/pdr/distrib/test_client.py index fd62ff80b..e4f6ef1c6 100644 --- a/python/tests/nistoar/pdr/distrib/test_client.py +++ b/python/tests/nistoar/pdr/distrib/test_client.py @@ -117,7 +117,21 @@ def test_retrieve_file(self): with self.assertRaises(dcli.DistribResourceNotFound): self.cli.retrieve_file("/_aip/goob.zip", out) - + def test_head(self): + resp = self.cli.head("/_aip/pdr1010.mbag0_3-2.zip") + self.assertTrue(isinstance(resp, tuple)) + self.assertEqual(len(resp), 2) + self.assertEqual(resp[0], 200) + self.assertEqual(resp[1], "Bag file found") + + resp = self.cli.head("/_aip/goob.zip") + self.assertEqual(resp[0], 404) + self.assertNotEqual(resp[1], "Bag file found") + + def test_is_available(self): + self.assertTrue(self.cli.is_available("/_aip/pdr1010.mbag0_3-2.zip")) + self.assertFalse(self.cli.is_available("/_aip/goob.zip")) + if __name__ == '__main__': test.main() From b83434bb6c96c1be2cfda44f625cee0377565578 Mon Sep 17 00:00:00 2001 From: Ray Plante Date: Tue, 6 Aug 2019 10:50:58 -0400 Subject: [PATCH 03/13] python: add bagger.datachecker to check for missing data --- .../nistoar/pdr/preserv/bagger/datachecker.py | 412 ++++++++++++++++++ .../pdr/preserv/bagger/test_datachecker.py | 337 ++++++++++++++ 2 files changed, 749 insertions(+) create mode 100644 python/nistoar/pdr/preserv/bagger/datachecker.py create mode 100644 python/tests/nistoar/pdr/preserv/bagger/test_datachecker.py diff --git a/python/nistoar/pdr/preserv/bagger/datachecker.py b/python/nistoar/pdr/preserv/bagger/datachecker.py new file mode 100644 index 000000000..cd2f60598 --- /dev/null +++ b/python/nistoar/pdr/preserv/bagger/datachecker.py @@ -0,0 +1,412 @@ +""" +tools for checking the availability of distributions described in a NIST bag. +""" +import os, re +from collections import Mapping + +import multibag as mb +import requests + +from .utils import parse_bag_name +from ...exceptions import ConfigurationException, StateException +from ...distrib import (RESTServiceClient, BagDistribClient, DistribServerError, + DistribServiceException, DistribResourceNotFound) + +class DataChecker(object): + """ + a class that will run checks to ensure all data distributions are accounted + for and available. + + A distribution that is listed as a downloadable component in the NERDm + metadata must be available from one of the following sources: + 1) under the bag's data payload directory (at the location given by + the component's filepath property) + 2) from the URL given by the downloadURL property (tested via a HEAD request) + 3) in a multibag member bag as indicated in the multibag/file-lookup.tsv + file found in either, + a) a cached copy of the specified member bag + b) in a remote copy of the specified member bag available via the + distribution service. + """ + + AVAIL_NOT = "not available" + AVAIL_IN_BAG = "available in current bag" + AVAIL_IN_CACHED_BAG = "available in cached bag" + AVAIL_IN_VIA_URL = "available via download URL" + AVAIL_IN_REMOTE_BAG = "available in remote bag via service" + + def __init__(self, bag, config=None, log=None): + """ + initialize the checker around the bag to be checked + """ + self.bag = bag + if not config: + config = {} + self.cfg = config + self.log = log + + self._store = config.get('store_dir') + self._mbag = mb.open_headbag(bag.dir) + self._disturlpat = self.cfg.get('pdr_dist_url_pattern', + r'^https?://[^/]+/od/ds/(.+)') + try: + self._disturlpat = re.compile(self._disturlpat) + if self._disturlpat.groups < 1: + raise ConfigurationException("pdr_dist_url_pattern: regex is " + + "missing group to capture filepath: "+ + self._disturlpat.pattern) + except re.error as ex: + raise ConfigurationException("pdr_dist_url_pattern: regex does " + + "not compile: " + self._disturlpat) + + self._distsvc = None + svcurl = self.cfg.get('repo_access',{}).get('distrib_service',{}) \ + .get('service_endpoint') + if svcurl: + self._distsvc = RESTServiceClient(svcurl) + + def available_in_bag(self, cmp): + """ + return True if the specified data is found in the bag. + + The file can be specified either via its component metadata (as a + dict) or directly by its filepath property (as a string). False is + returned if either the filepath is not found in the bag or the filepath + property is not included in the input metadata. + + :param cmp: either a dict containing the component metadata describing + the data file or a string giving the file's filepath. + """ + if isinstance(cmp, Mapping): + if 'filepath' not in cmp: + return False + cmp = cmp['filepath'] + + path = os.path.join(self.bag.data_dir, cmp) + return os.path.isfile(path) + + def bag_location(self, cmp): + """ + return the name of the member bag that contains specified the data file + or None if a member bag is not specified. + + The file can be specified either via its component metadata (as a + dict) or directly by its filepath property (as a string). None is + returned if either the filepath is not found in the bag or the filepath + property is not included in the input metadata. + + :param cmp: either a dict containing the component metadata describing + the data file or a string giving the file's filepath. + """ + if isinstance(cmp, Mapping): + if 'filepath' not in cmp: + return None + cmp = cmp['filepath'] + + path = '/'.join(['data', cmp]) + return self._mbag.lookup_file(path) + + def located_here(self, cmp): + """ + return True if the the downloadable file should be located in the + current bag. + + The file can be specified either via its component metadata (as a + dict) or directly by its filepath property (as a string). False is + returned if either the filepath is not found in the bag or the filepath + property is not included in the input metadata. + + :param cmp: either a dict containing the component metadata describing + the data file or a string giving the file's filepath. + """ + loc = self.bag_location(cmp) + return loc == self.bag.name + + def available_in_cached_bag(self, cmp, inbag=None): + """ + return true if the specified data file can be found in a cached + member bag. + + The file can be specified either via its component metadata (as a + dict) or directly by its filepath property (as a string). False is + returned if either the filepath is not found in a cached bag, if + the location of the bag cache directory is not known, or if the + filepath property is not included in the given component metadata. + + :param cmp: either a dict containing the component metadata describing + the data file or a string giving the file's filepath. + :param str inbag: the name of the bag that should contain the file. + If None, this path will be looked up in the current bag's + file lookup list. + """ + if isinstance(cmp, Mapping): + if 'filepath' not in cmp: + return False + cmp = cmp['filepath'] + + if not inbag: + inbag = self.bag_location(cmp) + if not inbag: + return False + + locs = [ os.path.join(self._store, inbag) ] + if not os.path.isdir(locs[0]): + locs = [os.path.join(self._store, f) for f in os.listdir(self._store) + if f.startswith(inbag+".")] + if len(locs) == 0: + return False + + for loc in locs: + if not os.path.isfile(loc): + continue + try: + mbag = mb.open_bag(loc) + except Exception as ex: + continue + if mbag.isfile('/'.join(['data', cmp])): + return True + + return False + + def has_pdr_url(self, cmp): + """ + return True if the specified data file is downloadable via the PDR's + distribution service. + + The data file can either be specified via its component metadata (as a + dict) or directly by its downloadURL property (as a string). False + is returned if the property is not included in the component metadata + or if the URL does not match the base associated with the distribution + service. + + :param cmp: either a dict containing the component metadata describing + the data file or a string giving the file's download URL. + """ + if isinstance(cmp, Mapping): + if 'downloadURL' not in cmp: + return False + cmp = cmp['downloadURL'] + + return bool(self._disturlpat.match(cmp)) + + @classmethod + def head_url(cls, url): + """ + make a HEAD request on the given URL and return the status code + and associated message as a tuple. + + This raises a requests.RequestsException if a connection cannot be + made. + """ + resp = None + try: + resp = requests.head(url, allow_redirects=True) + return (resp.status_code, resp.reason) + finally: + if resp is not None: + resp.close() + + + def available_via_url(self, cmp): + """ + return True if the specified data file appears available via its + download URL. A HEAD request is conducted on the download URL; True + is returned if the request returns a 2XX status. + + The data file can either be specified via its component metadata (as a + dict) or directly by its downloadURL property (as a string). False + is returned if the property is not included in the component metadata + or if the URL does not match the base associated with the distribution + service. + + :param cmp: either a dict containing the component metadata describing + the data file or a string giving the file's download URL. + """ + dlurl = cmp + if isinstance(cmp, Mapping): + if 'downloadURL' not in cmp: + return False + dlurl = cmp['downloadURL'] + cmp = cmp.get('filepath', dlurl) + + try: + (stat, msg) = self.head_url(dlurl) + ok = stat >= 200 and stat < 300 + if not ok and self.log: + self.log.debug("HEAD on %s: %s (%i)", cmp, msg, stat) + return ok + except requests.RequestException as ex: + if self.log: + self.log.warn("Trouble accessing download URL: " + str(ex) + + "\n ({0})".format(cmp)) + return False + + def available_as(self, cmp, strict=False, viadistrib=True): + """ + return an enumeration value indicating how the specified data file is + found to be available. + + :param dict cmp: a dict containing the component metadata describing + the data file + :param bool strict: if True, don't assume if remote bag containing the + file is available that the file is actually in the + bag. Currently, this implementation will return + False if the file is not available from any other + source. + :param bool viadistrib: if True, only check to see if the file is + available via its downloadURL if the URL points + to the PDR's distribution service. + """ + if self.available_in_bag(cmp): + return self.AVAIL_IN_BAG + if self.available_in_cached_bag(cmp): + return self.AVAIL_IN_CACHED_BAG + if (not viadistrib or self.has_pdr_url(cmp.get('downloadURL',''))) and \ + self.available_via_url(cmp): + return self.AVAIL_VIA_URL + if not strict and self._distsvc and self.containing_bag_available(cmp): + return self.AVAIL_IN_REMOTE_BAG + return self.AVAIL_NOT + + def available(self, cmp, strict=False): + """ + return True if the specified data file is currently available somewhere. + This function (using available_as()) will cycle through possible + locations of the file, searching until it finds it. This includes: + 1. the current bag + 2. in a bag located in a local cache + 3. at its download URL + 4. in a remote bag available via the distribution service* + + When the file is found, True is returned; otherwise, False is returned. + + *in this implementation with location (4), the remote bag's contents + are not examined; only the availability of that bag is checked. + """ + return self.available_as(cmp, strict) is not self.AVAIL_NOT + + def containing_bag_available(self, cmp): + """ + return True if the member bag that contains the specified component + is available via the distribution service. An exception is raised + if this checker was not configured with the distribution service + endpoint configured or if the service is not available. + + The file can be specified either via its component metadata (as a + dict) or directly by its filepath property (as a string). False is + returned if either the filepath is not found in a cached bag, if + the location of the bag cache directory is not known, or if the + filepath property is not included in the given component metadata. + + :param cmp: either a dict containing the component metadata describing + the data file or a string giving the file's filepath. + """ + if isinstance(cmp, Mapping): + if 'filepath' not in cmp: + return False + cmp = cmp['filepath'] + + mbagname = self.bag_location(cmp) + if not mbagname: + return False + try: + parts = parse_bag_name(mbagname) + except ValueError as ex: + if self.log: + self.log.warn("data file listed as in bag with illegal name: "+ + mbagname) + return False + parts[1] = parts[1] or "0" + parts[1] = re.sub(r'_','.',parts[1]) + + if not self._distsvc: + raise StateException("Distribution service not configured") + bagsvc = BagDistribClient(parts[0], self._distsvc) + + try: + matches = [f for f in bagsvc.list_for_version(parts[1]) + if f.startswith(mbagname+".")] + return len(matches) > 0 + + except DistribResourceNotFound as ex: + if self.log: + self.log.debug("No bags for %s found via bag service", parts[0]) + return False + + except DistribServerError as ex: + if self.log: + self.log.error("query on %s: service connect error: %s", + mbagname, str(ex)) + return False + + except DistribServiceError as ex: + if self.log: + self.log.error("unexpected error while querying on %s: %s", + mbagname, str(ex)) + + + def unavailable_files(self): + """ + return a list of the data file component filepaths that appear to + be unavailable via any means. This is a check to make sure that all + of the distributions listed in the NERDm record are either in the + present bag or otherwise previously preserved and available; in this + case, the returned list will be empty. + """ + missing = [] + nerd = self.bag.nerdm_record(False) + for cmp in nerd.get('components',[]): + if "dcat:Distribution" not in cmp.get('@type',[]) or \ + 'downloadURL' not in cmp: + continue + if not self.available(cmp): + missing.append(cmp.get('filepath') or cmp.get('downloadURL')) + + return missing + + def all_files_available(self): + """ + return True if all of the data file components are available in some + form. This is a check to make sure that all + of the distributions listed in the NERDm record are either in the + present bag or otherwise previously preserved and available; in this + case, the returned list will be empty. + """ + return len(self.unavailable_files()) == 0 + + def unindexed_files(self): + """ + return the data file component filepaths that are missing from the + mulitbag file lookup list. This is a check to make sure that all + of the distributions listed in the NERDm record are findable either in + the present bag or other member bags; in this case, the returned list + will be empty. + """ + missing = [] + nerd = self.bag.nerdm_record(False) + for cmp in nerd.get('components',[]): + if "dcat:Distribution" not in cmp.get('@type',[]) or \ + 'filepath' not in cmp: + continue + if not self.bag_location(cmp): + missing.append(cmp.get('filepath') or cmp.get('downloadURL')) + + return missing + + def all_files_indexed(self): + """ + return True if all the data file components given in the NERDm metadata + are included in the multibag file lookup list. This is a check to make + sure that all of the distributions listed in the NERDm record are + findable either in the present bag or other member bags. + """ + return len(self.unindexed_files()) == 0 + + def check_all_data_files(self): + """ + return True if all the data files described in the NERDm metadata are + findable and available. This returns False if either + all_files_indexed() or all_files_available() return False. + """ + return self.all_files_indexed() and self.all_files_available() + diff --git a/python/tests/nistoar/pdr/preserv/bagger/test_datachecker.py b/python/tests/nistoar/pdr/preserv/bagger/test_datachecker.py new file mode 100644 index 000000000..7b3fb27c5 --- /dev/null +++ b/python/tests/nistoar/pdr/preserv/bagger/test_datachecker.py @@ -0,0 +1,337 @@ +from __future__ import print_function +import os, sys, pdb, shutil, logging, json, time, re + +import unittest as test + +from nistoar.testing import * +from nistoar.pdr.distrib import client as dcli +from nistoar.pdr.preserv.bagit.bag import NISTBag +import nistoar.pdr.preserv.bagger.datachecker as dc + +storedir = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), "distrib", "data") +basedir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(storedir)))))) + +port = 9091 +baseurl = "http://localhost:{0}/".format(port) + +def startService(authmeth=None): + tdir = tmpdir() + srvport = port + if authmeth == 'header': + srvport += 1 + pidfile = os.path.join(tdir,"simsrv"+str(srvport)+".pid") + + wpy = "python/tests/nistoar/pdr/distrib/sim_distrib_srv.py" + cmd = "uwsgi --daemonize {0} --plugin python --http-socket :{1} " \ + "--wsgi-file {2} --pidfile {3}" + cmd = cmd.format(os.path.join(tdir,"simsrv.log"), srvport, + os.path.join(basedir, wpy), pidfile) + return os.system(cmd) == 0 + +def stopService(authmeth=None): + tdir = tmpdir() + srvport = port + if authmeth == 'header': + srvport += 1 + pidfile = os.path.join(tdir,"simsrv"+str(srvport)+".pid") + + cmd = "uwsgi --stop {0}".format(os.path.join(tdir, + "simsrv"+str(srvport)+".pid")) + os.system(cmd) + time.sleep(1) + +loghdlr = None +rootlog = None +def setUpModule(): + ensure_tmpdir() + rootlog = logging.getLogger() + loghdlr = logging.FileHandler(os.path.join(tmpdir(),"test_simsrv.log")) + loghdlr.setLevel(logging.DEBUG) + rootlog.addHandler(loghdlr) + +def tearDownModule(): + global loghdlr + if loghdlr: + if rootlog: + rootlog.removeLog(loghdlr) + loghdlr = None + rmtmpdir() + +class TestDataChecker(test.TestCase): + + hbagsrc = os.path.join(storedir, "pdr2210.3_1_3.mbag0_3-5.zip") + + def setUp(self): + self.tf = Tempfiles() + bagp = self.tf.mkdir("preserv") + + uz = "cd %s && unzip -q %s" % (bagp, self.hbagsrc) + if os.system(uz) != 0: + raise RuntimeError("Failed to unpack sample bag") + self.hbag = os.path.join(bagp, os.path.basename(self.hbagsrc[:-4])) + + self.config = { 'store_dir': storedir } + self.ckr = dc.DataChecker(NISTBag(self.hbag), self.config, + logging.getLogger("datachecker")) + + def tearDown(self): + self.ckr = None + self.tf.clean() + + def test_ctor(self): + self.assertEqual(self.ckr.bag.name, "pdr2210.3_1_3.mbag0_3-5") + self.assertTrue(self.ckr.log) + self.assertFalse(self.ckr._distsvc) + + def test_available_in_bag(self): + self.assertTrue(self.ckr.available_in_bag('trial1.json')) + self.assertTrue(not self.ckr.available_in_bag('trial2.json')) + self.assertTrue(not self.ckr.available_in_bag('trial3/trial3a.json')) + self.assertTrue(not self.ckr.available_in_bag('goob.txt')) + + cmp = self.ckr.bag.nerd_metadata_for('trial1.json') + self.assertTrue(self.ckr.available_in_bag(cmp)) + del cmp['filepath'] + self.assertTrue(not self.ckr.available_in_bag(cmp)) + cmp = self.ckr.bag.nerd_metadata_for('trial2.json') + self.assertTrue(not self.ckr.available_in_bag(cmp)) + cmp = self.ckr.bag.nerd_metadata_for('trial3/trial3a.json') + self.assertTrue(not self.ckr.available_in_bag(cmp)) + + def test_bag_location(self): + self.assertEqual(self.ckr.bag_location('trial1.json'), + "pdr2210.3_1_3.mbag0_3-5") + self.assertEqual(self.ckr.bag_location('trial2.json'), + "pdr2210.1_0.mbag0_3-1") + self.assertEqual(self.ckr.bag_location('trial3/trial3a.json'), + "pdr2210.2.mbag0_3-2") + self.assertIsNone(self.ckr.bag_location('goob.txt')); + + cmp = self.ckr.bag.nerd_metadata_for('trial1.json') + self.assertEqual(self.ckr.bag_location(cmp), "pdr2210.3_1_3.mbag0_3-5") + del cmp['filepath'] + self.assertIsNone(self.ckr.bag_location(cmp)); + cmp = self.ckr.bag.nerd_metadata_for('trial2.json') + self.assertEqual(self.ckr.bag_location(cmp), "pdr2210.1_0.mbag0_3-1") + cmp = self.ckr.bag.nerd_metadata_for('trial3/trial3a.json') + self.assertEqual(self.ckr.bag_location(cmp), "pdr2210.2.mbag0_3-2") + + def test_located_here(self): + self.assertTrue(self.ckr.located_here('trial1.json')) + self.assertTrue(not self.ckr.located_here('trial2.json')) + self.assertTrue(not self.ckr.located_here('trial3/trial3a.json')) + self.assertTrue(not self.ckr.located_here('goob.txt')) + + cmp = self.ckr.bag.nerd_metadata_for('trial1.json') + self.assertTrue(self.ckr.located_here(cmp)) + del cmp['filepath'] + self.assertTrue(not self.ckr.located_here(cmp)) + cmp = self.ckr.bag.nerd_metadata_for('trial2.json') + self.assertTrue(not self.ckr.located_here(cmp)) + cmp = self.ckr.bag.nerd_metadata_for('trial3/trial3a.json') + self.assertTrue(not self.ckr.located_here(cmp)) + + def test_availabe_in_cached_bag(self): + self.assertTrue(self.ckr.available_in_cached_bag('trial1.json')) + self.assertTrue(self.ckr.available_in_cached_bag('trial2.json')) + self.assertTrue(self.ckr.available_in_cached_bag('trial3/trial3a.json')) + self.assertTrue(not self.ckr.available_in_cached_bag('goob.txt')) + + cmp = self.ckr.bag.nerd_metadata_for('trial1.json') + self.assertTrue(self.ckr.available_in_cached_bag(cmp)) + del cmp['filepath'] + self.assertTrue(not self.ckr.available_in_cached_bag(cmp)) + cmp = self.ckr.bag.nerd_metadata_for('trial2.json') + self.assertTrue(self.ckr.available_in_cached_bag(cmp)) + cmp = self.ckr.bag.nerd_metadata_for('trial3/trial3a.json') + self.assertTrue(self.ckr.available_in_cached_bag(cmp)) + + cmp['filepath'] = "goob.txt" + self.assertFalse(self.ckr.available_in_cached_bag(cmp)) + + with open(os.path.join(self.ckr.bag.dir,"multibag","file-lookup.tsv"), + 'a') as fd: + fd.write("data/goob.txt\t"+self.ckr.bag.name+"\n") + self.ckr = dc.DataChecker(NISTBag(self.hbag), self.config, + logging.getLogger("datachecker")) + self.assertTrue(self.ckr.bag_location("goob.txt")) + self.assertFalse(self.ckr.available_in_cached_bag(cmp)) + + def test_has_pdr_url(self): + self.assertTrue(self.ckr.has_pdr_url("http://localhost:8888/od/ds/blah")) + self.assertFalse(self.ckr.has_pdr_url("http://localhost:8888/goob/blah")) + + cmp = self.ckr.bag.nerd_metadata_for('trial1.json') + self.assertTrue(self.ckr.has_pdr_url(cmp)) + del cmp['downloadURL'] + self.assertTrue(not self.ckr.has_pdr_url(cmp)) + cmp = self.ckr.bag.nerd_metadata_for('trial2.json') + self.assertTrue(self.ckr.has_pdr_url(cmp)) + cmp = self.ckr.bag.nerd_metadata_for('trial3/trial3a.json') + self.assertTrue(self.ckr.has_pdr_url(cmp)) + + def test_unindexed_files(self): + self.assertEqual(len(self.ckr.unindexed_files()), 0) + self.assertTrue(self.ckr.all_files_indexed()) + + def test_unindexed_files_false(self): + self.assertTrue(self.ckr.all_files_indexed()) + + shutil.copytree(os.path.join(self.ckr.bag.metadata_dir, "trial1.json"), + os.path.join(self.ckr.bag.metadata_dir, "goob.json")) + nerdm = self.ckr.bag.nerd_metadata_for("goob.json") + nerdm['filepath'] = "goob.json" + nerdm['downloadURL'] = re.sub(r'trial1.json','goob.json', + nerdm['downloadURL']) + with open(os.path.join(self.ckr.bag.metadata_dir, "goob.json", + "nerdm.json"), 'w') as fd: + json.dump(nerdm, fd, indent=2) + + self.assertEqual(self.ckr.unindexed_files(), ['goob.json']) + self.assertFalse(self.ckr.all_files_indexed()) + self.assertFalse(self.ckr.available_in_cached_bag(nerdm)) + self.assertEqual(self.ckr.unavailable_files(), ['goob.json']) + self.assertFalse(self.ckr.all_files_available()) + + +class TestDataCheckerWithService(test.TestCase): + + hbagsrc = os.path.join(storedir, "pdr2210.3_1_3.mbag0_3-5.zip") + hbag = None + + @classmethod + def setUpClass(cls): + if not startService(): + raise RuntimeError("Failed to start mock service") + + @classmethod + def tearDownClass(cls): + stopService() + + def setUp(self): + self.tf = Tempfiles() + bagp = self.tf.mkdir("preserv") + + uz = "cd %s && unzip -q %s" % (bagp, self.hbagsrc) + if os.system(uz) != 0: + raise RuntimeError("Failed to unpack sample bag") + self.hbag = os.path.join(bagp, os.path.basename(self.hbagsrc[:-4])) + + self.config = { + 'store_dir': storedir, + 'repo_access': { + 'distrib_service': { + 'service_endpoint': 'http://localhost:9091/' + } + } + } + self.ckr = dc.DataChecker(NISTBag(self.hbag), self.config, + logging.getLogger("datachecker")) + + def tearDown(self): + self.ckr = None + self.tf.clean() + + def test_ctor(self): + self.assertEqual(self.ckr.bag.name, "pdr2210.3_1_3.mbag0_3-5") + self.assertTrue(self.ckr.log) + self.assertTrue(self.ckr._distsvc) + + def test_head_url(self): + (stat, msg) = dc.DataChecker.head_url( + "http://localhost:9091/od/ds/_aip/pdr1010.mbag0_3-1.zip") + self.assertEqual(stat, 200, msg) + (stat, msg) = dc.DataChecker.head_url( + "http://localhost:9091/_aip/pdr1010.mbag0_3-1.zip") + self.assertEqual(stat, 200, msg) + + def test_available_via_url(self): + self.assertTrue(self.ckr.available_via_url( + "http://localhost:9091/od/ds/_aip/pdr1010.mbag0_3-1.zip")) + self.assertTrue(self.ckr.available_via_url( + "http://localhost:9091/_aip/pdr1010.mbag0_3-1.zip")) + + ## mock server not capable of extracting distributions from bags + ## + # cmp = self.ckr.bag.nerd_metadata_for('trial1.json') + # cmp['downloadURL'] = \ + # re.sub(r'data\.nist\.gov', 'localhost:9091', cmp['downloadURL']) + # self.assertTrue(self.ckr.available_via_url(cmp)) + + # cmp = self.ckr.bag.nerd_metadata_for('trial3/trial3a.json') + # cmp['downloadURL'] = \ + # re.sub(r'data\.nist\.gov', 'localhost:9091', cmp['downloadURL']) + # self.assertTrue(self.ckr.available_via_url(cmp)) + + def test_containing_bag_available(self): + self.assertTrue(self.ckr.containing_bag_available("trial1.json")) + self.assertTrue(self.ckr.containing_bag_available("trial2.json")) + self.assertTrue(self.ckr.containing_bag_available("trial3/trial3a.json")) + self.assertFalse(self.ckr.containing_bag_available("goober")) + + cmp = self.ckr.bag.nerd_metadata_for('trial1.json') + self.assertTrue(self.ckr.containing_bag_available(cmp)) + + cmp = self.ckr.bag.nerd_metadata_for('trial2.json') + self.assertTrue(self.ckr.containing_bag_available(cmp)) + + cmp = self.ckr.bag.nerd_metadata_for('trial3/trial3a.json') + self.assertTrue(self.ckr.containing_bag_available(cmp)) + + cmp['filepath'] = "goob/gurn.txt" + self.assertFalse(self.ckr.containing_bag_available(cmp)) + + + def test_available_as(self): + cmp = self.ckr.bag.nerd_metadata_for('trial1.json') + self.assertIs(self.ckr.available_as(cmp), self.ckr.AVAIL_IN_BAG) + + cmp = self.ckr.bag.nerd_metadata_for('trial2.json') + self.assertIs(self.ckr.available_as(cmp), self.ckr.AVAIL_IN_CACHED_BAG) + + cmp = self.ckr.bag.nerd_metadata_for('trial3/trial3a.json') + self.assertIs(self.ckr.available_as(cmp), self.ckr.AVAIL_IN_CACHED_BAG) + + del cmp['filepath'] + self.assertIs(self.ckr.available_as(cmp, True), self.ckr.AVAIL_NOT) + + def test_available(self): + cmp = self.ckr.bag.nerd_metadata_for('trial1.json') + self.assertTrue(self.ckr.available(cmp)) + + cmp = self.ckr.bag.nerd_metadata_for('trial2.json') + self.assertTrue(self.ckr.available(cmp)) + + cmp = self.ckr.bag.nerd_metadata_for('trial3/trial3a.json') + self.assertTrue(self.ckr.available(cmp)) + + del cmp['filepath'] + self.assertFalse(self.ckr.available(cmp, True)) + + def test_unindexed_files_false(self): + self.assertTrue(self.ckr.all_files_indexed()) + + shutil.copytree(os.path.join(self.ckr.bag.metadata_dir, "trial1.json"), + os.path.join(self.ckr.bag.metadata_dir, "goob.json")) + nerdm = self.ckr.bag.nerd_metadata_for("goob.json") + nerdm['filepath'] = "goob.json" + nerdm['downloadURL'] = re.sub(r'trial1.json','goob.json', + nerdm['downloadURL']) + with open(os.path.join(self.ckr.bag.metadata_dir, "goob.json", + "nerdm.json"), 'w') as fd: + json.dump(nerdm, fd, indent=2) + + self.assertEqual(self.ckr.unindexed_files(), ['goob.json']) + self.assertFalse(self.ckr.all_files_indexed()) + self.assertFalse(self.ckr.available_in_cached_bag(nerdm)) + self.assertEqual(self.ckr.unavailable_files(), ['goob.json']) + self.assertFalse(self.ckr.all_files_available()) + + def test_unavailable_files(self): + self.assertEqual(len(self.ckr.unavailable_files()), 0) + self.assertTrue(self.ckr.all_files_available()) + + + +if __name__ == '__main__': + test.main() From f3b56c5887194c96898bcc960f9b7e109699dc18 Mon Sep 17 00:00:00 2001 From: Ray Plante Date: Tue, 6 Aug 2019 12:53:18 -0400 Subject: [PATCH 04/13] preservation: integrate datachecker into midas bagger; debug --- .../nistoar/pdr/preserv/bagger/datachecker.py | 54 +++++++++++++++---- python/nistoar/pdr/preserv/bagger/midas.py | 39 ++++++++++++++ .../nistoar/pdr/preserv/bagger/test_midas.py | 6 +++ 3 files changed, 88 insertions(+), 11 deletions(-) diff --git a/python/nistoar/pdr/preserv/bagger/datachecker.py b/python/nistoar/pdr/preserv/bagger/datachecker.py index cd2f60598..423bcbf1c 100644 --- a/python/nistoar/pdr/preserv/bagger/datachecker.py +++ b/python/nistoar/pdr/preserv/bagger/datachecker.py @@ -268,7 +268,7 @@ def available_as(self, cmp, strict=False, viadistrib=True): return self.AVAIL_IN_REMOTE_BAG return self.AVAIL_NOT - def available(self, cmp, strict=False): + def available(self, cmp, strict=False, viadistrib=True): """ return True if the specified data file is currently available somewhere. This function (using available_as()) will cycle through possible @@ -283,7 +283,7 @@ def available(self, cmp, strict=False): *in this implementation with location (4), the remote bag's contents are not examined; only the availability of that bag is checked. """ - return self.available_as(cmp, strict) is not self.AVAIL_NOT + return self.available_as(cmp, strict, viadistrib) is not self.AVAIL_NOT def containing_bag_available(self, cmp): """ @@ -345,13 +345,22 @@ def containing_bag_available(self, cmp): mbagname, str(ex)) - def unavailable_files(self): + def unavailable_files(self, strict=False, viadistrib=True): """ return a list of the data file component filepaths that appear to be unavailable via any means. This is a check to make sure that all of the distributions listed in the NERDm record are either in the present bag or otherwise previously preserved and available; in this case, the returned list will be empty. + + :param bool strict: if True, don't assume if remote bag containing the + file is available that the file is actually in the + bag. Currently, this implementation will return + False if the file is not available from any other + source. + :param bool viadistrib: if True, check a file's availability only if + its download URL points to the PDR's + distribution service. """ missing = [] nerd = self.bag.nerdm_record(False) @@ -359,28 +368,39 @@ def unavailable_files(self): if "dcat:Distribution" not in cmp.get('@type',[]) or \ 'downloadURL' not in cmp: continue - if not self.available(cmp): + if viadistrib and 'downloadURL' in cmp and \ + not self.has_pdr_url(cmp['downloadURL']): + continue + if not self.available(cmp, strict, False): missing.append(cmp.get('filepath') or cmp.get('downloadURL')) return missing - def all_files_available(self): + def all_files_available(self, strict=False, viadistrib=True): """ return True if all of the data file components are available in some form. This is a check to make sure that all of the distributions listed in the NERDm record are either in the present bag or otherwise previously preserved and available; in this case, the returned list will be empty. + + :param bool viadistrib: if True, check only those files if its + downloadURL if the URL points to the PDR's + distribution service. """ - return len(self.unavailable_files()) == 0 + return len(self.unavailable_files(strict, viadistrib)) == 0 - def unindexed_files(self): + def unindexed_files(self, viadistrib=True): """ return the data file component filepaths that are missing from the mulitbag file lookup list. This is a check to make sure that all of the distributions listed in the NERDm record are findable either in the present bag or other member bags; in this case, the returned list will be empty. + + :param bool viadistrib: if True, check only those files if its + downloadURL if the URL points to the PDR's + distribution service. """ missing = [] nerd = self.bag.nerdm_record(False) @@ -388,25 +408,37 @@ def unindexed_files(self): if "dcat:Distribution" not in cmp.get('@type',[]) or \ 'filepath' not in cmp: continue + if viadistrib and 'downloadURL' in cmp and \ + not self.has_pdr_url(cmp['downloadURL']): + continue if not self.bag_location(cmp): missing.append(cmp.get('filepath') or cmp.get('downloadURL')) return missing - def all_files_indexed(self): + def all_files_indexed(self, viadistrib=True): """ return True if all the data file components given in the NERDm metadata are included in the multibag file lookup list. This is a check to make sure that all of the distributions listed in the NERDm record are findable either in the present bag or other member bags. + + :param bool viadistrib: if True, check only those files if its + downloadURL if the URL points to the PDR's + distribution service. """ - return len(self.unindexed_files()) == 0 + return len(self.unindexed_files(viadistrib)) == 0 - def check_all_data_files(self): + def check_all_data_files(self, strict=False, viadistrib=True): """ return True if all the data files described in the NERDm metadata are findable and available. This returns False if either all_files_indexed() or all_files_available() return False. + + :param bool viadistrib: if True, check only those files if its + downloadURL if the URL points to the PDR's + distribution service. """ - return self.all_files_indexed() and self.all_files_available() + return self.all_files_indexed(viadistrib) and \ + self.all_files_available(strict, viadistrib) diff --git a/python/nistoar/pdr/preserv/bagger/midas.py b/python/nistoar/pdr/preserv/bagger/midas.py index c6d73b26b..93446b5f4 100644 --- a/python/nistoar/pdr/preserv/bagger/midas.py +++ b/python/nistoar/pdr/preserv/bagger/midas.py @@ -12,6 +12,7 @@ import os, errno, logging, re, json, shutil, threading, time from abc import ABCMeta, abstractmethod, abstractproperty from collections import OrderedDict +from copy import deepcopy from .base import SIPBagger, moddate_of, checksum_of, read_pod from .base import sys as _sys @@ -25,6 +26,7 @@ ConfigurationException, StateException, PODError, PreservationStateException) from .prepupd import UpdatePrepService +from .datachecker import DataChecker from nistoar.nerdm.merge import MergerFactory # _sys = PreservationSystem() @@ -1084,6 +1086,9 @@ def _make_bag_impl(self): if finalcfg.get('validate', True): # this will raise an exception if any issues are found self._validate(finalcfg.get('validator', {})) + if finalcfg.get('check_data_files', True): + # this will raise an exception if any issues are found + self._check_data_files(finalcfg.get('data_checker', {})) return self.bagbldr.bagdir @@ -1241,3 +1246,37 @@ def _validate(self, config): else: log.info("%s: bag validation completed without issues", self.bagbldr.bagname) + + def _check_data_files(self, data_checker_config, viadistrib=True): + """ + make sure all of the data files are accounted for. The bag must + either contain all of the data files listed in the nerdm components + or they must be available else where in the publishing pipeline: + the output storage dir (possibly still avaiting migration to the + repository) or already published in the repository. + """ + config = { + "repo_access": self.cfg.get('repo_access', {}), + "store_dir": self.cfg.get('store_dir') + } + config.update( deepcopy(data_checker_config) ) + + chkr = DataChecker(self.bagbldr.bag, config,log.getChild("data_checker")) + + missing = chkr.unindexed_files(viadistrib=viadistrib) + if len(missing) > 0: + log.error("master bag for id=%s is missing the following "+ + "files from the multibag file index:\n %s", + self.name, "\n ".join(missing)) + raise AIPValidationError("Bag data check failure: data files are " + + "missing from the multibag file index") + + missing = chkr.unavailable_files(viadistrib=viadistrib) + if len(missing) > 0: + log.error("unable to locate the following files described " + + "in master bag for id=%s:\n %s", + self.name, "\n ".join(missing)) + raise AIPValidationError("Bag data check failure: unable to locate "+ + "some data files in any available bags") + + diff --git a/python/tests/nistoar/pdr/preserv/bagger/test_midas.py b/python/tests/nistoar/pdr/preserv/bagger/test_midas.py index a018ff46d..3d8e065d4 100644 --- a/python/tests/nistoar/pdr/preserv/bagger/test_midas.py +++ b/python/tests/nistoar/pdr/preserv/bagger/test_midas.py @@ -1001,6 +1001,12 @@ def test_make_bag(self): self.assertTrue(os.path.isfile(os.path.join(self.bagr.bagdir, "about.txt"))) + # make sure we could've found missing files + self.bagr._check_data_files(self.bagr.cfg.get('data_checker',{})) + with self.assertRaises(AIPValidationError): + self.bagr._check_data_files(self.bagr.cfg.get('data_checker',{}), + viadistrib=False) + def test_determine_updated_version(self): self.bagr.prepare(nodata=False) bag = NISTBag(self.bagr.bagdir) From 3633d62e59db2014b332f630a1be7a2f03622a43 Mon Sep 17 00:00:00 2001 From: Ray Plante Date: Tue, 6 Aug 2019 13:57:54 -0400 Subject: [PATCH 05/13] python: fix tests for changes to test aip names --- .../pdr/distrib/test_sim_distrib_srv.py | 52 ++++++++++--------- .../pdr/preserv/bagger/test_prepupd_cacher.py | 4 +- 2 files changed, 29 insertions(+), 27 deletions(-) diff --git a/python/tests/nistoar/pdr/distrib/test_sim_distrib_srv.py b/python/tests/nistoar/pdr/distrib/test_sim_distrib_srv.py index c02b0c971..613f1d75b 100644 --- a/python/tests/nistoar/pdr/distrib/test_sim_distrib_srv.py +++ b/python/tests/nistoar/pdr/distrib/test_sim_distrib_srv.py @@ -103,13 +103,15 @@ def test_ctor(self): self.assertIn("pdr2210.1_0.mbag0_3-0.zip", self.arch._aips['pdr2210']['1.0']) - self.assertEqual(len(self.arch._aips['pdr2210']['1.0']), 1) - self.assertIn("pdr2210.2.mbag0_3-1.zip", + self.assertIn("pdr2210.1_0.mbag0_3-1.zip", + self.arch._aips['pdr2210']['1.0']) + self.assertEqual(len(self.arch._aips['pdr2210']['1.0']), 2) + self.assertIn("pdr2210.2.mbag0_3-2.zip", self.arch._aips['pdr2210']['2']) - self.assertEqual(len(self.arch._aips['pdr2210']['1.0']), 1) - self.assertIn("pdr2210.3_1_3.mbag0_3-4.zip", + self.assertEqual(len(self.arch._aips['pdr2210']['2']), 1) + self.assertIn("pdr2210.3_1_3.mbag0_3-5.zip", self.arch._aips['pdr2210']['3.1.3']) - self.assertEqual(len(self.arch._aips['pdr2210']['1.0']), 1) + self.assertEqual(len(self.arch._aips['pdr2210']['3.1.3']), 1) self.assertIn("1491.1_0.mbag0_4-0.zip", self.arch._aips['1491']['1.0']) @@ -131,8 +133,8 @@ def test_list_bags(self): self.assertEqual([f['name'] for f in self.arch.list_bags('pdr1010')], ["pdr1010.mbag0_3-1.zip", "pdr1010.mbag0_3-2.zip"]) self.assertEqual([f['name'] for f in self.arch.list_bags('pdr2210')], - ["pdr2210.1_0.mbag0_3-0.zip", "pdr2210.2.mbag0_3-1.zip", - "pdr2210.3_1_3.mbag0_3-4.zip"]) + ["pdr2210.1_0.mbag0_3-0.zip", "pdr2210.1_0.mbag0_3-1.zip", + "pdr2210.2.mbag0_3-2.zip", "pdr2210.3_1_3.mbag0_3-5.zip"]) self.assertEqual(self.arch.list_bags('pdr1010')[0], {'name': 'pdr1010.mbag0_3-1.zip', 'aipid': 'pdr1010', 'contentLength': 375, 'sinceVersion': '1', @@ -152,13 +154,13 @@ def test_list_for_version(self): self.assertEqual([f['name'] for f in self.arch.list_for_version('pdr2210', '1.0')], - ["pdr2210.1_0.mbag0_3-0.zip"]) + ["pdr2210.1_0.mbag0_3-0.zip", "pdr2210.1_0.mbag0_3-1.zip"]) self.assertEqual([f['name'] for f in self.arch.list_for_version('pdr2210', '2')], - ["pdr2210.2.mbag0_3-1.zip"]) + ["pdr2210.2.mbag0_3-2.zip"]) self.assertEqual([f['name'] for f in self.arch.list_for_version('pdr2210', '3.1.3')], - ["pdr2210.3_1_3.mbag0_3-4.zip"]) + ["pdr2210.3_1_3.mbag0_3-5.zip"]) self.assertEqual([f['name'] for f in self.arch.list_for_version('pdr2210', '3.1.2')], []) @@ -171,20 +173,20 @@ def test_list_for_latest_version(self): ["pdr1010.mbag0_3-1.zip", "pdr1010.mbag0_3-2.zip"]) self.assertEqual([f['name'] for f in self.arch.list_for_version('pdr2210', 'latest')], - ["pdr2210.3_1_3.mbag0_3-4.zip"]) + ["pdr2210.3_1_3.mbag0_3-5.zip"]) self.assertEqual([f['name'] for f in self.arch.list_for_version('pdr2210')], - ["pdr2210.3_1_3.mbag0_3-4.zip"]) + ["pdr2210.3_1_3.mbag0_3-5.zip"]) def test_head_for(self): self.assertEqual(self.arch.head_for('pdr1010', '1')['name'], "pdr1010.mbag0_3-2.zip") self.assertEqual(self.arch.head_for('pdr2210', '1.0')['name'], - "pdr2210.1_0.mbag0_3-0.zip") + "pdr2210.1_0.mbag0_3-1.zip") self.assertEqual(self.arch.head_for('pdr2210', '2')['name'], - "pdr2210.2.mbag0_3-1.zip") + "pdr2210.2.mbag0_3-2.zip") self.assertEqual(self.arch.head_for('pdr2210', '3.1.3')['name'], - "pdr2210.3_1_3.mbag0_3-4.zip") + "pdr2210.3_1_3.mbag0_3-5.zip") self.assertIsNone(self.arch.head_for('pdr2210', '3')) def test_head_for_latest(self): @@ -236,8 +238,8 @@ def test_list_bags(self): self.assertEqual(resp.status_code, 200) self.assertEqual(resp.reason, "All bags for ID") self.assertEqual([f['name'] for f in resp.json()], - ["pdr2210.1_0.mbag0_3-0.zip", "pdr2210.2.mbag0_3-1.zip", - "pdr2210.3_1_3.mbag0_3-4.zip"]) + ["pdr2210.1_0.mbag0_3-0.zip", "pdr2210.1_0.mbag0_3-1.zip", + "pdr2210.2.mbag0_3-2.zip", "pdr2210.3_1_3.mbag0_3-5.zip"]) def test_versions_for(self): resp = requests.get(baseurl+"/pdr1010/_aip/_v") @@ -265,19 +267,19 @@ def test_list_for_version(self): self.assertEqual(resp.status_code, 200) self.assertEqual(resp.reason, "All bags for ID/vers") self.assertEqual([f['name'] for f in resp.json()], - ["pdr2210.1_0.mbag0_3-0.zip"]) + ["pdr2210.1_0.mbag0_3-0.zip", "pdr2210.1_0.mbag0_3-1.zip"]) resp = requests.get(baseurl+"/pdr2210/_aip/_v/2") self.assertEqual(resp.status_code, 200) self.assertEqual(resp.reason, "All bags for ID/vers") self.assertEqual([f['name'] for f in resp.json()], - ["pdr2210.2.mbag0_3-1.zip"]) + ["pdr2210.2.mbag0_3-2.zip"]) resp = requests.get(baseurl+"/pdr2210/_aip/_v/3.1.3") self.assertEqual(resp.status_code, 200) self.assertEqual(resp.reason, "All bags for ID/vers") self.assertEqual([f['name'] for f in resp.json()], - ["pdr2210.3_1_3.mbag0_3-4.zip"]) + ["pdr2210.3_1_3.mbag0_3-5.zip"]) def test_list_for_latest_version(self): resp = requests.get(baseurl+"/pdr1010/_aip/_v/latest") @@ -290,7 +292,7 @@ def test_list_for_latest_version(self): self.assertEqual(resp.status_code, 200) self.assertEqual(resp.reason, "All bags for ID/vers") self.assertEqual([f['name'] for f in resp.json()], - ["pdr2210.3_1_3.mbag0_3-4.zip"]) + ["pdr2210.3_1_3.mbag0_3-5.zip"]) def test_head(self): resp = requests.get(baseurl+"/pdr1010/_aip/_v/1/_head") @@ -301,17 +303,17 @@ def test_head(self): resp = requests.get(baseurl+"/pdr2210/_aip/_v/1.0/_head") self.assertEqual(resp.status_code, 200) self.assertEqual(resp.reason, "Head bags for ID/vers") - self.assertEqual(resp.json()['name'], "pdr2210.1_0.mbag0_3-0.zip") + self.assertEqual(resp.json()['name'], "pdr2210.1_0.mbag0_3-1.zip") resp = requests.get(baseurl+"/pdr2210/_aip/_v/2/_head") self.assertEqual(resp.status_code, 200) self.assertEqual(resp.reason, "Head bags for ID/vers") - self.assertEqual(resp.json()['name'], "pdr2210.2.mbag0_3-1.zip") + self.assertEqual(resp.json()['name'], "pdr2210.2.mbag0_3-2.zip") resp = requests.get(baseurl+"/pdr2210/_aip/_v/3.1.3/_head") self.assertEqual(resp.status_code, 200) self.assertEqual(resp.reason, "Head bags for ID/vers") - self.assertEqual(resp.json()['name'], "pdr2210.3_1_3.mbag0_3-4.zip") + self.assertEqual(resp.json()['name'], "pdr2210.3_1_3.mbag0_3-5.zip") resp = requests.get(baseurl+"/pdr1010/_aip/_v/2/_head") self.assertEqual(resp.status_code, 404) @@ -326,7 +328,7 @@ def test_head_latest(self): resp = requests.get(baseurl+"/pdr2210/_aip/_v/latest/_head") self.assertEqual(resp.status_code, 200) self.assertEqual(resp.reason, "Head bags for ID/vers") - self.assertEqual(resp.json()['name'], "pdr2210.3_1_3.mbag0_3-4.zip") + self.assertEqual(resp.json()['name'], "pdr2210.3_1_3.mbag0_3-5.zip") def test_download(self): out = os.path.join(tmpdir(), "bag.zip") diff --git a/python/tests/nistoar/pdr/preserv/bagger/test_prepupd_cacher.py b/python/tests/nistoar/pdr/preserv/bagger/test_prepupd_cacher.py index 8b2c315cd..6ad4746b7 100644 --- a/python/tests/nistoar/pdr/preserv/bagger/test_prepupd_cacher.py +++ b/python/tests/nistoar/pdr/preserv/bagger/test_prepupd_cacher.py @@ -177,7 +177,7 @@ def test_cache_headbag(self): info = self.cacher._recall_head_info("pdr1010") self.assertIn("1", info) - hbfile = os.path.join(self.cachedir, "pdr2210.2.mbag0_3-1.zip") + hbfile = os.path.join(self.cachedir, "pdr2210.2.mbag0_3-2.zip") infofile = os.path.join(self.infodir, "pdr2210") self.assertTrue(not os.path.exists(hbfile)) self.assertTrue(not os.path.exists(infofile)) @@ -188,7 +188,7 @@ def test_cache_headbag(self): info = self.cacher._recall_head_info("pdr2210") self.assertIn("2", info) - hbfile = os.path.join(self.cachedir, "pdr2210.3_1_3.mbag0_3-4.zip") + hbfile = os.path.join(self.cachedir, "pdr2210.3_1_3.mbag0_3-5.zip") self.assertTrue(not os.path.exists(hbfile)) self.assertTrue(os.path.exists(infofile)) self.assertEqual(self.cacher.cache_headbag("pdr2210"), hbfile) From c054d04b321f495448971b21a2e998c71d491461 Mon Sep 17 00:00:00 2001 From: Ray Plante Date: Tue, 6 Aug 2019 13:59:15 -0400 Subject: [PATCH 06/13] midas: require store_dir data file checks; supply store_dir --- python/nistoar/pdr/preserv/bagger/midas.py | 7 +++++++ python/nistoar/pdr/preserv/service/siphandler.py | 2 ++ python/tests/nistoar/pdr/preserv/bagger/test_midas.py | 3 ++- .../tests/nistoar/pdr/preserv/bagger/test_midas_update.py | 3 ++- 4 files changed, 13 insertions(+), 2 deletions(-) diff --git a/python/nistoar/pdr/preserv/bagger/midas.py b/python/nistoar/pdr/preserv/bagger/midas.py index 93446b5f4..a4c1602a2 100644 --- a/python/nistoar/pdr/preserv/bagger/midas.py +++ b/python/nistoar/pdr/preserv/bagger/midas.py @@ -896,6 +896,13 @@ def __init__(self, midasid, bagparent, reviewdir, mddir, config = {} super(PreservationBagger, self).__init__(bagparent, config) + # check for needed configuration + if self.cfg.get('check_data_files', True) and \ + not self.cfg.get('store_dir'): + raise ConfigurationException("PreservationBagger: store_dir " + + "config param needed") + + # do a sanity check on the bag parent directory if not self.cfg.get('relative_to_indir', False): sipdir = os.path.abspath(self.indir) diff --git a/python/nistoar/pdr/preserv/service/siphandler.py b/python/nistoar/pdr/preserv/service/siphandler.py index 47d4d88b6..88a4d836b 100644 --- a/python/nistoar/pdr/preserv/service/siphandler.py +++ b/python/nistoar/pdr/preserv/service/siphandler.py @@ -365,6 +365,8 @@ def __init__(self, sipid, config, minter=None, serializer=None, bgrcfg = config.get('bagger', {}) if 'repo_access' not in bgrcfg and 'repo_access' in config: bgrcfg['repo_access'] = config['repo_access'] + if 'store_dir' not in bgrcfg and 'store_dir' in config: + bgrcfg['store_dir'] = config['store_dir'] self.bagger = PreservationBagger(sipid, bagparent, self.sipparent, self.mdbagdir, bgrcfg, self._minter, self._asupdate, sipdirname) diff --git a/python/tests/nistoar/pdr/preserv/bagger/test_midas.py b/python/tests/nistoar/pdr/preserv/bagger/test_midas.py index 3d8e065d4..ec386d853 100644 --- a/python/tests/nistoar/pdr/preserv/bagger/test_midas.py +++ b/python/tests/nistoar/pdr/preserv/bagger/test_midas.py @@ -825,7 +825,8 @@ def setUp(self): 'NIST-BagIt-Version': "0.4", 'Multibag-Version': "0.4" } - } + }, + 'store_dir': '/tmp' } self.bagr = midas.PreservationBagger(self.midasid, '_preserv', diff --git a/python/tests/nistoar/pdr/preserv/bagger/test_midas_update.py b/python/tests/nistoar/pdr/preserv/bagger/test_midas_update.py index 6bd82ba03..4a953b3b8 100644 --- a/python/tests/nistoar/pdr/preserv/bagger/test_midas_update.py +++ b/python/tests/nistoar/pdr/preserv/bagger/test_midas_update.py @@ -160,7 +160,8 @@ def setUp(self): 'metadata_service': { 'service_endpoint': "http://localhost:9092/" } - } + }, + 'store_dir': distarchdir } self.bagr = midas.PreservationBagger(self.midasid, '_preserv', From 0f64820b4d09d6ccb9fbfeb99a540f7e6e54dd23 Mon Sep 17 00:00:00 2001 From: Ray Plante Date: Tue, 6 Aug 2019 21:23:28 -0400 Subject: [PATCH 07/13] preservation bug fix: allow preupd to pull head bags from the store dir --- python/nistoar/pdr/preserv/bagger/prepupd.py | 53 +++++++++++++++----- 1 file changed, 41 insertions(+), 12 deletions(-) diff --git a/python/nistoar/pdr/preserv/bagger/prepupd.py b/python/nistoar/pdr/preserv/bagger/prepupd.py index 2ca7c0b29..dfe35fdb2 100644 --- a/python/nistoar/pdr/preserv/bagger/prepupd.py +++ b/python/nistoar/pdr/preserv/bagger/prepupd.py @@ -152,6 +152,7 @@ def __init__(self, config): if not self.sercache: raise ConfigurationException("UpdatePrepService: Missing property: "+ "headbag_cache") + self.stagedir = self.cfg.get('store_dir') scfg = self.cfg.get('distrib_service', {}) self.distsvc = distrib.RESTServiceClient(scfg.get('service_endpoint')) scfg = self.cfg.get('metadata_service', {}) @@ -163,7 +164,7 @@ def prepper_for(self, aipid, version=None, log=None): return an UpdatePrepper instance for the given dataset identifier """ return UpdatePrepper(aipid, self.cfg, self.cacher, self.mdsvc, - version, log) + self.storedir, version, log) class UpdatePrepper(object): @@ -173,7 +174,7 @@ class UpdatePrepper(object): system. """ - def __init__(self, aipid, config, headcacher, pubmdclient, + def __init__(self, aipid, config, headcacher, pubmdclient, storedir=None, version=None, log=None): """ create the prepper for the given dataset identifier. @@ -183,6 +184,7 @@ def __init__(self, aipid, config, headcacher, pubmdclient, """ self.aipid = aipid self.cacher = headcacher + self.storedir = storedir self.version = version self.mdcli = pubmdclient self.mdcache = os.path.join(self.cacher.cachedir, "_nerd") @@ -225,6 +227,23 @@ def cache_nerdm_rec(self): return None return out + def find_bag_in_store(self, version): + """ + look for a bag for a particular version of the current AIP + in the bag storage directory. (This is the directory where + AIP bags are copied to for long-term storage.) + """ + if not self.storedir: + return None + + foraip = [f for f in os.listdir(self.storedir) + if f.startswith(self.aipid+'.') ] + foraip = bagutils.select_version(foraip, version) + if len(foraip) == 0: + return None + + return bagutils.find_latest_head_bag(foraip) + def aip_exists(self): """ return true if a previously ingested AIP with the current ID exists in @@ -300,22 +319,32 @@ def create_new_update(self, destbag): "\n (may indicate earlier failure)") shutil.rmtree(mdbag) - latest_headbag = self.cache_headbag() + latest_nerd = self.cache_nerdm_rec() + if not latest_nerd: + self.log.info("ID not published previously; will start afresh") + return False + version = self.version + if not version: + nerd = utils.read_nerd(latest_nerd) + version = nerd.get('version', '0') + + # This has been published before; look for a head bag in the store dir + latest_headbag = self.find_bag_in_store(version) + if not latest_headbag: + # store dir came up empty; try the distribution service + latest_headbag = self.cache_headbag() + if latest_headbag: fmt = "Preparing update based on previous head preservation bag (%s)" self.log.info(fmt, os.path.basename(latest_headbag)) self.create_from_headbag(latest_headbag, mdbag) return True - latest_nerd = self.cache_nerdm_rec() - if latest_nerd: - self.log.info("No previous bag available; preparing based on " + - "existing published NERDm record") - self.create_from_nerdm(latest_nerd, mdbag) - return True - - self.log.info("ID not published previously; will start afresh") - return False + # This dataset was "published" without a preservation bag + self.log.info("No previous bag available; preparing based on " + + "existing published NERDm record") + self.create_from_nerdm(latest_nerd, mdbag) + return True def create_from_headbag(self, headbag, mdbag): From 2e03126ca22e11550feb095e35a405f7c3f382c6 Mon Sep 17 00:00:00 2001 From: Ray Plante Date: Tue, 6 Aug 2019 21:33:10 -0400 Subject: [PATCH 08/13] preservation bug fix: pass store dir from siphandler down to UpdatePresService --- python/nistoar/pdr/preserv/bagger/midas.py | 1 - python/nistoar/pdr/preserv/bagger/prepupd.py | 2 +- python/nistoar/pdr/preserv/service/siphandler.py | 3 +++ 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/python/nistoar/pdr/preserv/bagger/midas.py b/python/nistoar/pdr/preserv/bagger/midas.py index a4c1602a2..69fddb78d 100644 --- a/python/nistoar/pdr/preserv/bagger/midas.py +++ b/python/nistoar/pdr/preserv/bagger/midas.py @@ -901,7 +901,6 @@ def __init__(self, midasid, bagparent, reviewdir, mddir, not self.cfg.get('store_dir'): raise ConfigurationException("PreservationBagger: store_dir " + "config param needed") - # do a sanity check on the bag parent directory if not self.cfg.get('relative_to_indir', False): diff --git a/python/nistoar/pdr/preserv/bagger/prepupd.py b/python/nistoar/pdr/preserv/bagger/prepupd.py index dfe35fdb2..63f9e5476 100644 --- a/python/nistoar/pdr/preserv/bagger/prepupd.py +++ b/python/nistoar/pdr/preserv/bagger/prepupd.py @@ -152,7 +152,7 @@ def __init__(self, config): if not self.sercache: raise ConfigurationException("UpdatePrepService: Missing property: "+ "headbag_cache") - self.stagedir = self.cfg.get('store_dir') + self.storedir = self.cfg.get('store_dir') scfg = self.cfg.get('distrib_service', {}) self.distsvc = distrib.RESTServiceClient(scfg.get('service_endpoint')) scfg = self.cfg.get('metadata_service', {}) diff --git a/python/nistoar/pdr/preserv/service/siphandler.py b/python/nistoar/pdr/preserv/service/siphandler.py index 88a4d836b..ff2a0d963 100644 --- a/python/nistoar/pdr/preserv/service/siphandler.py +++ b/python/nistoar/pdr/preserv/service/siphandler.py @@ -367,6 +367,9 @@ def __init__(self, sipid, config, minter=None, serializer=None, bgrcfg['repo_access'] = config['repo_access'] if 'store_dir' not in bgrcfg and 'store_dir' in config: bgrcfg['store_dir'] = config['store_dir'] + if 'store_dir' not in bgrcfg['repo_access']: + bgrcfg['repo_access']['store_dir'] = config['store_dir'] + self.bagger = PreservationBagger(sipid, bagparent, self.sipparent, self.mdbagdir, bgrcfg, self._minter, self._asupdate, sipdirname) From 996f570e8038227b2e1688952ec02ce66adc0145 Mon Sep 17 00:00:00 2001 From: Ray Plante Date: Tue, 6 Aug 2019 22:14:24 -0400 Subject: [PATCH 09/13] bagutils bug fix: select_version(): properly handle 0.2 bag form --- python/nistoar/pdr/preserv/bagger/utils.py | 10 ++++++---- python/tests/nistoar/pdr/preserv/bagger/test_utils.py | 6 ++++++ 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/python/nistoar/pdr/preserv/bagger/utils.py b/python/nistoar/pdr/preserv/bagger/utils.py index 8cb9e63fc..4ed9e3df1 100644 --- a/python/nistoar/pdr/preserv/bagger/utils.py +++ b/python/nistoar/pdr/preserv/bagger/utils.py @@ -384,14 +384,16 @@ def select_version(bagnames, version): # Most likely given current NIST practice, if version is simply "0" or "1", # we're refering to bags following the 0.2 naming convention. if version == "0" or version == "1": - out = selectVersion(bagnames, "") + out = select_version(bagnames, "") if len(out) > 0: return out - out = [] - vernamere = (version == "" and re.compile(r"^(\w+)\.mbag")) \ - or re.compile(r"^(\w+)\."+version+"\.") + if version == "": + vernamere = re.compile(r"^(\w[\w\-]+)\.mbag") + return [b for b in bagnames if vernamere.match(b)] + out = [] + vernamere = re.compile(r"^(\w[\w\-]+)\."+version+r"\.") while len(version) > 0: for name in bagnames: if vernamere.match(name): diff --git a/python/tests/nistoar/pdr/preserv/bagger/test_utils.py b/python/tests/nistoar/pdr/preserv/bagger/test_utils.py index ca9a27fe4..5d2e304e1 100644 --- a/python/tests/nistoar/pdr/preserv/bagger/test_utils.py +++ b/python/tests/nistoar/pdr/preserv/bagger/test_utils.py @@ -401,6 +401,12 @@ def test_select_version(self): ["mds3812.1_4.mbag0_4-20.tgz"]) self.assertEqual(bagut.select_version(names, "1.3"), names[0:2]+names[3:4]+names[5:8]) + self.assertEqual(bagut.select_version(names, ""), + ["mds3812.mbag0_4-4.tgz"]) + self.assertEqual(bagut.select_version(names, "0"), + ["mds3812.mbag0_4-4.tgz"]) + self.assertEqual(bagut.select_version(names, "1"), + ["mds3812.mbag0_4-4.tgz"]) def test_schuripat(self): self.assertTrue( From d396c12726be05bc0532e41582dcd5b44d393f58 Mon Sep 17 00:00:00 2001 From: Ray Plante Date: Wed, 7 Aug 2019 06:43:21 -0400 Subject: [PATCH 10/13] preservation bug fix: debug fix, update related tests --- python/nistoar/pdr/preserv/bagger/prepupd.py | 2 +- .../nistoar/pdr/preserv/service/siphandler.py | 4 +- .../nistoar/pdr/describe/data/pdr2210.json | 156 ++++++++++++++++++ .../distrib/data/pdr2210.1_0.mbag0_3-1.zip | Bin 9767 -> 10893 bytes .../pdr/distrib/data/pdr2210.2.mbag0_3-2.zip | Bin 9892 -> 11013 bytes .../distrib/data/pdr2210.3_1_3.mbag0_3-5.zip | Bin 9887 -> 11019 bytes .../pdr/preserv/bagger/test_midas_update.py | 10 ++ .../pdr/preserv/bagger/test_prepupd.py | 65 +++++++- .../nistoar/pdr/preserv/bagger/test_utils.py | 26 +++ .../service/test_siphandler_multibag.py | 9 + 10 files changed, 265 insertions(+), 7 deletions(-) create mode 100644 python/tests/nistoar/pdr/describe/data/pdr2210.json diff --git a/python/nistoar/pdr/preserv/bagger/prepupd.py b/python/nistoar/pdr/preserv/bagger/prepupd.py index 63f9e5476..b07e562a6 100644 --- a/python/nistoar/pdr/preserv/bagger/prepupd.py +++ b/python/nistoar/pdr/preserv/bagger/prepupd.py @@ -242,7 +242,7 @@ def find_bag_in_store(self, version): if len(foraip) == 0: return None - return bagutils.find_latest_head_bag(foraip) + return os.path.join(self.storedir, bagutils.find_latest_head_bag(foraip)) def aip_exists(self): """ diff --git a/python/nistoar/pdr/preserv/service/siphandler.py b/python/nistoar/pdr/preserv/service/siphandler.py index ff2a0d963..e016721a3 100644 --- a/python/nistoar/pdr/preserv/service/siphandler.py +++ b/python/nistoar/pdr/preserv/service/siphandler.py @@ -363,10 +363,10 @@ def __init__(self, sipid, config, minter=None, serializer=None, "directory: " + self.mdbagdir) bgrcfg = config.get('bagger', {}) - if 'repo_access' not in bgrcfg and 'repo_access' in config: - bgrcfg['repo_access'] = config['repo_access'] if 'store_dir' not in bgrcfg and 'store_dir' in config: bgrcfg['store_dir'] = config['store_dir'] + if 'repo_access' not in bgrcfg and 'repo_access' in config: + bgrcfg['repo_access'] = config['repo_access'] if 'store_dir' not in bgrcfg['repo_access']: bgrcfg['repo_access']['store_dir'] = config['store_dir'] diff --git a/python/tests/nistoar/pdr/describe/data/pdr2210.json b/python/tests/nistoar/pdr/describe/data/pdr2210.json new file mode 100644 index 000000000..cfab40278 --- /dev/null +++ b/python/tests/nistoar/pdr/describe/data/pdr2210.json @@ -0,0 +1,156 @@ +{ + "@context": [ + "https://data.nist.gov/od/dm/nerdm-pub-context.jsonld", + { + "@base": "ark:/88434/edi00hw91c" + } + ], + "_schema": "https://data.nist.gov/od/dm/nerdm-schema/v0.1#", + "_extensionSchemas": [ + "https://data.nist.gov/od/dm/nerdm-schema/pub/v0.1#/definitions/PublicDataResource" + ], + "@type": [ + "nrdp:PublicDataResource" + ], + "@id": "ark:/88434/edi00hw91c", + "doi": "doi:10.18434/T4SW26", + "title": "OptSortSph: Sorting Spherical Dielectric Particles in a Standing-Wave Interference Field", + "version": "3.1.3", + "contactPoint": { + "fn": "Zachary Levine", + "hasEmail": "mailto:zachary.levine@nist.gov" + }, + "modified": "2016-07-01", + "ediid": "ark:/88434/pdr2210", + "landingPage": "https://www.nist.gov/nvl/project-index-optical-method-sorting-nanoparticles-size", + "description": [ + "Software to predict the optical sorting of particles in a standing-wave laser interference field" + ], + "keyword": [ + "optical sorting", + "laser interference field", + "nanoparticles", + "convection of fluid" + ], + "theme": [ + "Optical physics" + ], + "topic": [], + "references": [ + { + "@type": "deo:BibliographicReference", + "@id": "#ref:10.1364/OE.24.014100", + "refType": "IsReferencedBy", + "location": "https://doi.org/10.1364/OE.24.014100", + "_extensionSchemas": [ + "https://data.nist.gov/od/dm/nerdm-schema/v0.1#/definitions/DCiteDocumentReference" + ] + } + ], + "accessLevel": "public", + "license": "http://www.nist.gov/open/license.cfm", + "components": [ + { + "accessURL": "https://doi.org/10.18434/T4SW26", + "description": "Software to predict the optical sorting of particles in a standing-wave laser interference field", + "format": "Digital Object Identifier, a persistent identifier", + "mediaType": "application/zip", + "title": "DOI access for OptSortSph: Sorting Spherical Dielectric Particles in a Standing-Wave Interference Field", + "@type": [ + "nrd:Hidden", + "dcat:Distribution" + ], + "@id": "#doi:10.18434/T4SW26" + }, + { + "description": "First trial of experiment", + "filepath": "trial1.json", + "checksum": { + "hash": "d155d99281ace123351a311084cd8e34edda6a9afcddd76eb039bad479595ec9", + "algorithm": { + "tag": "sha256", + "@type": "Thing" + } + }, + "title": "JSON version of the Mathematica notebook", + "mediaType": "application/json", + "downloadURL": "https://data.nist.gov/od/ds/pdr2210/trial1.json", + "size": 69, + "@id": "cmps/trial1.json", + "@type": [ + "nrdp:DataFile", + "dcat:Distribution" + ], + "_extensionSchemas": [ + "https://data.nist.gov/od/dm/nerdm-schema/pub/v0.1#/definitions/DataFile" + ] + }, + { + "description": "Second trial of experiment", + "filepath": "trial2.json", + "checksum": { + "hash": "d5eed5092f409bce7e88d057eb98b376534b372f9f6b7c14e57744b259c65d35", + "algorithm": { + "tag": "sha256", + "@type": "Thing" + } + }, + "title": "JSON version of the Mathematica notebook", + "mediaType": "application/json", + "downloadURL": "https://data.nist.gov/od/ds/pdr2210/trial2.json", + "size": 69, + "@id": "cmps/trial2.json", + "@type": [ + "nrdp:DataFile", + "dcat:Distribution" + ], + "_extensionSchemas": [ + "https://data.nist.gov/od/dm/nerdm-schema/pub/v0.1#/definitions/DataFile" + ] + }, + { + "filepath": "trial3", + "@id": "cmps/trial3", + "@type": [ + "nrdp:Subcollection" + ], + "_extensionSchemas": [ + "https://data.nist.gov/od/dm/nerdm-schema/pub/v0.1#/definitions/Subcollection" + ] + }, + { + "filepath": "trial3/trial3a.json", + "checksum": { + "hash": "7b58010c841b7748a48a7ac6366258d5b5a8d23d756951b6059c0e80daad516b", + "algorithm": { + "tag": "sha256", + "@type": "Thing" + } + }, + "mediaType": "application/json", + "downloadURL": "https://data.nist.gov/od/ds/pdr2210/trial3/trial3a.json", + "size": 70, + "@id": "cmps/trial3/trial3a.json", + "@type": [ + "nrdp:DataFile", + "dcat:Distribution" + ], + "_extensionSchemas": [ + "https://data.nist.gov/od/dm/nerdm-schema/pub/v0.1#/definitions/DataFile" + ] + } + ], + "publisher": { + "@type": "org:Organization", + "name": "National Institute of Standards and Technology" + }, + "language": [ + "en" + ], + "bureauCode": [ + "006:55" + ], + "programCode": [ + "006:045" + ] +} diff --git a/python/tests/nistoar/pdr/distrib/data/pdr2210.1_0.mbag0_3-1.zip b/python/tests/nistoar/pdr/distrib/data/pdr2210.1_0.mbag0_3-1.zip index ce4fcd03f2685d21a136da65672f28e614b78789..e445f041517c12c79f45df4961d81d31cdf7dfa9 100644 GIT binary patch delta 1290 zcmZ4P(;GU$mU*JO*F=X$%oekOl*nXD#_Le#W_2bH4rU7-_Q}^7H6|~R;NS@j;bdU` zGs`O$tX69B2`&$~TJAU|7<&${G_xR3pZ~{8tFKIAW?)d{Vqnmm+$gFv`IP`aJIDYa zo+Kz&|IA``h=2DH%`Al#;;(g?ws3IGl8N+DnWUg;eJ|nvUgcLwp3>@Po&P9B+RQmU zbEa|pdX9fh(#w`K>+793VgIlDf0V`YT(RSGtfV{>W4DC2M|_+SzLDkd>VWIJH|&Q>tOWFfS@DD~I`}}@+t8qYSA3qm&bsOr;dNi86F+JBT(Xl}@_}>htl3AE>SdkuZr9|$ zlsl$dZXi6+JkDb-z5Fq zQ-7GW%v@aT)jUMJ>F(F>?uq@&KT2ACSp4l#m1O=s*Y4CQOHP}3 zKC3u#x7_bq*@jd1=R7+QWO&Q;U}Wc%jwv^f|37~5`_bHnzjKQGSn|GxSwv`tg)7{h zS-+sOzh8XIY$Ki4qc7R)Pu$ABXOP*H_~h`Ev+X{a6AIfqBjVcr+sUGFyq;q zplJOwn$e2lZf~;Uw`LZ{&3Kx>_AGbp8|mvwr!K3Sad4W2ZZoXbI_hkFR@m~(EU`Z? z{U>bPy2YdP?ShvQ{z9gAglZ;LC$xqvy)}PXH1G6F`|`{$c}P3@JuL{AUhp;gX!;J7 zo#*2A%}FY(UHFVU^gf&VX}Q|36<_5i&lK^6WMELf7MLt7>Hy2nS)w6qlh+GKFls^b zFc)tfxG1RD;1PRbme(f1+02{wNxx+Q7Z&RjoIz#8y)KLSlK`pNdnfCxnjB$>Z zIk-bZI2oAlZ19M^w*jg%fC<9ftje9h1Y^(Qm1P!`nab+#_Ve9^PG$xMZY~A}&B+g> zb=bfrR80QJD>hj`kZ-fOOfB=|B_iGvrwLCC;F`=M3OBD%G>mQY7Wuagq4i}2qb_o L6zHEl(jXoHwb^On diff --git a/python/tests/nistoar/pdr/distrib/data/pdr2210.2.mbag0_3-2.zip b/python/tests/nistoar/pdr/distrib/data/pdr2210.2.mbag0_3-2.zip index a600f0b2d8b17fd35945ac22ae1c0881a35ed392..b71196649d5088484070f94e0c0c54193bda869f 100644 GIT binary patch delta 1132 zcmZ4D+Zr~(nz>_^*JJ}xsfiIsm^EgDnUgIUZJ}(D$qh_wlP5D?gYh>vFu8FsbL+5A zKFMXwzHF9P?0T?%t;zFv)r1Z8vWoNb^g}~98JO>E@Q8&N2sUJM19u1$%oLERn?Ldj zGV=mW^1u4RzhELW1B3izenE(tKyx>H3c4`Xvm5Xp?p>Ywf9*@|Q+L)1ZF;%7*qmi1 z`#;}E$FrS#+M6dByqdjW!M9jf*P9DwEf0R_%-!e3zwagY)k)uORIU*bpUH6kj!|Sx z`^nV?kzYP4H2Qm7y`wX|drAA0+-=)U+6xcM_8%|KKH)5xz1U7Map44+eJx(Q+m3Bu z3fGXkHSKbr&0Vm&;HuB%Yl-1yot}UVy1P6xzF&t^qBcg8*@;bruoJ6 zRNb_T3G=Sz1Z|7e*<}%V@D0!UEeX8){wFNx&a!$~Ec%K~sJ?r_C3B|b0Z-drIc?be z@1Mq-k2-eF>~$C7V%9%;yE|iN*QBFa6IF_B9=)?xm$r_0n(tSfuu|gftOskHy2}Z^39h# z?N88oBCg+e@c(67OPPoJeEti_T%Hq~^}5w^Y4(#t9kp*)o8_M4-u!3n(#nPP{k4yy zd@6SxVN*P~%xk;a@*tZnmu)6Gug^R<{~wFwZIAAd>wL5NKB@#B);*}clReBMbk!e| zyIDJ?eA;!_Lphhla_ZsQ+=grUqVpe4dSX@kv#9In*Vb9#wbFCeRDHSn_-cKCH!}+t z0|-niWc3F#po9v848tT*U1*ZcKu)@ow~9tEu9m^Q<>nN0oSaD^wuc_#C!IWTRJ01Fu?b571vQ=0ri ziEHv3WdQ*V1_^M`FapUfjm$D|ML_Rx0KLP?z+ACmvZI{vO`gnn4anSF!{o&= zd77%gWP2vb$?v&%+3sxch*jPU(aAAcmfMX<7tEVn$HcL@fjfkW$pyjx$ScUqD>IeV z-|gqS3!Tgi4BV6X1&!G&fM)P-_Sozx=)$;prffLVIZ4!x zX${<*$$Ld382Kk(5<6fLM3saZ#X9J=VhHdLBoZ~K@>&k ymqsAdYIGs~$qkx5Ozq+*D$k4aP115;Tr=5G%SnWljR6QGfp9Vx1H*4=5Dx&GSa+`g diff --git a/python/tests/nistoar/pdr/distrib/data/pdr2210.3_1_3.mbag0_3-5.zip b/python/tests/nistoar/pdr/distrib/data/pdr2210.3_1_3.mbag0_3-5.zip index c925e2c5073cd34d7a7bf8f12fc4414e67b0c22f..32326a5e48b4e6a1c76df0f6f39adb1dad13e77a 100644 GIT binary patch delta 1440 zcmbR5+Z{H+o>_91*W?FWQWIm&Fw4&NntXs)YO*WiRTz7-Ia3A)^D}Ms$@dwxCNJRS z;0X=kWMGb;#3s9v zt}|h|s7Q&{!pm37`X@MW?EJRGP-R=HwQlT||9^evcu&@O`pF@F(y{r^Kb<{eeqW#C zUsLw7CC&MI9w*)ZAFV$-_p(`UvY%}4k_~#dR@SbmICDCp>0#=M)w%UI;(GT*sB+8M zynSE){=SrabQYs5%at16$Kv{8)4L^hcue~9q3BqX+@A;c?M%y>&QE(9aOu~*^@lSz ze6;IK^KUxGaE$E|m#X={EB`Y4)&}})HIykn(IUd9d+W!tTVhSRA3y0vgi5j<)cBKK ze#PLM2JiKk`tLdG=B*MtJavWPk9v=Ei5c#9b(7ugDd}AGx(c-y=#xr9FS)rFF}UbPhi4YTx$IVoHeC-9V|RLjKy12aJ@rWJgGE zIqsJF&Vg58d54*VLGh8N4pIC|OnpP#zU+wDrhTn`p1aZ4ykER*IZ>xoATuY!|ufXDT${_zeEOhdYa5gx12exRJ!|If0%*wL2ccL-uHGA z8H=U-e};74T6jch#h0lEJVUIVOZ9H#me&h>E0CMNfZ0B0dU$umU$t5Ki&D1DP(3Nq zzhn2clc&Cg{j6X8SR_Kj?Y5dl=0u^1M!SEM%lwVGS9N24=5von)7G~(FHH*Nyfl|X z>gSIVx4oK&gCgeSeP~{@_tn?QY7%bu{BM?=n8GHxeMU#>Q_oE0WYX>Sys~!@X-k&S5f}*Ihkp<68fC*R`1LqvbAN z*n^{0+q`%CEs0W|zwhp&>V?a1)J5xD*S_?@-9YTq^y%*J|9@QfroSM4*?*3{Og}x- zuZQMbQY}h6vh7o-wE6Ryk^e3}vblVGU!m?Yo4BOC6B2@#W^Y~`bjSR%%_QgbnFsg( zW08#Z><$U%d)4<*CGfEBLG_)?(~P`BSKKY!bG1P8?W#0o-YtSpBKWpXWsKSzR@3A? zY4w+zn+?UTi)Y3COfJ~;;(|E8_;kw1 z*!)CXmT|HRxA0~W$xlp@%q$`d3=A9$GE-Uo-G08i&*Nj<7bX``6royC!O85Z4osWTh508psQNHX z7DrKiMx1Xli<$$|9to(h_GAMEuF2<>1q5^$B!E6;U|m4u8_%20uU{U+#$?Kn>{w~;AUll3dVs24EY3^7?UPD3v8GyDI+kM zS;fa0@m^e(kuhmAuOtie zCIwd3$vLV5lSNf6CbMWtPX4dLwmC;NmT7XomcZnj>Q0j%X^BoY(9mGMF2XQ5QB-qs zi-r@kr6`DPF_}@*fq4a(Z8o_|(}%f39HQo?IM3vNnhuN`C);beiLkOU0D%M$wsJ8r Iu*iUT0IR88DF6Tf diff --git a/python/tests/nistoar/pdr/preserv/bagger/test_midas_update.py b/python/tests/nistoar/pdr/preserv/bagger/test_midas_update.py index 4a953b3b8..c570c5b4e 100644 --- a/python/tests/nistoar/pdr/preserv/bagger/test_midas_update.py +++ b/python/tests/nistoar/pdr/preserv/bagger/test_midas_update.py @@ -232,6 +232,8 @@ def test_ensure_metadata_preparation_withupdate(self): try: shutil.copyfile(srczip, destzip) + shutil.copy(os.path.join(datadir, self.midasid+".json"), + mdarchive) self.bagr.ensure_metadata_preparation() @@ -287,9 +289,11 @@ def test_finalize_version(self): srczip = os.path.join(distarchive, "1491.1_0.mbag0_4-0.zip") destzip = os.path.join(distarchive, self.midasid+".1_0.mbag0_4-0.zip") cached = os.path.join(self.pubcache, os.path.basename(destzip)) + rmmrec = os.path.join(mdarchive, self.midasid+".json") try: shutil.copyfile(srczip, destzip) + shutil.copyfile(os.path.join(datadir, self.midasid+".json"), rmmrec) self.bagr.prepare(nodata=True) @@ -310,14 +314,18 @@ def test_finalize_version(self): os.remove(destzip) if os.path.exists(cached): os.remove(cached) + if os.path.exists(rmmrec): + os.remove(rmmrec) def test_make_updated_bag(self): srczip = os.path.join(distarchive, "1491.1_0.mbag0_4-0.zip") destzip = os.path.join(distarchive, self.midasid+".1_0.mbag0_4-0.zip") cached = os.path.join(self.pubcache, os.path.basename(destzip)) + rmmrec = os.path.join(mdarchive, self.midasid+".json") try: shutil.copyfile(srczip, destzip) + shutil.copyfile(os.path.join(datadir, self.midasid+".json"), rmmrec) try: self.bagr.make_bag() @@ -355,6 +363,8 @@ def test_make_updated_bag(self): os.remove(destzip) if os.path.exists(cached): os.remove(cached) + if os.path.exists(rmmrec): + os.remove(rmmrec) if __name__ == '__main__': diff --git a/python/tests/nistoar/pdr/preserv/bagger/test_prepupd.py b/python/tests/nistoar/pdr/preserv/bagger/test_prepupd.py index 34abe9410..e2495a2b0 100644 --- a/python/tests/nistoar/pdr/preserv/bagger/test_prepupd.py +++ b/python/tests/nistoar/pdr/preserv/bagger/test_prepupd.py @@ -13,6 +13,7 @@ bagsrcdir = os.path.join(os.path.dirname(os.path.dirname(__file__)), "data") mdsrcdir = os.path.join(os.path.dirname(os.path.dirname(bagsrcdir)), "describe", "data") +storedir = os.path.join(os.path.dirname(os.path.dirname(mdsrcdir)), "distrib", "data") loghdlr = None rootlog = None @@ -51,6 +52,8 @@ def fill_cache(cls, cachedir): os.path.join(cachedir, "ABCDEFG.json")) shutil.copyfile(os.path.join(mdsrcdir, "pdr02d4t.json"), os.path.join(cachedir, "pdr02d4t.json")) + shutil.copyfile(os.path.join(mdsrcdir, "pdr2210.json"), + os.path.join(cachedir, "pdr2210.json")) class SimDistClient(object): @@ -199,6 +202,7 @@ class TestUpdatePrepper(test.TestCase): def setUp(self): self.tf = Tempfiles() self.workdir = self.tf.mkdir("mdserv") + self.storedir = self.tf.mkdir("store") self.headcache = self.tf.mkdir("headcache") self.bagsdir = self.tf.mkdir("bags") self.nerddir = self.tf.mkdir("nerds") @@ -210,6 +214,7 @@ def setUp(self): self.config = { "working_dir": self.workdir, + "store_dir": self.storedir, "headbag_cache": self.headcache, "distrib_service": { "service_endpoint": "http://dummy/ds" @@ -318,21 +323,28 @@ def test_create_from_nerdm(self): self.assertTrue(not os.path.isfile(depinfof)) def test_create_new_update(self): - headbag = os.path.join(self.bagsdir, "ABCDEFG.1.mbag0_4-2.zip") + headbag = os.path.join(self.bagsdir, "ABCDEFG.2.mbag0_4-4.zip") + cached = os.path.join(self.headcache, "ABCDEFG.2.mbag0_4-4.zip") root = os.path.join(self.workdir, "ABCDEFG") self.assertTrue(not os.path.exists(root)) + self.assertTrue(not os.path.exists(cached)) self.assertTrue(self.prepr.create_new_update(root)) self.assertTrue(os.path.isdir(root)) + self.assertTrue(os.path.exists(cached)) contents = [f for f in os.listdir(root)] self.assertIn("metadata", contents) self.assertIn("data", contents) self.assertNotIn("manifest-sha256.txt", contents) self.assertNotIn("bag-info.txt", contents) + self.assertIn("multibag", contents) - # test using headbag from local cache - os.remove(headbag) # prevents retrieving headbag via dist service + # test using headbag made from nerdm record + shutil.rmtree(root) # reset + os.remove(headbag) # prevents retrieving headbag via dist service + os.remove(cached) # prevents using cached version + self.assertTrue(not os.path.isdir(root)) self.assertTrue(self.prepr.create_new_update(root)) self.assertTrue(os.path.isdir(root)) @@ -341,6 +353,7 @@ def test_create_new_update(self): self.assertIn("data", contents) self.assertNotIn("manifest-sha256.txt", contents) self.assertNotIn("bag-info.txt", contents) + self.assertNotIn("multibag", contents) bag = NISTBag(root) mdata = bag.nerdm_record(True) @@ -357,7 +370,51 @@ def test_no_create_new_update(self): self.assertFalse(self.prepr.create_new_update(root)) self.assertTrue(not os.path.isdir(root)) - + def test_find_bag_in_store(self): + sf12_7 = os.path.join(self.storedir, "ABCDEFG.12_7.mbag0_3-2.zip") + + # The way we will test if the file was retreive from the + # store dir is by making that copy an empty file + with open(sf12_7,'w') as fd: + pass + + sf12_8 = os.path.join(self.storedir, "ABCDEFG.12_8.mbag0_3-4.zip") + with open(sf12_8,'w') as fd: + pass + sf12_8 = os.path.join(self.storedir, "ABCDEFG.12_8.mbag0_3-5.zip") + with open(sf12_8,'w') as fd: + pass + sf0 = os.path.join(self.storedir, "ABCDEFG.mbag0_3-5.zip") + with open(sf0,'w') as fd: + pass + + self.assertEqual(self.prepr.find_bag_in_store("12.7"), sf12_7) + self.assertEqual(self.prepr.find_bag_in_store("12.8"), sf12_8) + self.assertEqual(self.prepr.find_bag_in_store("0"), sf0) + + def test_create_new_update_fromstore(self): + shutil.copy(os.path.join(storedir, "pdr2210.3_1_3.mbag0_3-5.zip"), + self.storedir) + cachedbag = os.path.join(self.headcache, "pdr2210.3_1_3.mbag0_3-5.zip") + + self.prepr = self.prepsvc.prepper_for("pdr2210") + self.prepr.mdcli = self.mdcli + self.prepr.cacher.distsvc = self.distcli + + root = os.path.join(self.workdir, "pdr2210") + self.assertTrue(not os.path.exists(cachedbag)) + self.assertTrue(not os.path.exists(root)) + + self.assertTrue(self.prepr.create_new_update(root)) + self.assertTrue(os.path.isdir(root)) + + # these demonstrates that it came from the stored version + self.assertTrue(os.path.isdir(os.path.join(root,"multibag"))) + self.assertTrue(not os.path.isfile(cachedbag)) + + bag = NISTBag(root) + mdata = bag.nerdm_record(True) + self.assertEquals(mdata['version'], "3.1.3+ (in edit)") diff --git a/python/tests/nistoar/pdr/preserv/bagger/test_utils.py b/python/tests/nistoar/pdr/preserv/bagger/test_utils.py index 5d2e304e1..82656f376 100644 --- a/python/tests/nistoar/pdr/preserv/bagger/test_utils.py +++ b/python/tests/nistoar/pdr/preserv/bagger/test_utils.py @@ -408,6 +408,32 @@ def test_select_version(self): self.assertEqual(bagut.select_version(names, "1"), ["mds3812.mbag0_4-4.tgz"]) + def test_select_version_newpat(self): + names = [ + "mds2-3812.1_3.mbag0_4-10.tgz", + "mds2-3812.1_3.mbag0_2-8.tgz", + "mds2-3812.2_0.mbag0_4-18.tgz", + "mds2-3812.1_3.mbag0_4-14.7z", + "mds2-3812.mbag0_4-4.tgz", + "mds2-3812.1_3.mbag1_0-16.tgz", + "mds2-3812.1_3.mbag0_4-14.zip", + "mds2-3812.1_3.mbag0_4-14.tgz", + "mds2-3812.1_4.mbag0_4-20.tgz" + ]; + + self.assertEqual(bagut.select_version(names, "2.0"), + ["mds2-3812.2_0.mbag0_4-18.tgz"]) + self.assertEqual(bagut.select_version(names, "1.4"), + ["mds2-3812.1_4.mbag0_4-20.tgz"]) + self.assertEqual(bagut.select_version(names, "1.3"), + names[0:2]+names[3:4]+names[5:8]) + self.assertEqual(bagut.select_version(names, ""), + ["mds2-3812.mbag0_4-4.tgz"]) + self.assertEqual(bagut.select_version(names, "0"), + ["mds2-3812.mbag0_4-4.tgz"]) + self.assertEqual(bagut.select_version(names, "1"), + ["mds2-3812.mbag0_4-4.tgz"]) + def test_schuripat(self): self.assertTrue( bagut._nrdpat.match("https://data.nist.gov/od/dm/nerdm-schema/pub/v1.0#Res") ) diff --git a/python/tests/nistoar/pdr/preserv/service/test_siphandler_multibag.py b/python/tests/nistoar/pdr/preserv/service/test_siphandler_multibag.py index 5d319240b..0726c086f 100644 --- a/python/tests/nistoar/pdr/preserv/service/test_siphandler_multibag.py +++ b/python/tests/nistoar/pdr/preserv/service/test_siphandler_multibag.py @@ -222,9 +222,12 @@ def test_small_revision(self): srczip = os.path.join(distarchdir, "1491.1_0.mbag0_4-0.zip") destzip = os.path.join(distarchive, self.midasid+".1_0.mbag0_4-0.zip") cached = os.path.join(self.pubcache, os.path.basename(destzip)) + rmmrec = os.path.join(mdarchive, self.midasid+".json") try: shutil.copyfile(srczip, destzip) + shutil.copy(os.path.join(datadir, self.midasid+".json"), + mdarchive) try: self.sip.bagit() @@ -276,6 +279,8 @@ def test_small_revision(self): os.remove(destzip) if os.path.exists(cached): os.remove(cached) + if os.path.exists(rmmrec): + os.remove(rmmrec) @@ -290,9 +295,11 @@ def test_metadata_revision(self): srczip = os.path.join(distarchdir, "1491.1_0.mbag0_4-0.zip") destzip = os.path.join(distarchive, self.midasid+".1_0.mbag0_4-0.zip") cached = os.path.join(self.pubcache, os.path.basename(destzip)) + rmmrec = os.path.join(mdarchive, self.midasid+".json") try: shutil.copyfile(srczip, destzip) + shutil.copyfile(os.path.join(datadir, self.midasid+".json"), rmmrec) try: self.sip.bagit() @@ -340,6 +347,8 @@ def test_metadata_revision(self): os.remove(destzip) if os.path.exists(cached): os.remove(cached) + if os.path.exists(rmmrec): + os.remove(rmmrec) From c1c4b240974ad7d02c733e34995f0c070d3d9a96 Mon Sep 17 00:00:00 2001 From: Ray Plante Date: Wed, 7 Aug 2019 08:17:14 -0400 Subject: [PATCH 11/13] preservation bug fix: pass in store_dir into mdserv --- python/nistoar/pdr/preserv/service/siphandler.py | 4 ++-- python/nistoar/pdr/publish/mdserv/serv.py | 4 ++++ python/tests/nistoar/pdr/describe/test_sim_describe_svc.py | 3 ++- .../tests/nistoar/pdr/publish/mdserv/test_serv_update.py | 7 +++++++ 4 files changed, 15 insertions(+), 3 deletions(-) diff --git a/python/nistoar/pdr/preserv/service/siphandler.py b/python/nistoar/pdr/preserv/service/siphandler.py index e016721a3..0f4154b73 100644 --- a/python/nistoar/pdr/preserv/service/siphandler.py +++ b/python/nistoar/pdr/preserv/service/siphandler.py @@ -367,8 +367,8 @@ def __init__(self, sipid, config, minter=None, serializer=None, bgrcfg['store_dir'] = config['store_dir'] if 'repo_access' not in bgrcfg and 'repo_access' in config: bgrcfg['repo_access'] = config['repo_access'] - if 'store_dir' not in bgrcfg['repo_access']: - bgrcfg['repo_access']['store_dir'] = config['store_dir'] + if 'store_dir' not in bgrcfg['repo_access'] and 'store_dir' in bgrcfg: + bgrcfg['repo_access']['store_dir'] = bgrcfg['store_dir'] self.bagger = PreservationBagger(sipid, bagparent, self.sipparent, self.mdbagdir, bgrcfg, self._minter, diff --git a/python/nistoar/pdr/publish/mdserv/serv.py b/python/nistoar/pdr/publish/mdserv/serv.py index 7ea574ba4..c913197ba 100644 --- a/python/nistoar/pdr/publish/mdserv/serv.py +++ b/python/nistoar/pdr/publish/mdserv/serv.py @@ -162,8 +162,12 @@ def open_bagger(self, id): metadata bag. """ cfg = self.cfg.get('bagger', {}) + if 'store_dir' not in cfg and 'store_dir' in self.cfg: + cfg['store_dir'] = self.cfg['store_dir'] if 'repo_access' not in cfg and 'repo_access' in self.cfg: cfg['repo_access'] = self.cfg['repo_access'] + if 'store_dir' not in cfg['repo_access'] and 'store_dir' in cfg: + cfg['repo_access']['store_dir'] = cfg['store_dir'] if not os.path.exists(self.workdir): os.mkdir(workdir) elif not os.path.isdir(self.workdir): diff --git a/python/tests/nistoar/pdr/describe/test_sim_describe_svc.py b/python/tests/nistoar/pdr/describe/test_sim_describe_svc.py index c81decc41..9d6864db2 100644 --- a/python/tests/nistoar/pdr/describe/test_sim_describe_svc.py +++ b/python/tests/nistoar/pdr/describe/test_sim_describe_svc.py @@ -67,7 +67,8 @@ def setUp(self): def test_ctor(self): self.assertEqual(self.arch.dir, datadir) - self.assertEqual(self.arch.lu, {"ABCDEFG": "pdr02d4t"}) + self.assertEqual(self.arch.lu, {"ABCDEFG": "pdr02d4t", + "ark:/88434/pdr2210": "pdr2210"}) def test_ediid_to_id(self): self.assertEqual(self.arch.ediid_to_id("ABCDEFG"), "pdr02d4t") diff --git a/python/tests/nistoar/pdr/publish/mdserv/test_serv_update.py b/python/tests/nistoar/pdr/publish/mdserv/test_serv_update.py index 1b3aa1a70..541d6b5bb 100644 --- a/python/tests/nistoar/pdr/publish/mdserv/test_serv_update.py +++ b/python/tests/nistoar/pdr/publish/mdserv/test_serv_update.py @@ -150,6 +150,7 @@ def setUp(self): 'working_dir': self.workdir, 'review_dir': self.revdir, 'upload_dir': self.upldir, + 'store_dir': distarchdir, 'id_registry_dir': self.workdir, 'repo_access': { 'headbag_cache': self.pubcache, @@ -224,9 +225,12 @@ def test_resolve_id_withupdate(self): srczip = os.path.join(distarchive, "1491.1_0.mbag0_4-0.zip") destzip = os.path.join(distarchive, self.midasid+".1_0.mbag0_4-0.zip") cached = os.path.join(self.pubcache, os.path.basename(destzip)) + rmmrec = os.path.join(mdarchive, self.midasid+".json") try: shutil.copyfile(srczip, destzip) + shutil.copy(os.path.join(datadir, self.midasid+".json"), + mdarchive) data = self.srv.resolve_id(self.midasid) self.assertIn("ediid", data) @@ -239,6 +243,9 @@ def test_resolve_id_withupdate(self): os.remove(destzip) if os.path.exists(cached): os.remove(cached) + if os.path.exists(rmmrec): + os.remove(rmmrec) + time.sleep(0.2) # wait for metadata thread to finish From 78ac97f15b645385aa3f985b0fee798d11cc2ef1 Mon Sep 17 00:00:00 2001 From: Ray Plante Date: Wed, 7 Aug 2019 10:43:47 -0400 Subject: [PATCH 12/13] preservation bug fix: one more test to ensure use of store works from mdserver --- .../nistoar/pdr/describe/sim_describe_svc.py | 6 +- .../pdr/describe/test_sim_describe_svc.py | 3 +- .../nistoar/pdr/preserv/data/pdr2210_pod.json | 60 +++++++++++++++++++ .../pdr/publish/mdserv/test_serv_update.py | 25 ++++++++ 4 files changed, 92 insertions(+), 2 deletions(-) create mode 100644 python/tests/nistoar/pdr/preserv/data/pdr2210_pod.json diff --git a/python/tests/nistoar/pdr/describe/sim_describe_svc.py b/python/tests/nistoar/pdr/describe/sim_describe_svc.py index 19cbc44f5..42d5b938b 100644 --- a/python/tests/nistoar/pdr/describe/sim_describe_svc.py +++ b/python/tests/nistoar/pdr/describe/sim_describe_svc.py @@ -26,7 +26,11 @@ def loadlu(self): with open(os.path.join(self.dir,rec)) as fd: data = json.load(fd, object_pairs_hook=OrderedDict) if "ediid" in data: - self.lu[data["ediid"]] = rec[:-1*len(".json")] + ediid = data["ediid"] + self.lu[ediid] = rec[:-1*len(".json")] + ediid = re.sub(r'^ark:/\d+/', '', ediid) + self.lu[ediid] = rec[:-1*len(".json")] + except: pass def ediid_to_id(self, ediid): diff --git a/python/tests/nistoar/pdr/describe/test_sim_describe_svc.py b/python/tests/nistoar/pdr/describe/test_sim_describe_svc.py index 9d6864db2..da49a91f6 100644 --- a/python/tests/nistoar/pdr/describe/test_sim_describe_svc.py +++ b/python/tests/nistoar/pdr/describe/test_sim_describe_svc.py @@ -68,7 +68,8 @@ def setUp(self): def test_ctor(self): self.assertEqual(self.arch.dir, datadir) self.assertEqual(self.arch.lu, {"ABCDEFG": "pdr02d4t", - "ark:/88434/pdr2210": "pdr2210"}) + "ark:/88434/pdr2210": "pdr2210", + "pdr2210": "pdr2210"}) def test_ediid_to_id(self): self.assertEqual(self.arch.ediid_to_id("ABCDEFG"), "pdr02d4t") diff --git a/python/tests/nistoar/pdr/preserv/data/pdr2210_pod.json b/python/tests/nistoar/pdr/preserv/data/pdr2210_pod.json new file mode 100644 index 000000000..45ba58a32 --- /dev/null +++ b/python/tests/nistoar/pdr/preserv/data/pdr2210_pod.json @@ -0,0 +1,60 @@ +{ + "@type": "dcat:Dataset", + "accessLevel": "public", + "bureauCode": [ + "006:55" + ], + "contactPoint": { + "fn": "Zachary Levine", + "hasEmail": "mailto:zachary.levine@nist.gov" + }, + "description": "Software to predict the optical sorting of particles in a standing-wave laser interference field", + "distribution": [ + { + "description": "First trial of experiment", + "downloadURL": "https://data.nist.gov/od/ds/pdr2210/trial1.json", + "mediaType": "application/json", + "title": "JSON version of the Mathematica notebook" + }, + { + "description": "Second trial of experiment", + "downloadURL": "https://data.nist.gov/od/ds/pdr2210/trial2.json", + "mediaType": "application/json", + "title": "JSON version of the Mathematica notebook" + }, + { + "accessURL": "https://doi.org/10.18434/T4SW26", + "description": "Software to predict the optical sorting of particles in a standing-wave laser interference field", + "format": "Digital Object Identifier, a persistent identifier", + "mediaType": "application/zip", + "title": "DOI access for OptSortSph: Sorting Spherical Dielectric Particles in a Standing-Wave Interference Field" + } + ], + "identifier": "ark:/88434/pdr2210", + "keyword": [ + "optical sorting", + "laser interference field", + "nanoparticles", + "convection of fluid" + ], + "landingPage": "https://www.nist.gov/nvl/project-index-optical-method-sorting-nanoparticles-size", + "language": [ + "en" + ], + "license": "http://www.nist.gov/open/license.cfm", + "modified": "2016-07-01", + "programCode": [ + "006:045" + ], + "publisher": { + "@type": "org:Organization", + "name": "National Institute of Standards and Technology" + }, + "references": [ + "https://doi.org/10.1364/OE.24.014100" + ], + "theme": [ + "Optical physics" + ], + "title": "OptSortSph: Sorting Spherical Dielectric Particles in a Standing-Wave Interference Field" +} diff --git a/python/tests/nistoar/pdr/publish/mdserv/test_serv_update.py b/python/tests/nistoar/pdr/publish/mdserv/test_serv_update.py index 541d6b5bb..b80b9ae2a 100644 --- a/python/tests/nistoar/pdr/publish/mdserv/test_serv_update.py +++ b/python/tests/nistoar/pdr/publish/mdserv/test_serv_update.py @@ -246,6 +246,31 @@ def test_resolve_id_withupdate(self): if os.path.exists(rmmrec): os.remove(rmmrec) time.sleep(0.2) # wait for metadata thread to finish + + def test_resolve_id_usestore(self): + # test resolving an identifier for a dataset being updated (after + # an initial publication) + midasid = "pdr2210" + self.revdir = self.tf.mkdir("review") + self.config['review_dir'] = self.revdir + self.srv = serv.PrePubMetadataService(self.config) + self.bagdir = os.path.join(self.bagparent, midasid) + + os.mkdir(os.path.join(self.revdir, "pdr2210")) + shutil.copyfile(os.path.join(datadir, "pdr2210_pod.json"), + os.path.join(self.revdir,"pdr2210","_pod.json")) + + cached = os.path.join(self.pubcache, "pdr2210.3_1_3.mbag0_3-5.zip") + + data = self.srv.resolve_id(midasid) + self.assertIn("ediid", data) + self.assertEqual(data['version'], "3.1.3+ (in edit)") + + self.assertTrue(not os.path.exists(cached)) + self.assertTrue(os.path.isdir(self.bagdir)) + self.assertTrue(os.path.isdir(os.path.join(self.bagdir,"multibag"))) + + From b9f566241cc9393acccb1afab72ab34a209de1fb Mon Sep 17 00:00:00 2001 From: Ray Plante Date: Wed, 7 Aug 2019 13:29:46 -0400 Subject: [PATCH 13/13] prepupd fix: don't get mixed up by hash files in store dir --- python/nistoar/pdr/preserv/bagger/prepupd.py | 3 ++- python/tests/nistoar/pdr/preserv/bagger/test_prepupd.py | 7 +++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/python/nistoar/pdr/preserv/bagger/prepupd.py b/python/nistoar/pdr/preserv/bagger/prepupd.py index b07e562a6..a1e223b8a 100644 --- a/python/nistoar/pdr/preserv/bagger/prepupd.py +++ b/python/nistoar/pdr/preserv/bagger/prepupd.py @@ -237,7 +237,8 @@ def find_bag_in_store(self, version): return None foraip = [f for f in os.listdir(self.storedir) - if f.startswith(self.aipid+'.') ] + if f.startswith(self.aipid+'.') and + not f.endswith('.sha256') ] foraip = bagutils.select_version(foraip, version) if len(foraip) == 0: return None diff --git a/python/tests/nistoar/pdr/preserv/bagger/test_prepupd.py b/python/tests/nistoar/pdr/preserv/bagger/test_prepupd.py index e2495a2b0..89f818da1 100644 --- a/python/tests/nistoar/pdr/preserv/bagger/test_prepupd.py +++ b/python/tests/nistoar/pdr/preserv/bagger/test_prepupd.py @@ -379,6 +379,9 @@ def test_find_bag_in_store(self): pass sf12_8 = os.path.join(self.storedir, "ABCDEFG.12_8.mbag0_3-4.zip") + with open(sf12_8,'w') as fd: + pass + sf12_8 = os.path.join(self.storedir, "ABCDEFG.12_8.mbag0_3-5.zip.sha256") with open(sf12_8,'w') as fd: pass sf12_8 = os.path.join(self.storedir, "ABCDEFG.12_8.mbag0_3-5.zip") @@ -395,6 +398,10 @@ def test_find_bag_in_store(self): def test_create_new_update_fromstore(self): shutil.copy(os.path.join(storedir, "pdr2210.3_1_3.mbag0_3-5.zip"), self.storedir) + with open(os.path.join(self.storedir, + "pdr2210.3_1_3.mbag0_3-5.zip.sha256"),'w') as fd: + pass + cachedbag = os.path.join(self.headcache, "pdr2210.3_1_3.mbag0_3-5.zip") self.prepr = self.prepsvc.prepper_for("pdr2210")