From 04ec5e8677e2db48dc5a08703f907e60c52c39ce Mon Sep 17 00:00:00 2001 From: Tom White Date: Mon, 8 Jul 2024 11:43:22 +0100 Subject: [PATCH 1/3] Handle case where GT is defined in header, but not used in FORMAT --- bio2zarr/vcf2zarr/icf.py | 6 +++++- bio2zarr/vcf2zarr/vcz.py | 10 +++++++--- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/bio2zarr/vcf2zarr/icf.py b/bio2zarr/vcf2zarr/icf.py index e0e3ea6..0559c34 100644 --- a/bio2zarr/vcf2zarr/icf.py +++ b/bio2zarr/vcf2zarr/icf.py @@ -1079,7 +1079,11 @@ def process_partition(self, partition_index): for field in info_fields: tcw.append(field.full_name, variant.INFO.get(field.name, None)) if has_gt: - tcw.append("FORMAT/GT", variant.genotype.array()) + if variant.genotype is None: + val = None + else: + val = variant.genotype.array() + tcw.append("FORMAT/GT", val) for field in format_fields: val = variant.format(field.name) tcw.append(field.full_name, val) diff --git a/bio2zarr/vcf2zarr/vcz.py b/bio2zarr/vcf2zarr/vcz.py index f0cd1e5..bd75320 100644 --- a/bio2zarr/vcf2zarr/vcz.py +++ b/bio2zarr/vcf2zarr/vcz.py @@ -321,7 +321,7 @@ def fixed_field_spec( array_specs.append(spec_from_field(field)) if gt_field is not None: - ploidy = gt_field.summary.max_number - 1 + ploidy = max(gt_field.summary.max_number - 1, 1) shape = [m, n] chunks = [variants_chunk_size, samples_chunk_size] dimensions = ["variants", "samples"] @@ -728,9 +728,13 @@ def encode_genotypes_partition(self, partition_index): source_field = self.icf.fields["FORMAT/GT"] for value in source_field.iter_values(partition.start, partition.stop): j = gt.next_buffer_row() - icf.sanitise_value_int_2d(gt.buff, j, value[:, :-1]) + icf.sanitise_value_int_2d( + gt.buff, j, value[:, :-1] if value is not None else None + ) j = gt_phased.next_buffer_row() - icf.sanitise_value_int_1d(gt_phased.buff, j, value[:, -1]) + icf.sanitise_value_int_1d( + gt_phased.buff, j, value[:, -1] if value is not None else None + ) # TODO check is this the correct semantics when we are padding # with mixed ploidies? j = gt_mask.next_buffer_row() From 431eddf8b5b00e239c5c9bdf7727e1322476ec2c Mon Sep 17 00:00:00 2001 From: Tom White Date: Mon, 8 Jul 2024 14:30:42 +0100 Subject: [PATCH 2/3] Add test for GT in header, but not FORMAT --- bio2zarr/vcf2zarr/verification.py | 2 +- .../sample_no_genotypes_with_gt_header.vcf.gz | Bin 0 -> 1032 bytes ...sample_no_genotypes_with_gt_header.vcf.gz.csi | Bin 0 -> 168 bytes tests/test_vcf_examples.py | 1 + 4 files changed, 2 insertions(+), 1 deletion(-) create mode 100644 tests/data/vcf/sample_no_genotypes_with_gt_header.vcf.gz create mode 100644 tests/data/vcf/sample_no_genotypes_with_gt_header.vcf.gz.csi diff --git a/bio2zarr/vcf2zarr/verification.py b/bio2zarr/vcf2zarr/verification.py index a89cf8f..27e86fe 100644 --- a/bio2zarr/vcf2zarr/verification.py +++ b/bio2zarr/vcf2zarr/verification.py @@ -153,7 +153,7 @@ def verify(vcf_path, zarr_path, show_progress=False): chrom = root["contig_id"][:][root["variant_contig"][:]] vid = root["variant_id"][:] call_genotype = None - if "call_genotype" in root: + if "call_genotype" in root and root["call_genotype"].size > 0: call_genotype = iter(root["call_genotype"]) vcf = cyvcf2.VCF(vcf_path) diff --git a/tests/data/vcf/sample_no_genotypes_with_gt_header.vcf.gz b/tests/data/vcf/sample_no_genotypes_with_gt_header.vcf.gz new file mode 100644 index 0000000000000000000000000000000000000000..3029a95a01c6cc520401f2eb82aaf15706f14816 GIT binary patch literal 1032 zcmV+j1o!(NiwFb&00000{{{d;LjnNn1C3W*Pg_S2ect?x@M@u*kNuMBN>+$nBnSlD zD18KKh+D9!ang`qpP{|>or|v5^XfUrGdt((?A(Kc+xh+McDZ`ES`Rg!}b01@S4)Euh4!22MU2WajL>W!M%?G zVik@poN|C%LKRM1ILJSjoN^V;TewfeC9to;Lkq`1Kn1JtxPcc>fm5I=I&IJypag_^ zDS3lV*{6hDKWEvXLjlYsS1Z9*e}Ymd)zMr1BS}{RR_n&C{G0*H9yxJaN&xa4V%00H z_9c6i;;WUk(iiaxW2*jlt9LXLNo`!=hN371J+RcVTDo-vc7XybCr$`GdoS^fF{Vlb{ zmP1(*87N6n5~QlL8%k~|#rtfb6m^)F0k^+0^j7H@1WKj;tK=m=%P_zLOhipls48S0bH&B!T7-RKbHVmylM#BX_ul1LCVYmSDl(Za~ zbJp2^#AP}?l#U}!Vw84wxCavVJaC?&Au=~w^*8?FGv4hM_jszK^-fGy|A zx(5LD1*+j_8>9l3n}fcSN8U&SNBcn5yJDyfU-=*!?hba|0(t5U`XG0D5Q|~ppz?v3 z9O}DG!_hvFoz>E6Z3E0?nbN3jfU>6Gd>x^oF!tTE z7!1QoeU?FLkdn3f6ws1It50N6pSja9G<1x$`tqzEOfHhwIsD2!rYK z<7{}jUd_#V3_wo19 zi5nlEOy6I);}7GrH{fJJb60NSM($n|s(164s(8 zg#R^h*&7KM++%-)v-_vFPw3=})~$M3Nise4M96_#_x?`8__{Olt-nX)Sc3od(AY(Z zeD95<6x{QUNOV8?Q~d;c?l}jG6951oiwFb&00000{{{d;LjnLB00RI3000000000= CXZx7| literal 0 HcmV?d00001 diff --git a/tests/data/vcf/sample_no_genotypes_with_gt_header.vcf.gz.csi b/tests/data/vcf/sample_no_genotypes_with_gt_header.vcf.gz.csi new file mode 100644 index 0000000000000000000000000000000000000000..9270122e3548e26960a93bb19a416a8fbb491322 GIT binary patch literal 168 zcmb2|=3rp}f&Xj_PR>jW-3-Ni-%_3=CnO{=Cy6SgF(tWa@G_kWn8n<|E7>E_Bk3a% zvEb9Z$r2KJ+zwA1s}geJ6*=CTcD6FI&De0_v*7}!88Zzk-Bh-U7tZL^JKuHqOt(kc zp&Ma`Zk% Date: Tue, 9 Jul 2024 09:38:59 +0100 Subject: [PATCH 3/3] Fix du tests --- tests/test_core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_core.py b/tests/test_core.py index 7ff33b6..74943c5 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -202,8 +202,8 @@ def test_5_chunk_1(self, n, expected): # It works in CI on Linux, but it'll probably break at some point. # It's also necessary to update these numbers each time a new data # file gets added - ("tests/data", 4973751), - ("tests/data/vcf", 4961614), + ("tests/data", 4974951), + ("tests/data/vcf", 4962814), ("tests/data/vcf/sample.vcf.gz", 1089), ], )