diff --git a/bio2zarr/vcf2zarr/icf.py b/bio2zarr/vcf2zarr/icf.py index e0e3ea6..0559c34 100644 --- a/bio2zarr/vcf2zarr/icf.py +++ b/bio2zarr/vcf2zarr/icf.py @@ -1079,7 +1079,11 @@ def process_partition(self, partition_index): for field in info_fields: tcw.append(field.full_name, variant.INFO.get(field.name, None)) if has_gt: - tcw.append("FORMAT/GT", variant.genotype.array()) + if variant.genotype is None: + val = None + else: + val = variant.genotype.array() + tcw.append("FORMAT/GT", val) for field in format_fields: val = variant.format(field.name) tcw.append(field.full_name, val) diff --git a/bio2zarr/vcf2zarr/vcz.py b/bio2zarr/vcf2zarr/vcz.py index f0cd1e5..bd75320 100644 --- a/bio2zarr/vcf2zarr/vcz.py +++ b/bio2zarr/vcf2zarr/vcz.py @@ -321,7 +321,7 @@ def fixed_field_spec( array_specs.append(spec_from_field(field)) if gt_field is not None: - ploidy = gt_field.summary.max_number - 1 + ploidy = max(gt_field.summary.max_number - 1, 1) shape = [m, n] chunks = [variants_chunk_size, samples_chunk_size] dimensions = ["variants", "samples"] @@ -728,9 +728,13 @@ def encode_genotypes_partition(self, partition_index): source_field = self.icf.fields["FORMAT/GT"] for value in source_field.iter_values(partition.start, partition.stop): j = gt.next_buffer_row() - icf.sanitise_value_int_2d(gt.buff, j, value[:, :-1]) + icf.sanitise_value_int_2d( + gt.buff, j, value[:, :-1] if value is not None else None + ) j = gt_phased.next_buffer_row() - icf.sanitise_value_int_1d(gt_phased.buff, j, value[:, -1]) + icf.sanitise_value_int_1d( + gt_phased.buff, j, value[:, -1] if value is not None else None + ) # TODO check is this the correct semantics when we are padding # with mixed ploidies? j = gt_mask.next_buffer_row() diff --git a/bio2zarr/vcf2zarr/verification.py b/bio2zarr/vcf2zarr/verification.py index a89cf8f..27e86fe 100644 --- a/bio2zarr/vcf2zarr/verification.py +++ b/bio2zarr/vcf2zarr/verification.py @@ -153,7 +153,7 @@ def verify(vcf_path, zarr_path, show_progress=False): chrom = root["contig_id"][:][root["variant_contig"][:]] vid = root["variant_id"][:] call_genotype = None - if "call_genotype" in root: + if "call_genotype" in root and root["call_genotype"].size > 0: call_genotype = iter(root["call_genotype"]) vcf = cyvcf2.VCF(vcf_path) diff --git a/tests/data/vcf/sample_no_genotypes_with_gt_header.vcf.gz b/tests/data/vcf/sample_no_genotypes_with_gt_header.vcf.gz new file mode 100644 index 0000000..3029a95 Binary files /dev/null and b/tests/data/vcf/sample_no_genotypes_with_gt_header.vcf.gz differ diff --git a/tests/data/vcf/sample_no_genotypes_with_gt_header.vcf.gz.csi b/tests/data/vcf/sample_no_genotypes_with_gt_header.vcf.gz.csi new file mode 100644 index 0000000..9270122 Binary files /dev/null and b/tests/data/vcf/sample_no_genotypes_with_gt_header.vcf.gz.csi differ diff --git a/tests/test_core.py b/tests/test_core.py index 7ff33b6..74943c5 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -202,8 +202,8 @@ def test_5_chunk_1(self, n, expected): # It works in CI on Linux, but it'll probably break at some point. # It's also necessary to update these numbers each time a new data # file gets added - ("tests/data", 4973751), - ("tests/data/vcf", 4961614), + ("tests/data", 4974951), + ("tests/data/vcf", 4962814), ("tests/data/vcf/sample.vcf.gz", 1089), ], ) diff --git a/tests/test_vcf_examples.py b/tests/test_vcf_examples.py index 4e6ac35..5b0aeeb 100644 --- a/tests/test_vcf_examples.py +++ b/tests/test_vcf_examples.py @@ -834,6 +834,7 @@ def test_duplicate_paths(self, tmp_path): "sample.vcf.gz", "sample_old_tabix.vcf.gz", "sample_no_genotypes.vcf.gz", + "sample_no_genotypes_with_gt_header.vcf.gz", "1kg_2020_chrM.vcf.gz", "field_type_combos.vcf.gz", "out_of_order_contigs.vcf.gz",