diff --git a/tsinfer/formats.py b/tsinfer/formats.py index d46d4c17..16622737 100644 --- a/tsinfer/formats.py +++ b/tsinfer/formats.py @@ -31,6 +31,7 @@ import sys import threading import warnings +from typing import Union # noqa: F401 import attr import humanize @@ -2293,6 +2294,48 @@ def populations(self): class VariantData(SampleData): + """ + Class representing input variant data used for inference. This is + mostly a thin wrapper for a Zarr dataset storing information in + the VCF Zarr (.vcz) format, plus information specifying the ancestral allele + and (optional) data masks. It then provides various derived properties and + methods for accessing the data in a form suitable for inference. + + .. note:: + In the VariantData object, "samples" refer to the individuals in the dataset, + each of which can be of arbitrary ploidy. This is in contrast to ``tskit``, + in which each *haploid genome* is treated as a separate "sample". For example + in a diploid dataset, the inferred tree sequence returned at the end of + the inference process will have ``inferred_ts.num_samples`` equal to double + the number returned by ``VariantData.num_samples``. + + :param str path: The path to the file containing the input dataset in VCF-Zarr + format. + :param Union(array, str) ancestral_allele: A numpy array of strings specifying + the ancestral alleles used in inference. This must be the same length as + the number of unmasked sites in the dataset. If a single string is provided, + it is assumed to be the name of an array in the input dataset which contains + the ancestral alleles. Unknown ancestral alleles can be specified using "N". + Any ancestral alleles which do not match any of the known alleles at that site + will be tallied, and a warning issued summarizing the unknown ancestral states. + :param Union(array, str) sample_mask: A numpy array of booleans specifying which + samples to mask out (exclude) from the dataset. 
If a string is provided, + it is assumed to be the name of an array in the input dataset which contains the + sample mask. If ``None`` (default), all samples are included. + :param Union(array, str) site_mask: A numpy array of booleans specifying which + sites to mask out (exclude) from the dataset. If a string is provided, + it is assumed to be the name of an array in the input dataset which contains the + site mask. If ``None`` (default), all sites are included. + :param Union(array, str) sites_time: A numpy array of floats specifying the relative + time of occurrence of the mutation to the derived state at each site. This must + be of the same length as the number of unmasked sites. If a string is provided, + it is assumed to be the name of an array in the input dataset + which contains the site times. If ``None`` (default), the frequency of the + derived allele is used as a proxy for the time of occurrence: this is usually a + reasonable approximation to the relative order of ancestors used for inference. + Values are ignored for sites not used in inference, such as singletons, sites + with more than two alleles, or sites with an unknown ancestral allele. + """ FORMAT_NAME = "tsinfer-variant-data" FORMAT_VERSION = (0, 1)