Add a docstring for VariantData

This describes using `N` to mean "unknown ancestral allele". Fixes #955
tskit-dev · Sep 5, 2024 · 76a155f · 76a155f
1 parent 6514312
commit 76a155f
Showing 1 changed file with 43 additions and 0 deletions.
diff --git a/tsinfer/formats.py b/tsinfer/formats.py
@@ -31,6 +31,7 @@
 import sys
 import threading
 import warnings
+from typing import Union  # noqa: F401
 
 import attr
 import humanize
@@ -2293,6 +2294,48 @@ def populations(self):
 
 
 class VariantData(SampleData):
+    """
+    Class representing input variant data used for inference. This is
+    mostly a thin wrapper for a Zarr dataset storing information in
+    the VCF Zarr (.vcz) format, plus information specifying the ancestral allele
+    and (optional) data masks. It then provides various derived properties and
+    methods for accessing the data in a form suitable for inference.
+
+    .. note::
+        In the VariantData object, "samples" refer to the individuals in the dataset,
+        each of which can be of arbitrary ploidy. This is in contrast to ``tskit``,
+        in which each *haploid genome* is treated as a separate "sample". For example,
+        in a diploid dataset, the inferred tree sequence returned at the end of
+        the inference process will have ``inferred_ts.num_samples`` equal to double
+        the number returned by ``VariantData.num_samples``.
+
+    :param str path: The path to the file containing the input dataset in VCF-Zarr
+        format.
+    :param Union(array, str) ancestral_allele: A numpy array of strings specifying
+        the ancestral alleles used in inference. This must be the same length as
+        the number of unmasked sites in the dataset. Alternatively, a single string
+        can be provided, giving the name of an array in the input dataset which contains
+        the ancestral alleles. Unknown ancestral alleles can be specified using "N".
+        Any ancestral alleles which do not match any of the known alleles at that site
+        will be tallied, and a warning issued summarizing the unknown ancestral states.
+    :param Union(array, str) sample_mask: A numpy array of booleans specifying which
+        samples to mask out (exclude) from the dataset. Alternatively, a string
+        can be provided, giving the name of an array in the input dataset which contains
+        the sample mask. If ``None`` (default), all samples are included.
+    :param Union(array, str) site_mask: A numpy array of booleans specifying which
+        sites to mask out (exclude) from the dataset. Alternatively, a string
+        can be provided, giving the name of an array in the input dataset which contains
+        the site mask. If ``None`` (default), all sites are included.
+    :param Union(array, str) sites_time: A numpy array of floats specifying the relative
+        time of occurrence of the mutation to the derived state at each site. This must
+        be of the same length as the number of unmasked sites. Alternatively, a
+        string can be provided, giving the name of an array in the input dataset
+        which contains the site times. If ``None`` (default), the frequency of the
+        derived allele is used as a proxy for the time of occurrence: this is usually a
+        reasonable approximation to the relative order of ancestors used for inference.
+        Time values are ignored for sites not used in inference, such as singletons,
+        sites with more than two alleles, or sites with an unknown ancestral allele.
+    """
 
     FORMAT_NAME = "tsinfer-variant-data"
     FORMAT_VERSION = (0, 1)