diff --git a/docs/api/haptools.rst b/docs/api/haptools.rst index 54c94cc6..d37b5ab7 100644 --- a/docs/api/haptools.rst +++ b/docs/api/haptools.rst @@ -36,3 +36,18 @@ haptools.data.phenotypes module :undoc-members: :show-inheritance: +haptools.data.covariates module +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. automodule:: haptools.data.covariates + :members: + :undoc-members: + :show-inheritance: + +haptools.data.haplotypes module +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. automodule:: haptools.data.haplotypes + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/commands/simgenotype.md b/docs/commands/simgenotype.md index 8fb4e324..433505d8 100644 --- a/docs/commands/simgenotype.md +++ b/docs/commands/simgenotype.md @@ -1,4 +1,4 @@ -# Haptools simgenotype +# simgenotype `haptools simgenotype` takes as input a reference set of haplotypes in VCF format and a user-specified admixture model. It outputs a VCF file with simulated genotype information for admixed genotypes, as well as a breakpoints file that can be used for visualization. diff --git a/docs/commands/simgenotype.rst b/docs/commands/simgenotype.rst index 5e2ec827..9673929a 100644 --- a/docs/commands/simgenotype.rst +++ b/docs/commands/simgenotype.rst @@ -1,4 +1,4 @@ -.. _subcommands-simgenotype: +.. _commands-simgenotype: .. include:: simgenotype.md :parser: myst_parser.sphinx_ \ No newline at end of file diff --git a/docs/commands/simphenotype.md b/docs/commands/simphenotype.md index a21b86fe..629a95be 100644 --- a/docs/commands/simphenotype.md +++ b/docs/commands/simphenotype.md @@ -1,4 +1,4 @@ -# Haptools simphenotype +# simphenotype Haptools simphenotype simulates a complex trait, taking into account haplotype- or local-ancestry- specific effects as well as traditional variant-level effects. It takes causal effects and genotypes as input and outputs simulated phenotypes. @@ -19,9 +19,9 @@ haptools simphenotype \ Required parameters: -* `--vcf `: A bgzipped, tabix-indexed, phased VCF file. If you are simulating local-ancestry effects, the VCF file must contain the `FORMAT/LA` tag included in output of `haptools simgenotype`. See [haptools file formats](../../docs/project_info/haptools_file_formats.rst) for more details. +* `--vcf `: A bgzipped, tabix-indexed, phased VCF file. If you are simulating local-ancestry effects, the VCF file must contain the `FORMAT/LA` tag included in output of `haptools simgenotype`. See [haptools file formats](../../docs/formats/inputs.rst) for more details. -* `--hap `: A bgzipped, tabix-indexed HAP file, which specifies causal effects. This is a custom format described in more detail in [haptools file formats](../../docs/project_info/haptools_file_formats.rst). The HAP format enables flexible specification of a range of effect types including traditional variant-level effects, haplotype-level effects, associations with repeat lengths at short tandem repeats, and interaction of these effects with local ancestry labels. See [Examples](#examples) below for detailed examples of how to specify effects. +* `--hap `: A bgzipped, tabix-indexed HAP file, which specifies causal effects. This is a custom format described in more detail in [haptools file formats](../../docs/formats/haplotypes.rst). The HAP format enables flexible specification of a range of effect types including traditional variant-level effects, haplotype-level effects, associations with repeat lengths at short tandem repeats, and interaction of these effects with local ancestry labels. See [Examples](#examples) below for detailed examples of how to specify effects. * `--out `: Prefix to name output files. diff --git a/docs/commands/simphenotype.rst b/docs/commands/simphenotype.rst index 09b1c98b..5c2afcfe 100644 --- a/docs/commands/simphenotype.rst +++ b/docs/commands/simphenotype.rst @@ -1,4 +1,4 @@ -.. _subcommands-simphenotype: +.. _commands-simphenotype: .. include:: simphenotype.md :parser: myst_parser.sphinx_ \ No newline at end of file diff --git a/docs/commands/transform.rst b/docs/commands/transform.rst new file mode 100644 index 00000000..e6f82493 --- /dev/null +++ b/docs/commands/transform.rst @@ -0,0 +1,35 @@ +.. _commands-transform: + + +transform +========= + +Transform a set of genotypes via a list of haplotypes. Create a new VCF containing haplotypes instead of variants. + +The ``transform`` command takes as input a set of genotypes in VCF and a list of haplotypes (specified as a :doc:`.hap file `) and outputs a set of haplotype "genotypes" in VCF. + +Usage +~~~~~ +.. code-block:: bash + + haptools transform \ + --region TEXT \ + --sample SAMPLE \ + --samples-file FILENAME \ + --output PATH \ + --verbosity [CRITICAL|ERROR|WARNING|INFO|DEBUG|NOTSET] \ + GENOTYPES HAPLOTYPES + +Examples +~~~~~~~~ +.. code-block:: bash + + haptools transform tests/data/example.vcf.gz tests/data/example.hap.gz | less + +Detailed Usage +~~~~~~~~~~~~~~ + +.. click:: haptools.__main__:main + :prog: haptools + :show-nested: + :commands: transform diff --git a/docs/formats/haplotypes.rst b/docs/formats/haplotypes.rst new file mode 100644 index 00000000..a6707d1e --- /dev/null +++ b/docs/formats/haplotypes.rst @@ -0,0 +1,168 @@ +.. _formats-haplotypes: + + +.hap +==== + +This document describes our custom file format specification for haplotypes: the ``.hap`` file. + +This is a tab-separated file composed of different types of lines. The first field of each line is a single, uppercase character denoting the type of line. The following line types are supported. + +.. list-table:: + :widths: 25 25 + :header-rows: 1 + + * - Type + - Description + * - # + - Comment + * - H + - Haplotype + * - V + - Variant + +Each line type (besides #) has a set of mandatory fields described below. Additional "extra" fields can be appended to these to customize the file. + +``#`` Comment line +~~~~~~~~~~~~~~~~~~ +Comment lines begin with ``#`` and are ignored. Consecutive comment lines that appear at the beginning of the file are treated as part of the header. + +Extra fields must be declared in the header. The declaration must be a tab-separated line containing the following fields: + +1. Line type (ex: ``H`` or ``V``) +2. Name +3. Python format string (ex: 'd' for int, 's' for string, or '.3f' for a float with 3 decimals) +4. Description + +Note that the first field must follow the ``#`` symbol immediately (ex: ``#H`` or ``#V``). + + +``H`` Haplotype +~~~~~~~~~~~~~~~ +Haplotypes contain the following attributes: + +.. list-table:: + :widths: 25 25 25 50 + :header-rows: 1 + + * - Column + - Field + - Type + - Description + * - 1 + - Chromosome + - string + - The contig that this haplotype belongs on + * - 2 + - Start Position + - int + - The start position of this haplotype on this contig + * - 3 + - End Position + - int + - The end position of this haplotype on this contig + * - 4 + - Haplotype ID + - string + - Uniquely identifies a haplotype + +``V`` Variant +~~~~~~~~~~~~~ +Each variant line belongs to a particular haplotype. These lines contain the following attributes: + +.. list-table:: + :widths: 25 25 25 50 + :header-rows: 1 + + * - Column + - Field + - Type + - Description + * - 1 + - Haplotype ID + - string + - Identifies the haplotype to which this variant belongs + * - 2 + - Start Position + - int + - The start position of this variant on its contig + * - 3 + - End Position + - int + - The end position of this variant on its contig + + Usually the same as the Start Position + * - 4 + - Variant ID + - string + - The unique ID for this variant, as defined in the genotypes file + * - 5 + - Allele + - string + - The allele of this variant within the haplotype + +Examples +~~~~~~~~ +You can find an example of a ``.hap`` file without any extra fields in `tests/data/basic.hap `_: + +.. include:: ../../tests/data/basic.hap + :literal: + +You can find an example with extra fields added within `tests/data/simphenotype.hap `_: + +.. include:: ../../tests/data/simphenotype.hap + :literal: + + +Compressing and indexing +~~~~~~~~~~~~~~~~~~~~~~~~ +We encourage you to bgzip compress and/or index your ``.hap`` file whenever possible. This will reduce both disk usage and the time required to parse the file. + +.. code-block:: bash + + sort -k1,4 -o file.hap file.hap + bgzip file.hap + tabix -s 2 -b 3 -e 4 file.hap.gz + +In order to properly index the file, the IDs in the haplotype lines must be different from their chromosomes. In addition, you must sort on the first field (ie the line type symbol) in addition to the latter three. + +Extra fields +~~~~~~~~~~~~ +Additional fields can be appended to the ends of the haplotype and variant lines as long as they are declared in the header. + +haptools extras +--------------- +The following extra fields should be declared for your ``.hap`` file to be compatible with ``simphenotype``. + +.. code-block:: + + #H ancestry s Local ancestry + #H beta .2f Effect size in linear model + +.. + _TODO: figure out how to tab this code block so that the tabs get copied when someone copies from it + + +``H`` Haplotype ++++++++++++++++ + +.. list-table:: + :widths: 25 25 25 50 + :header-rows: 1 + + * - Column + - Field + - Type + - Description + * - 5 + - Local Ancestry + - string + - A population code denoting this haplotype's ancestral origins + * - 6 + - Effect Size + - float + - The effect size of this haplotype; for use in ``simphenotype`` + +``V`` Variant ++++++++++++++ +No extra fields are required here. diff --git a/docs/executing/inputs.rst b/docs/formats/inputs.rst similarity index 97% rename from docs/executing/inputs.rst rename to docs/formats/inputs.rst index 0b0e7452..14f89d9b 100644 --- a/docs/executing/inputs.rst +++ b/docs/formats/inputs.rst @@ -1,4 +1,4 @@ -.. _executing-inputs: +.. _formats-inputs: Inputs diff --git a/docs/index.rst b/docs/index.rst index 8bf44d4e..6b3266dc 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -4,12 +4,13 @@ :parser: myst_parser.sphinx_ .. toctree:: - :caption: Execution - :name: executing + :caption: File Formats + :name: formats :hidden: :maxdepth: 1 - executing/inputs.rst + formats/inputs.rst + formats/haplotypes.rst .. toctree:: :caption: Commands @@ -18,6 +19,8 @@ :maxdepth: 1 commands/simgenotype.rst + commands/simphenotype.rst + commands/transform.rst .. toctree:: :caption: API diff --git a/docs/project_info/haptools_file_formats.rst b/docs/project_info/haptools_file_formats.rst deleted file mode 100644 index 67fc1401..00000000 --- a/docs/project_info/haptools_file_formats.rst +++ /dev/null @@ -1,3 +0,0 @@ -# Haptools file formats - -UNDER CONSTRUCTION \ No newline at end of file diff --git a/haptools/__main__.py b/haptools/__main__.py index 3bcb743c..b89e4533 100644 --- a/haptools/__main__.py +++ b/haptools/__main__.py @@ -1,9 +1,13 @@ +#!/usr/bin/env python + +from __future__ import annotations import sys import click +from pathlib import Path -from .karyogram import PlotKaryogram -from .sim_genotypes import simulate_gt, write_breakpoints -from .sim_phenotypes import simulate_pt +# AVOID IMPORTING ANYTHING HERE +# any imports we put here will make it slower to use the command line client +# a basic "haptools --help" should be quick and require very few imports, for example ################### Haptools ################## @click.group() @@ -38,6 +42,7 @@ def karyogram(bp, sample, out, title, centromeres, colors): --out test.png --centromeres tests/data/centromeres_hg19.txt \ --colors 'CEU:blue,YRI:red' """ + from .karyogram import PlotKaryogram if colors is not None: colors = dict([item.split(":") for item in colors.split(",")]) PlotKaryogram(bp, sample, out, \ @@ -69,12 +74,10 @@ def simgenotype(invcf, sample_info, model, mapdir, out, popsize, seed, chroms): --mapdir map/ \ --out test """ + from .sim_genotypes import simulate_gt, write_breakpoints chroms = chroms.split(',') samples, breakpoints = simulate_gt(model, mapdir, chroms, popsize, seed) breakpoints = write_breakpoints(samples, breakpoints, out) - - - ############ Haptools simphenotype ############### DEFAULT_SIMU_REP = 1 @@ -101,6 +104,7 @@ def simphenotype(vcf, hap, simu_rep, simu_hsq, simu_k, simu_qt, simu_cc, out): """ Haplotype-aware phenotype simulation """ + from .sim_phenotypes import simulate_pt # Basic checks on input # TODO - check VCF zipped, check only one of simu-qt/simu-cc, # check values of other inputs @@ -110,6 +114,137 @@ def simphenotype(vcf, hap, simu_rep, simu_hsq, simu_k, simu_qt, simu_cc, out): simulate_pt(vcf, hap, simu_rep, \ simu_hsq, simu_k, simu_qt, simu_cc, out) +@main.command(short_help="Transform a genotypes matrix via a set of haplotypes") +@click.argument("genotypes", type=click.Path(exists=True, path_type=Path)) +@click.argument("haplotypes", type=click.Path(exists=True, path_type=Path)) +@click.option( + "--region", + type=str, + default=None, + show_default="all genotypes", + help=""" + The region from which to extract genotypes; ex: 'chr1:1234-34566' or 'chr7'\n + For this to work, the VCF must be indexed and the seqname must match!""", +) +@click.option( + "-s", + "--sample", + "samples", + type=str, + multiple=True, + show_default="all samples", + help=( + "A list of the samples to subset from the genotypes file (ex: '-s sample1 -s" + " sample2')" + ), +) +@click.option( + "-S", + "--samples-file", + type=click.File("r"), + show_default="all samples", + help=( + "A single column txt file containing a list of the samples (one per line) to" + " subset from the genotypes file" + ), +) +@click.option( + "-o", + "--output", + type=click.Path(path_type=Path), + default=Path("-"), + show_default="stdout", + help="A VCF file containing haplotype 'genotypes'", +) +@click.option( + "-v", + "--verbosity", + type=click.Choice(["CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"]), + default="ERROR", + show_default="only errors", + help="The level of verbosity desired", +) +def transform( + genotypes: Path, + haplotypes: Path, + region: str = None, + samples: tuple[str] = tuple(), + samples_file: Path = None, + output: Path = Path("-"), + verbosity: str = 'CRITICAL', +): + """ + Creates a VCF composed of haplotypes + + GENOTYPES must be formatted as a VCF and HAPLOTYPES must be formatted according + to the .hap format spec + + \f + Examples + -------- + >>> haptools transform tests/data/example.vcf.gz tests/data/example.hap.gz > example_haps.vcf + + Parameters + ---------- + genotypes : Path + The path to the genotypes in VCF format + haplotypes : Path + The path to the haplotypes in a .hap file + region : str, optional + See documentation for :py:meth:`~.data.Genotypes.read` + sample : Tuple[str], optional + See documentation for :py:meth:`~.data.Genotypes.read` + samples_file : Path, optional + A single column txt file containing a list of the samples (one per line) to + subset from the genotypes file + output : Path, optional + The location to which to write output + verbosity : str, optional + The level of verbosity desired in messages written to stderr + """ + import logging + + from haptools import data + from .haplotype import HaptoolsHaplotype + + log = logging.getLogger("run") + logging.basicConfig( + format="[%(levelname)8s] %(message)s (%(filename)s:%(lineno)s)", + level=verbosity, + ) + # handle samples + if samples and samples_file: + raise click.UsageError( + "You may only use one of --sample or --samples-file but not both." + ) + if samples_file: + with samples_file as samps_file: + samples = samps_file.read().splitlines() + elif samples: + # needs to be converted from tuple to list + samples = list(samples) + else: + samples = None + + log.info("Loading haplotypes") + hp = data.Haplotypes(haplotypes) + hp.read(region=region) + log.info("Extracting variants from haplotypes") + variants = {var.id for hap in hp.data.values() for var in hap.variants} + log.info("Loading genotypes") + gt = data.GenotypesRefAlt(genotypes, log=log) + # gt._prephased = True + gt.read(region=region, samples=samples, variants=variants) + gt.check_missing(discard_also=True) + gt.check_biallelic(discard_also=True) + gt.check_phase() + log.info("Transforming genotypes via haplotypes") + hp_gt = data.GenotypesRefAlt(fname=output, log=log) + hp.transform(gt, hp_gt) + log.info("Writing haplotypes to VCF") + hp_gt.write() + + if __name__ == "__main__": # run the CLI if someone tries 'python -m haptools' on the command line main(prog_name="haptools") diff --git a/haptools/data/__init__.py b/haptools/data/__init__.py index 8c30fd86..5953f7d6 100644 --- a/haptools/data/__init__.py +++ b/haptools/data/__init__.py @@ -1,4 +1,5 @@ from .data import Data -from .genotypes import Genotypes +from .genotypes import Genotypes, GenotypesRefAlt from .phenotypes import Phenotypes from .covariates import Covariates +from .haplotypes import Extra, Variant, Haplotype, Haplotypes diff --git a/haptools/data/covariates.py b/haptools/data/covariates.py index bf7321c2..45ad1f30 100644 --- a/haptools/data/covariates.py +++ b/haptools/data/covariates.py @@ -2,6 +2,7 @@ from csv import reader from pathlib import Path from collections import namedtuple +from logging import getLogger, Logger from fileinput import hook_compressed import numpy as np @@ -111,7 +112,7 @@ def read(self, samples: list[str] = None): # coerce strings to floats self.data = np.transpose(np.array(data[1:], dtype="float64")) - def iterate(self, samples: list[str] = None) -> Iterator[namedtuple]: + def __iter__(self, samples: list[str] = None) -> Iterator[namedtuple]: """ Read covariates from a TSV line by line without storing anything diff --git a/haptools/data/data.py b/haptools/data/data.py index 8709dcc7..ab6b6b35 100644 --- a/haptools/data/data.py +++ b/haptools/data/data.py @@ -45,16 +45,27 @@ def load(cls: Data, fname: Path): """ pass + def unset(self) -> bool: + """ + Whether the data has been loaded into the object yet + + Returns + ------- + bool + True if :py:attr:`~.Data.data` is None else False + """ + return self.data is None + @abstractmethod def read(self): """ Read the raw file contents into the class properties """ - if self.data is not None: + if not self.unset(): self.log.warning("The data has already been loaded. Overriding.") @abstractmethod - def iterate(self) -> Iterator[namedtuple]: + def __iter__(self) -> Iterator[namedtuple]: """ Return an iterator over the raw file contents diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py index ad864fd1..24e36fd9 100644 --- a/haptools/data/genotypes.py +++ b/haptools/data/genotypes.py @@ -1,10 +1,14 @@ from __future__ import annotations +from csv import reader from pathlib import Path from typing import Iterator from collections import namedtuple +from logging import getLogger, Logger import numpy as np +import numpy.typing as npt from cyvcf2 import VCF, Variant +from pysam import VariantFile, TabixFile from .data import Data @@ -15,34 +19,54 @@ class Genotypes(Data): Attributes ---------- - data : np.array + data : npt.NDArray The genotypes in an n (samples) x p (variants) x 2 (strands) array fname : Path The path to the read-only file containing the data samples : tuple[str] The names of each of the n samples - variants : list + variants : np.array Variant-level meta information: 1. ID 2. CHROM 3. POS 4. AAF: allele freq of alternate allele (or MAF if to_MAC() is called) log: Logger - A logging instance for recording debug statements. + A logging instance for recording debug statements + _prephased : bool + If True, assume that the genotypes are phased. Otherwise, extract their phase + when reading from the VCF. Examples -------- >>> genotypes = Genotypes.load('tests/data/simple.vcf') + >>> # directly access the loaded variants, samples, and genotypes (in data) + >>> genotypes.variants + >>> genotypes.samples + >>> genotypes.data """ def __init__(self, fname: Path, log: Logger = None): super().__init__(fname, log) self.samples = tuple() - self.variants = np.array([]) + self.variants = np.array( + [], + dtype=[ + ("id", "U50"), + ("chrom", "U10"), + ("pos", np.uint32), + ("aaf", np.float64), + ], + ) + self._prephased = False @classmethod def load( - cls: Genotypes, fname: Path, region: str = None, samples: list[str] = None + cls: Genotypes, + fname: Path, + region: str = None, + samples: list[str] = None, + variants: set[str] = None, ) -> Genotypes: """ Load genotypes from a VCF file @@ -57,6 +81,8 @@ def load( See documentation for :py:meth:`~.Genotypes.read` samples : list[str], optional See documentation for :py:meth:`~.Genotypes.read` + variants : set[str], optional + See documentation for :py:meth:`~.Genotypes.read` Returns ------- @@ -64,13 +90,21 @@ def load( A Genotypes object with the data loaded into its properties """ genotypes = cls(fname) - genotypes.read(region, samples) + genotypes.read(region, samples, variants) + genotypes.check_missing() genotypes.check_biallelic() - genotypes.check_phase() + if not self._prephased: + genotypes.check_phase() # genotypes.to_MAC() return genotypes - def read(self, region: str = None, samples: list[str] = None): + def read( + self, + region: str = None, + samples: list[str] = None, + variants: set[str] = None, + max_variants: int = None, + ): """ Read genotypes from a VCF into a numpy matrix stored in :py:attr:`~.Genotypes.data` @@ -91,61 +125,109 @@ def read(self, region: str = None, samples: list[str] = None): A subset of the samples from which to extract genotypes Defaults to loading genotypes from all samples + variants : set[str], optional + A set of variant IDs for which to extract genotypes + + All other variants will be ignored. This may be useful if you're running + out of memory. + max_variants : int, optional + The maximum mumber of variants to load from the file. Setting this value + helps preallocate the arrays, making the process faster and less memory + intensive. You should use this option if your processes are frequently + "Killed" from memory overuse. + + If you don't know how many variants there are, set this to a large number + greater than what you would except. The np array will be resized + appropriately. You can also use the bcftools "counts" plugin to obtain the + number of expected sites within a region. + + Note that this value is ignored if the variants argument is provided. """ super().read() - # initialize variables - vcf = VCF(str(self.fname), samples=samples) - self.samples = tuple(vcf.samples) - self.variants = [] - self.data = [] - # load all info into memory - for variant in vcf(region): - # save meta information about each variant - self.variants.append((variant.ID, variant.CHROM, variant.POS, variant.aaf)) - # extract the genotypes to a matrix of size n x p x 3 - # the last dimension has three items: - # 1) presence of REF in strand one - # 2) presence of REF in strand two - # 3) whether the genotype is phased - self.data.append(variant.genotypes) - vcf.close() - # convert to np array for speedy operations later on - self.variants = np.array( - self.variants, - dtype=[ - ("id", "U50"), - ("chrom", "U10"), - ("pos", np.uint), - ("aaf", np.float64), - ], - ) - self.data = np.array(self.data, dtype=np.uint8) + records = self.__iter__(region=region, samples=samples, variants=variants) + if variants is not None: + max_variants = len(variants) + # check whether we can preallocate memory instead of making copies + if max_variants is None: + self.log.warning( + "The max_variants parameter was not specified. We have no choice but to" + " append to an ever-growing array, which can lead to memory overuse!" + ) + variants_arr = [] + data_arr = [] + for rec in records: + variants_arr.append(rec.variants) + data_arr.append(rec.data) + self.log.info(f"Copying {len(variants_arr)} variants into np arrays.") + # convert to np array for speedy operations later on + self.variants = np.array(variants_arr, dtype=self.variants.dtype) + self.data = np.array(data_arr, dtype=np.uint8) + else: + # preallocate arrays! this will save us lots of memory and speed b/c + # appends can sometimes make copies + self.variants = np.empty((max_variants,), dtype=self.variants.dtype) + # in order to check_phase() later, we must store the phase info, as well + self.data = np.empty( + (max_variants, len(self.samples), (2 + (not self._prephased))), + dtype=np.uint8, + ) + num_seen = 0 + for rec in records: + self.variants[num_seen] = rec.variants + self.data[num_seen] = rec.data + num_seen += 1 + if max_variants > num_seen: + self.log.info( + f"Removing {max_variants-num_seen} unneeded variant records that " + "were preallocated b/c max_variants was specified." + ) + self.variants = self.variants[:num_seen] + self.data = self.data[:num_seen] if self.data.shape == (0, 0, 0): self.log.warning( "Failed to load genotypes. If you specified a region, check that the" " contig name matches! For example, double-check the 'chr' prefix." ) # transpose the GT matrix so that samples are rows and variants are columns + self.log.info("Transposing genotype matrix.") self.data = self.data.transpose((1, 0, 2)) - def iterate( - self, region: str = None, samples: list[str] = None - ) -> Iterator[namedtuple]: + def _variant_arr(self, record: Variant): """ - Read genotypes from a VCF line by line without storing anything + Construct a np array from the metadata in a line of the VCF + + This is a helper function for :py:meth:`~.Genotypes._iterate`. It's separate + so that it can easily be overridden in any child classes. Parameters ---------- - region : str, optional - The region from which to extract genotypes; ex: 'chr1:1234-34566' or 'chr7' + record: Variant + A cyvcf2.Variant object from which to fetch metadata - For this to work, the VCF must be indexed and the seqname must match! + Returns + ------- + npt.NDArray + A row from the :py:attr:`~.Genotypes.variants` array + """ + return np.array( + (record.ID, record.CHROM, record.POS, record.aaf), + dtype=self.variants.dtype, + ) - Defaults to loading all genotypes - samples : list[str], optional - A subset of the samples from which to extract genotypes + def _iterate(self, vcf: VCF, region: str = None, variants: set[str] = None): + """ + A generator over the lines of a VCF - Defaults to loading genotypes from all samples + This is a helper function for :py:meth:`~.Genotypes.__iter__` + + Parameters + ---------- + vcf: VCF + The cyvcf2.VCF object from which to fetch variant records + region : str, optional + See documentation for :py:meth:`~.Genotypes.read` + variants : set[str], optional + See documentation for :py:meth:`~.Genotypes.read` Yields ------ @@ -153,30 +235,90 @@ def iterate( An iterator over each line in the file, where each line is encoded as a namedtuple containing each of the class properties """ - vcf = VCF(str(self.fname), samples=samples) - samples = tuple(vcf.samples) - Record = namedtuple("Record", "data samples variants") - # load all info into memory + self.log.info(f"Loading genotypes from {len(self.samples)} samples") + Record = namedtuple("Record", "data variants") + # iterate over each line in the VCF + # note, this can take a lot of time if there are many samples for variant in vcf(region): + if variants is not None and variant.ID not in variants: + continue # save meta information about each variant - variants = np.array( - (variant.ID, variant.CHROM, variant.POS, variant.aaf), - dtype=[ - ("id", "U50"), - ("chrom", "U10"), - ("pos", np.uint), - ("aaf", np.float64), - ], - ) + variant_arr = self._variant_arr(variant) # extract the genotypes to a matrix of size 1 x p x 3 # the last dimension has three items: # 1) presence of REF in strand one # 2) presence of REF in strand two - # 3) whether the genotype is phased + # 3) whether the genotype is phased (if self._prephased is False) data = np.array(variant.genotypes, dtype=np.uint8) - yield Record(data, samples, variants) + data = data[:, : (2 + (not self._prephased))] + yield Record(data, variant_arr) vcf.close() + def __iter__( + self, region: str = None, samples: list[str] = None, variants: set[str] = None + ) -> Iterator[namedtuple]: + """ + Read genotypes from a VCF line by line without storing anything + + Parameters + ---------- + region : str, optional + See documentation for :py:meth:`~.Genotypes.read` + samples : list[str], optional + See documentation for :py:meth:`~.Genotypes.read` + variants : set[str], optional + See documentation for :py:meth:`~.Genotypes.read` + + Returns + ------- + Iterator[namedtuple] + See documentation for :py:meth:`~.Genotypes._iterate` + """ + vcf = VCF(str(self.fname), samples=samples, lazy=True) + self.samples = tuple(vcf.samples) + # call another function to force the lines above to be run immediately + # see https://stackoverflow.com/a/36726497 + return self._iterate(vcf, region, variants) + + def check_missing(self, discard_also=False): + """ + Check that each sample is properly genotyped + + Raises + ------ + ValueError + If any of the samples have missing genotypes 'GT: .|.' + + Parameters + ---------- + discard_also : bool, optional + If True, discard any samples that are missing genotypes without raising a + ValueError + """ + # check: are there any samples that have genotype values that are empty? + # A genotype value equal to the max for uint8 indicates the value was missing + missing = np.any(self.data[:, :, :2] == np.iinfo(np.uint8).max, axis=2) + if np.any(missing): + samp_idx, variant_idx = np.nonzero(missing) + if discard_also: + self.log.info( + f"Ignoring missing genotypes from {len(samp_idx)} samples" + ) + self.data = np.delete(self.data, samp_idx, axis=0) + self.samples = tuple(np.delete(self.samples, samp_idx)) + else: + raise ValueError( + "Genotype with ID {} at POS {}:{} is missing for sample {}".format( + *tuple(self.variants[variant_idx[0]])[:3], + self.samples[samp_idx[0]], + ) + ) + if discard_also and not self.data.shape[0]: + self.log.warning( + "All samples were discarded! Check that that none of your variants are" + " missing genotypes (GT: '.|.')." + ) + def check_biallelic(self, discard_also=False): """ Check that each genotype is composed of only two alleles @@ -185,9 +327,6 @@ def check_biallelic(self, discard_also=False): Raises ------ - AssertionError - If the number of alleles has already been checked and the dtype has been - converted to bool ValueError If any of the genotypes have more than two alleles @@ -205,15 +344,22 @@ def check_biallelic(self, discard_also=False): if np.any(multiallelic): samp_idx, variant_idx = np.nonzero(multiallelic) if discard_also: + self.log.info(f"Ignoring {len(variant_idx)} multiallelic variants") self.data = np.delete(self.data, variant_idx, axis=1) self.variants = np.delete(self.variants, variant_idx) else: raise ValueError( - "Variant with ID {} at POS {}:{} is multiallelic for sample {}".format( + "Variant with ID {} at POS {}:{} is multiallelic for sample {}" + .format( *tuple(self.variants[variant_idx[0]])[:3], self.samples[samp_idx[0]], ) ) + if discard_also and not self.data.shape[1]: + self.log.warning( + "All variants were discarded! Check that there are biallelic variants " + "in your dataset." + ) self.data = self.data.astype(np.bool_) def check_phase(self): @@ -224,12 +370,10 @@ def check_phase(self): Raises ------ - AssertionError - If the phase information has already been checked and removed from the data ValueError If any heterozgyous genotpyes are unphased """ - if self.data.shape[2] < 3: + if self._prephased or self.data.shape[2] < 3: self.log.warning("Phase information has already been removed from the data") return # check: are there any variants that are heterozygous and unphased? @@ -277,98 +421,257 @@ def to_MAC(self): ] -class GenotypesPLINK(Data): +class GenotypesRefAlt(Genotypes): """ - A class for processing genotypes from a PLINK .pgen file + A class for processing genotypes from a file + Unlike the base Genotypes class, this class also includes REF and ALT alleles in + the variants array Attributes ---------- data : np.array - The genotypes in an n (samples) x p (variants) x 2 (strands) array - samples : tuple - The names of each of the n samples - variants : list + See documentation for :py:attr:`~.Genotypes.data` + fname : Path + See documentation for :py:attr:`~.Genotypes.fname` + samples : tuple[str] + See documentation for :py:attr:`~.Genotypes.samples` + variants : np.array Variant-level meta information: 1. ID 2. CHROM 3. POS 4. AAF: allele freq of alternate allele (or MAF if to_MAC() is called) + 5. REF + 6. ALT log: Logger - A logging instance for recording debug statements. - - Examples - -------- - >>> genotypes = Genotypes.load('tests/data/simple.pgen') + See documentation for :py:attr:`~.Genotypes.log` """ def __init__(self, fname: Path, log: Logger = None): - super().__init__(fname, log) + super(Genotypes, self).__init__(fname, log) self.samples = tuple() - self.variants = np.array([]) + self.variants = np.array( + [], + dtype=[ + ("id", "U50"), + ("chrom", "U10"), + ("pos", np.uint32), + ("aaf", np.float64), + ("ref", "U100"), + ("alt", "U100"), + ], + ) + self._prephased = False - @classmethod - def load( - cls: Genotypes, fname: Path, region: str = None, samples: list[str] = None - ) -> Genotypes: + def _variant_arr(self, record: Variant): """ - Load genotypes from a VCF file + See documentation for :py:meth:`~.Genotypes._variant_arr` + """ + return np.array( + ( + record.ID, + record.CHROM, + record.POS, + record.aaf, + record.REF, + record.ALT[0], + ), + dtype=self.variants.dtype, + ) + + def write(self): + """ + Write the variants in this class to a VCF at :py:attr:`~.GenotypesRefAlt.fname` + """ + vcf = VariantFile(str(self.fname), mode="w") + # make sure the header is properly structured + for contig in set(self.variants["chrom"]): + vcf.header.contigs.add(contig) + vcf.header.add_meta( + "FORMAT", + items=[ + ("ID", "GT"), + ("Number", 1), + ("Type", "String"), + ("Description", "Genotype"), + ], + ) + for sample in self.samples: + # TODO: figure out how to make this work for large datasets + # it gets stuck and takes a long time to exit this loop + vcf.header.add_sample(sample) + self.log.info("Writing VCF records") + for var_idx, var in enumerate(self.variants): + rec = { + "contig": var["chrom"], + "start": var["pos"], + "stop": var["pos"] + 1, + "qual": None, + "alleles": tuple(var[["ref", "alt"]]), + "id": var["id"], + "filter": None, + } + # handle pysam increasing the start site by 1 + rec["start"] -= 1 + # parse the record into a pysam.VariantRecord + record = vcf.new_record(**rec) + for samp_idx, sample in enumerate(self.samples): + record.samples[sample]["GT"] = tuple(self.data[samp_idx, var_idx]) + record.samples[sample].phased = True + # write the record to a file + vcf.write(record) + vcf.close() - Read the file contents, check the genotype phase, and create the MAC matrix + +class GenotypesPLINK(GenotypesRefAlt): + """ + A class for processing genotypes from a PLINK .pgen file + + NOTE: this class is still under-development and NOT fit for usage + + Attributes + ---------- + data : np.array + See documentation for :py:attr:`~.GenotypesRefAlt.data` + samples : tuple + See documentation for :py:attr:`~.GenotypesRefAlt.data` + variants : np.array + See documentation for :py:attr:`~.GenotypesRefAlt.data` + log: Logger + See documentation for :py:attr:`~.GenotypesRefAlt.data` + _prephased: bool + See documentation for :py:attr:`~.GenotypesRefAlt.data` + + Examples + -------- + >>> genotypes = GenotypesPLINK.load('tests/data/simple.pgen') + """ + + def read_variants( + self, + variants: set[str] = None, + max_variants: int = None, + ): + """ + Read variants from a PVAR file into a numpy array stored in + :py:attr:`~.GenotypesPLINK.variants` + + One of either variants or max_variants MUST be specified! Parameters ---------- - fname - See documentation for :py:attr:`~.Data.fname` region : str, optional - See documentation for :py:meth:`~.Genotypes.read` - samples : list[str], optional - See documentation for :py:meth:`~.Genotypes.read` + See documentation for :py:attr:`~.GenotypesRefAlt.read` + variants : set[str], optional + See documentation for :py:attr:`~.GenotypesRefAlt.read` + max_variants : int, optional + See documentation for :py:attr:`~.GenotypesRefAlt.read` Returns ------- - genotypes - A Genotypes object with the data loaded into its properties + npt.NDArray[np.uint32] + The indices of each of the variants within the PVAR file """ - genotypes = cls(fname) - genotypes.read(region, samples) - genotypes.check_biallelic() - genotypes.check_phase() - # genotypes.to_MAC() - return genotypes - - def read(self, region: str = None, samples: list[str] = None): + if len(self.variants) != 0: + self.log.warning("Variant data has already been loaded. Overriding.") + if variants is not None: + max_variants = len(variants) + if max_variants is None: + raise ValueError("Provide either the variants or max_variants parameter!") + # TODO: try using pysam.tabix_iterator with the parser param specified for VCF + if region: + pvar = TabixFile(str(self.fname.with_suffix(".pvar"))) + pvariants = pvar.fetch(region) + else: + pvar = hook_compressed(self.fname.with_suffix(".pvar"), mode="rt") + pvariants = reader(pvar, delimiter="\t") + # first, preallocate the array + self.variants = np.empty((max_variants,), dtype=self.variants.dtype) + header = next(pvariants) + # there should be at least five columns + if len(header) < 5: + raise ValueError("Your PVAR file should have at least five columns.") + if header[0][0] == "#": + header[0] = header[0][1:] + else: + raise ValueError("Your PVAR file is missing a header!") + header = ["CHROM", "POS", "ID", "REF", "ALT"] + self.log.info( + "Your PVAR file lacks a proper header! Assuming first five columns" + " are [CHROM, POS, ID, REF, ALT] in that order..." + ) + # TODO: add header back in to pvariants using itertools.chain? + cid = {item: header.index(item.upper()) for item in self.variants.dtype.names} + if variants is not None: + indices = np.empty((max_variants,), dtype=np.uint32) + num_seen = 0 + for ct, rec in enumerate(pvariants): + if variants is not None: + if rec[cid["id"]] not in variants: + continue + indices[num_seen] = ct + self.variants[num_seen] = np.array( + ( + rec[cid["id"]], + rec[cid["chrom"]], + rec[cid["pos"]], + 0, + rec[cid["ref"]], + rec[cid["alt"]], + ), + dtype=self.variants.dtype, + ) + num_seen += 1 + pvar.close() + if variants is not None: + return indices + + def read( + self, + region: str = None, + samples: list[str] = None, + variants: set[str] = None, + max_variants: int = None, + ): """ - Read genotypes from a VCF into a numpy matrix stored in :py:attr:`~.Genotypes.data` + Read genotypes from a PGEN file into a numpy matrix stored in + :py:attr:`~.GenotypesPLINK.data` Parameters ---------- region : str, optional - The region from which to extract genotypes; ex: 'chr1:1234-34566' or 'chr7' - - For this to work, the VCF must be indexed and the seqname must match! - - Defaults to loading all genotypes + See documentation for :py:attr:`~.GenotypesRefAlt.read` samples : list[str], optional - A subset of the samples from which to extract genotypes - - Defaults to loading genotypes from all samples + See documentation for :py:attr:`~.GenotypesRefAlt.read` + variants : set[str], optional + See documentation for :py:attr:`~.GenotypesRefAlt.read` + max_variants : int, optional + See documentation for :py:attr:`~.GenotypesRefAlt.read` """ super().read() - # TODO: load the variant-level info from the .pvar file - # and use that info to figure out how many variants there are in the region - variant_ct_start = 0 - variant_ct_end = None - variant_ct = variant_ct_end - variant_ct_start # load the pgen-reader file # note: very little is loaded into memory at this point - from pgenlib import PgenReader # TODO: figure out how to install this package + # TODO: figure out how to install this package + from pgenlib import PgenReader + + pgen = PgenReader(bytes(self.fname, "utf8")) + + if variants is not None: + max_variants = len(variants) + if max_variants is None: + # use the pgen file to figure out how many variants there are + max_variants = pgen.get_variant_ct() + indices = self.read_variants(region, variants, max_variants) + + variant_ct_start = 0 + variant_ct_end = max_variants + variant_ct = variant_ct_end - variant_ct_start - pgen = PgenReader(bytes(self.fname)) sample_ct = pgen.get_raw_sample_ct() # the genotypes start out as a simple 2D array with twice the number of samples # so each column is a different chromosomal strand - self.data = np.empty((variant_ct, sample_ct * 2), np.int32) - pgen.read_alleles_range(variant_ct_start, variant_ct_end, self.data) + self.data = np.empty((variant_ct, sample_ct * 2), dtype=np.int32) + pgen.read_alleles(indices, self.data) # extract the genotypes to a np matrix of size n x p x 2 # the last dimension has two items: # 1) presence of REF in strand one diff --git a/haptools/data/haplotypes.py b/haptools/data/haplotypes.py new file mode 100644 index 00000000..f15f5c9b --- /dev/null +++ b/haptools/data/haplotypes.py @@ -0,0 +1,807 @@ +from __future__ import annotations +import re +from pathlib import Path +from logging import getLogger, Logger +from fileinput import hook_compressed +from dataclasses import dataclass, field, fields +from typing import Iterator, get_type_hints, Generator + +import numpy as np +import numpy.typing as npt +from pysam import TabixFile + +from .data import Data +from .genotypes import GenotypesRefAlt + + +@dataclass +class Extra: + """ + An extra field on a line in the .hap file + + Attributes + ---------- + name: str + The name of the extra field + fmt: str = "s" + The python fmt string of the field value; indicates how to format the value + description: str = "" + A description of the extra field + """ + + name: str + fmt: str = "s" + description: str = "" + _type: type = field(init=False, repr=False) + + def __post_init(self): + if self.fmt.endswith("s"): + self._type = str + elif self.fmt.endswith("d"): + self._type = int + elif self.fmt.endswith("f"): + self._type = float + else: + raise ValueError("Unsupported extra type '{}'!".format(self.fmt[-1])) + + @classmethod + def from_hap_spec(cls, line: str) -> Extra: + """ + Convert an "extra" line in the header of a .hap file into an Extra object + + Parameters + ---------- + line: str + An "extra" field, as it appears declared in the header + + Returns + ------- + Extra + An Extra object + """ + line = line[3:].split("\t") + return cls(name=line[0], fmt=line[1], description=line[2]) + + def to_hap_spec(self, line_type_symbol: str) -> str: + """ + Convert an Extra object into a header line in the .hap format spec + + Parameters + ---------- + hap_id: str + The ID of the haplotype associated with this variant + + Returns + ------- + str + A valid variant line (V) in the .hap format spec + """ + return ( + "#" + + line_type_symbol + + "\t" + + "\t".join((self.name, self.fmt, self.description)) + ) + + @property + def fmt_str(self) -> str: + """ + Convert an Extra into a fmt string + + Retruns + ------- + str + A python format string (ex: "{beta:.3f}") + """ + return "{" + self.name + ":" + self.fmt + "}" + + +# We declare this class to be a dataclass to automatically define __init__ and a few +# other methods. +@dataclass +class Variant: + """ + A variant within the .hap format spec + + In order to use this class with the Haplotypes class, you should + 1) add properties to the class for each of extra fields + 2) override the _extras property to describe the header declaration + + Attributes + ---------- + start: int + The chromosomal start position of the variant + end: int + The chromosomal end position of the variant + + In most cases this will be the same as the start position + id: str + The variant's unique ID + allele: str + The allele of this variant within the Haplotype + _extras: tuple[Extra] + Extra fields for the haplotype + + Examples + -------- + Let's extend this class and add an extra field called "score" + + >>> from dataclasses import dataclass, field + >>> @dataclass + >>> class CustomVariant(Variant): + ... score: float + ... _extras: tuple = ( + ... Extra("score", ".3f", "Importance of inclusion"), + ... ) + """ + + start: int + end: int + id: str + allele: str + _extras: tuple = field(default=tuple(), init=False, repr=False) + + @property + def ID(self): + """ + Create an alias for the id property + """ + return self.id + + @property + # TODO: use @cached_property in py3.8 + def _fmt(self): + extras = "" + if len(self._extras): + extras = "\t" + "\t".join(extra.fmt_str for extra in self._extras) + return "V\t{hap:s}\t{start:d}\t{end:d}\t{id:s}\t{allele:s}" + extras + + @classmethod + def from_hap_spec(cls: Variant, line: str) -> tuple[str, Variant]: + """ + Convert a variant line into a Variant object in the .hap format spec + + Note that this implementation does NOT support having more extra fields than + appear in the header + + Parameters + ---------- + line: str + A variant (V) line from the .hap file + + Returns + ------- + tuple[str, Variant] + The haplotype ID and Variant object for the variant + """ + assert line[0] == "V", "Attempting to init a Variant with a non-V line" + line = line[2:].split("\t") + hap_id = line[0] + var_fields = {} + idx = 1 + for name, val in get_type_hints(cls).items(): + if not name.startswith("_"): + var_fields[name] = val(line[idx]) + idx += 1 + return hap_id, cls(**var_fields) + + def to_hap_spec(self, hap_id: str) -> str: + """ + Convert a Variant object into a variant line in the .hap format spec + + Parameters + ---------- + hap_id: str + The ID of the haplotype associated with this variant + + Returns + ------- + str + A valid variant line (V) in the .hap format spec + """ + return self._fmt.format(**self.__dict__, hap=hap_id) + + @classmethod + def extras_head(cls) -> tuple: + """ + Return the header lines of the extra fields that are supported + + Returns + ------- + tuple + The header lines of the extra fields + """ + return tuple(extra.to_hap_spec("V") for extra in cls._extras) + + +# We declare this class to be a dataclass to automatically define __init__ and a few +# other methods. +@dataclass +class Haplotype: + """ + A haplotype within the .hap format spec + + In order to use this class with the Haplotypes class, you should + 1) add properties to the class for each of extra fields + 2) override the _extras property to describe the header declaration + + Attributes + ---------- + chrom: str + The contig to which this haplotype belongs + start: int + The chromosomal start position of the haplotype + end: int + The chromosomal end position of the haplotype + id: str + The haplotype's unique ID + variants: list[Variant] + A list of the variants in this haplotype + _extras: tuple[Extra] + Extra fields for the haplotype + + Examples + -------- + Let's extend this class and add an extra field called "ancestry" + + >>> from dataclasses import dataclass, field + >>> @dataclass + >>> class CustomHaplotype(Haplotype): + ... ancestry: str + ... _extras: tuple = ( + ... Extra("ancestry", "s", "Local ancestry"), + ... ) + """ + + chrom: str + start: int + end: int + id: str + variants: tuple = field(default_factory=tuple, init=False) + _extras: tuple = field(default=tuple(), init=False, repr=False) + + @property + def ID(self): + """ + Create an alias for the id property + """ + return self.id + + @property + # TODO: use @cached_property in py3.8 + def _fmt(self): + extras = "" + if len(self._extras): + extras = "\t" + "\t".join(extra.fmt_str for extra in self._extras) + return "H\t{chrom:s}\t{start:d}\t{end:d}\t{id:s}" + extras + + @property + # TODO: use @cached_property in py3.8 + def varIDs(self): + return {var.id for var in self.variants} + + @classmethod + def from_hap_spec( + cls: Haplotype, line: str, variants: tuple = tuple() + ) -> Haplotype: + """ + Convert a variant line into a Haplotype object in the .hap format spec + + Note that this implementation does NOT support having more extra fields than + appear in the header + + Parameters + ---------- + line: str + A variant (H) line from the .hap file + + Returns + ------- + Haplotype + The Haplotype object for the variant + """ + assert line[0] == "H", "Attempting to init a Haplotype with a non-H line" + line = line[2:].split("\t") + hap_fields = {} + idx = 0 + for name, val in get_type_hints(cls).items(): + if name != "variants" and not name.startswith("_"): + hap_fields[name] = val(line[idx]) + idx += 1 + hap = cls(**hap_fields) + hap.variants = variants + return hap + + def to_hap_spec(self) -> str: + """ + Convert a Haplotype object into a haplotype line in the .hap format spec + + Returns + ------- + str + A valid haplotype line (H) in the .hap format spec + """ + return self._fmt.format(**self.__dict__) + + @classmethod + def extras_head(cls) -> tuple: + """ + Return the header lines of the extra fields that are supported + + Returns + ------- + tuple + The header lines of the extra fields + """ + return tuple(extra.to_hap_spec("H") for extra in cls._extras) + + def transform( + self, genotypes: GenotypesRefAlt, samples: list[str] = None + ) -> npt.NDArray[bool]: + """ + Transform a genotypes matrix via the current haplotype + + Each entry in the returned matrix denotes the presence of the current haplotype + in each chromosome of each sample in the Genotypes object + + Parameters + ---------- + genotypes : GenotypesRefAlt + The genotypes which to transform using the current haplotype + + If the genotypes have not been loaded into the Genotypes object yet, this + method will call Genotypes.read(), while loading only the needed variants + samples : list[str], optional + See documentation for :py:attr:`~.Genotypes.read` + + Returns + ------- + npt.NDArray[bool] + A 2D matrix of shape (num_samples, 2) where each entry in the matrix + denotes the presence of the haplotype in one chromosome of a sample + """ + var_IDs = self.varIDs + # check: have the genotypes been loaded yet? + # if not, we can load just the variants we need + if genotypes.unset(): + start = min(var.start for var in self.variants) + end = max(var.end for var in self.variants) + region = f"{self.chrom}:{start}-{end}" + genotypes.read(region=region, samples=samples, variants=var_IDs) + genotypes.check_biallelic(discard_also=True) + genotypes.check_phase() + # create a dict where the variants are keyed by ID + var_dict = { + var["id"]: var["ref"] for var in genotypes.variants if var["id"] in var_IDs + } + var_idxs = [ + idx for idx, var in enumerate(genotypes.variants) if var["id"] in var_IDs + ] + missing_IDs = var_IDs - var_dict.keys() + if len(missing_IDs): + raise ValueError( + f"Variants {missing_IDs} are present in haplotype '{self.id}' but " + "absent in the provided genotypes" + ) + # create a np array denoting the alleles that we want + alleles = [int(var.allele != var_dict[var.id]) for var in self.variants] + allele_arr = np.array([[[al] for al in alleles]]) # shape: (1, n, 1) + # look for the presence of each allele in each chromosomal strand + # and then just AND them together + return np.all(allele_arr == genotypes.data[:, var_idxs], axis=1) + + +class Haplotypes(Data): + """ + A class for processing haplotypes from a file + + Attributes + ---------- + fname: Path + The path to the file containing the data + data: dict[str, Haplotype] + A dict of Haplotype objects keyed by their IDs + types: dict + A dict of class names keyed by the symbol denoting their line type + + Ex: {'H': Haplotype, 'V': Variant} + version: str + A string denoting the current file format version + log: Logger + A logging instance for recording debug statements. + + Examples + -------- + Parsing a basic .hap file without any extra fields is simple: + >>> haplotypes = Haplotypes.load('tests/data/basic.hap') + >>> haps = haplotypes.data # a dictionary of Haplotype objects + + If the .hap file contains extra fields, you'll need to call the read() method + manually. You'll also need to create Haplotype and Variant subclasses that support + the extra fields and then specify the names of the classes when you initialize the + Haplotypes object: + >>> haplotypes = Haplotypes('tests/data/simphenotype.hap', HaptoolsHaplotype) + >>> haplotypes.read() + >>> haps = haplotypes.data # a dictionary of Haplotype objects + """ + + def __init__( + self, + fname: Path, + haplotype: type[Haplotype] = Haplotype, + variant: type[Variant] = Variant, + log: Logger = None, + ): + super().__init__(fname, log) + self.data = None + self.types = {"H": haplotype, "V": variant} + self.version = "0.0.1" + + @classmethod + def load( + cls: Haplotypes, fname: Path, region: str = None, haplotypes: set[str] = None + ) -> Haplotypes: + """ + Load haplotypes from a .hap file + + Read the file contents + + Parameters + ---------- + fname: Path + See documentation for :py:attr:`~.Data.fname` + region: str, optional + See documentation for :py:meth:`~.Haplotypes.read` + haplotypes: list[str], optional + See documentation for :py:meth:`~.Haplotypes.read` + + Returns + ------- + Haplotypes + A Haplotypes object with the data loaded into its properties + """ + haps = cls(fname) + haps.read(region, haplotypes) + return haps + + def check_header(self, lines: list[str], check_version=False): + """ + Check 1) that the version number matches and 2) that extra fields declared in + # the .haps file can be handled by the the Variant and Haplotype classes + # provided in __init__() + + Parameters + ---------- + lines: list[str] + Header lines from the .hap file + check_version: bool = False + Whether to also check the version of the file + + Raises + ------ + ValueError + If any of the header lines are not supported + """ + self.log.info("Checking header") + if check_version: + version_line = lines[0].split("\t") + assert version_line[1] == "version", ( + "The version of the format spec must be declared as the first line of" + " the header." + ) + if version_line[2] != self.version: + self.log.warning( + f"The version of the provided .hap file is {version_line} but this" + f" tool expected {self.version}" + ) + expected_lines = [ + line + for line_type in self.types.values() + for line in line_type.extras_head() + ] + for line in lines: + if line[1] in self.types.keys(): + try: + expected_lines.remove(line) + except ValueError: + # extract the name of the extra field + name = line.split("\t", maxsplit=1)[1] + raise ValueError( + f"The extra field '{name}' is declared in the header of the" + " .hap file but is not accepted by this tool." + ) + # if there are any fields left... + if expected_lines: + names = [line.split("\t", maxsplit=2)[1] for line in expected_lines] + raise ValueError( + "Expected the input .hap file to have these extra fields, but they " + f"don't seem to be declared in the header: {*names,}" + ) + + def _line_type(self, line: str) -> type: + """ + Return the type of line that this line matches + + Parameters + ---------- + line: str + A line of the .hap file + + Returns + ------- + type + The name of the class corresponding with the type of this line + """ + line_types = self.types.keys() + if line[0] in line_types: + return line[0] + else: + # if none of the lines matched, return None + return None + + def read(self, region: str = None, haplotypes: set[str] = None): + """ + Read haplotypes from a .hap file into a list stored in :py:attr:`~.Haplotypes.data` + + Parameters + ---------- + region: str, optional + The region from which to extract haplotypes; ex: 'chr1:1234-34566' or 'chr7' + + For this to work, the .hap file must be indexed and the seqname must match! + + Defaults to loading all haplotypes + haplotypes: list[str], optional + A list of haplotype IDs corresponding to a subset of the haplotypes to + extract + + Defaults to loading haplotypes from all samples + + For this to work, the .hap file must be indexed + """ + super().read() + self.data = {} + var_haps = {} + for line in self.__iter__(region, haplotypes): + if isinstance(line, Haplotype): + self.data[line.id] = line + elif isinstance(line, Variant): + hap_id = line.hap + del line.hap + # store the variant for later + var_haps.setdefault(hap_id, []).append(line) + for hap in var_haps: + self.data[hap].variants = tuple(var_haps[hap]) + + def __iter__( + self, region: str = None, haplotypes: set[str] = None + ) -> Iterator[Variant | Haplotype]: + """ + Read haplotypes from a .hap file line by line without storing anything + + Parameters + ---------- + region: str, optional + The region from which to extract haplotypes; ex: 'chr1:1234-34566' or 'chr7' + + For this to work, the .hap file must be indexed and the seqname must match! + + Defaults to loading all haplotypes + haplotypes: list[str], optional + A list of haplotype IDs corresponding to a subset of the haplotypes to + extract + + Defaults to loading haplotypes from all samples + + For this to work, the .hap file must be indexed + + Yields + ------ + Iterator[Variant|Haplotype] + An iterator over each line in the file, where each line is encoded as a + Variant or Haplotype containing each of the class properties + + Examples + -------- + If you're worried that the contents of the .hap file will be large, you may + opt to parse the file line-by-line instead of loading it all into memory at + once. In cases like these, you can use the __iter__() method in a for-loop: + >>> haplotypes = Haplotypes('tests/data/basic.hap') + >>> for line in haplotypes: + ... print(line) + + Call the function manually to pass it the region or haplotypes params: + >>> haplotypes = Haplotypes('tests/data/basic.hap.gz') + >>> for line in haplotypes.__iter__( + ... region='21:26928472-26941960', haplotypes={"chr21.q.3365*1"} + ... ): + ... print(line) + """ + # if the user requested a specific region or set of haplotypes, then we should + # handle it using tabix + # else, we use a regular text opener + if region or haplotypes: + haps_file = TabixFile(str(self.fname)) + self.check_header(list(haps_file.header)) + if region: + region_positions = region.split(":", maxsplit=1)[1] + # fetch region + # we already know that each line will start with an H, so we don't + # need to check that + for line in haps_file.fetch(region): + hap = self.types["H"].from_hap_spec(line) + if haplotypes is not None: + if hap.id not in haplotypes: + continue + haplotypes.remove(hap.id) + yield hap + else: + for line in haps_file.fetch(): + # we only want lines that start with an H + line_type = self._line_type(line) + if line_type == "H": + hap = self.types["H"].from_hap_spec(line) + if hap.id in haplotypes: + yield hap + haplotypes.remove(hap.id) + elif line_type > "H": + # if we've already passed all of the H's, we can just exit + # We assume the file has been sorted so that all of the H lines + # come before the V lines + break + # query for the variants of each haplotype + for hap_id in self.data: + # exclude variants outside the desired region + hap_region = hap_id + if region: + hap_region = hap_id + ":" + region_positions + # fetch region + # we already know that each line will start with a V, so we don't + # need to check that + for line in haps_file.fetch(hap_region): + line_type = self._line_type(line) + if line_type == "V": + var = self.types["V"].from_hap_spec(line)[1] + # add the haplotype, since otherwise, the user won't know + # which haplotype this variant belongs to + var.hap = hap_id + yield var + else: + self.log.warning( + "Check that chromosomes are distinct from your hap IDs!" + ) + haps_file.close() + else: + # the file is not indexed, so we can't assume it's sorted, either + # use hook_compressed to automatically handle gz files + with hook_compressed(self.fname, mode="rt") as haps: + self.log.info("Not taking advantage of indexing.") + header_lines = [] + for line in haps: + line = line.rstrip("\n") + line_type = self._line_type(line) + if line[0] == "#": + # store header for later + try: + header_lines.append(line) + except AttributeError: + # this happens when we encounter a line beginning with a # + # after already having seen an H or V line + # in this case, it's usually just a comment, so we can ignore + pass + else: + if header_lines: + self.check_header(header_lines) + header_lines = None + self.log.info("Finished reading header.") + if line_type == "H": + yield self.types["H"].from_hap_spec(line) + elif line_type == "V": + hap_id, var = self.types["V"].from_hap_spec(line) + # add the haplotype, since otherwise, the user won't know + # which haplotype this variant belongs to + var.hap = hap_id + yield var + else: + self.log.warning( + f"Ignoring unsupported line type '{line[0]}'" + ) + + def to_str(self) -> Generator[str, None, None]: + """ + Create a string representation of this Haplotype + + Yields + ------ + Generator[str, None, None] + A list of lines (strings) to include in the output + """ + yield "#\tversion\t" + self.version + for line_type in self.types: + yield from self.types[line_type].extras_head() + for hap in self.data.values(): + yield self.types["H"].to_hap_spec(hap) + for hap in self.data.values(): + for var in hap.variants: + yield self.types["V"].to_hap_spec(var, hap.id) + + def __repr__(self): + return "\n".join(self.to_str()) + + def write(self): + """ + Write the contents of this Haplotypes object to the file given by fname + + Parameters + ---------- + file: TextIO + A file-like object to which this Haplotypes object should be written. + + Examples + -------- + To write to a .hap file, you must first initialize a Haplotypes object and then + fill out the data property: + >>> haplotypes = Haplotypes('tests/data/basic.hap') + >>> haplotypes.data = {'H1': Haplotype('chr1', 0, 10, 'H1')} + >>> haplotypes.write() + """ + with hook_compressed(self.fname, mode="wt") as haps: + for line in self.to_str(): + haps.write(line + "\n") + + def transform( + self, + genotypes: GenotypesRefAlt, + hap_gts: GenotypesRefAlt, + samples: list[str] = None, + low_memory: bool = False, + ) -> GenotypesRefAlt: + """ + Transform a genotypes matrix via the current haplotype + + Each entry in the returned matrix denotes the presence of each haplotype + in each chromosome of each sample in the Genotypes object + + Parameters + ---------- + genotypes : GenotypesRefAlt + The genotypes which to transform using the current haplotype + + If the genotypes have not been loaded into the Genotypes object yet, this + method will call Genotypes.read(), while loading only the needed variants + hap_gts: GenotypesRefAlt + An empty GenotypesRefAlt object into which the haplotype genotypes should + be stored + samples : list[str], optional + See documentation for :py:attr:`~.Genotypes.read` + low_memory : bool, optional + If True, each haplotype's genotypes will be loaded one at a time. + + Returns + ------- + GenotypesRefAlt + A Genotypes object composed of haplotypes instead of regular variants. + """ + hap_gts.samples = genotypes.samples + hap_gts.variants = np.array( + [(hap.id, hap.chrom, hap.start, 0, "A", "T") for hap in self.data.values()], + dtype=[ + ("id", "U50"), + ("chrom", "U10"), + ("pos", np.uint32), + ("aaf", np.float64), + ("ref", "U100"), + ("alt", "U100"), + ], + ) + self.log.info( + f"Transforming a set of genotypes from {len(genotypes.variants)} total " + f"variants with a list of {len(self.data)} haplotypes" + ) + hap_gts.data = np.concatenate( + tuple( + hap.transform(genotypes, samples)[:, np.newaxis] + for hap in self.data.values() + ), + axis=1, + ).astype(np.uint8) diff --git a/haptools/data/phenotypes.py b/haptools/data/phenotypes.py index 3c85cc74..a27c2535 100644 --- a/haptools/data/phenotypes.py +++ b/haptools/data/phenotypes.py @@ -2,6 +2,7 @@ from csv import reader from pathlib import Path from collections import namedtuple +from logging import getLogger, Logger from fileinput import hook_compressed import numpy as np @@ -97,7 +98,7 @@ def read(self, samples: list[str] = None): # coerce strings to floats self.data = np.array(self.data, dtype="float64") - def iterate(self, samples: list[str] = None) -> Iterator[namedtuple]: + def __iter__(self, samples: list[str] = None) -> Iterator[namedtuple]: """ Read phenotypes from a TSV line by line without storing anything diff --git a/haptools/haplotype.py b/haptools/haplotype.py new file mode 100644 index 00000000..b6c0a1d8 --- /dev/null +++ b/haptools/haplotype.py @@ -0,0 +1,26 @@ +from __future__ import annotations +from dataclasses import dataclass, field + +import numpy as np + +from .data import Extra, Haplotype + + +@dataclass +class HaptoolsHaplotype(Haplotype): + """ + A haplotype with sufficient fields for simphenotype + + Properties and functions are shared with the base Haplotype object + """ + + ancestry: str + beta: float + _extras: tuple = field( + repr=False, + init=False, + default=( + Extra("ancestry", "s", "Local ancestry"), + Extra("beta", ".2f", "Effect size in linear model"), + ), + ) diff --git a/tests/data/basic.hap b/tests/data/basic.hap new file mode 100644 index 00000000..4ab87f52 --- /dev/null +++ b/tests/data/basic.hap @@ -0,0 +1,14 @@ +# version 0.0.1 +H 21 26928472 26941960 chr21.q.3365*1 +H 21 26938989 26941960 chr21.q.3365*10 +H 21 26938353 26938989 chr21.q.3365*11 +V chr21.q.3365*1 26928472 26928472 21_26928472_C_A C +V chr21.q.3365*1 26938353 26938353 21_26938353_T_C T +V chr21.q.3365*1 26940815 26940815 21_26940815_T_C C +V chr21.q.3365*1 26941960 26941960 21_26941960_A_G G +V chr21.q.3365*10 26938989 26938989 21_26938989_G_A A +V chr21.q.3365*10 26940815 26940815 21_26940815_T_C T +V chr21.q.3365*10 26941960 26941960 21_26941960_A_G A +# this comment should be ignored +V chr21.q.3365*11 26938353 26938353 21_26938353_T_C T +V chr21.q.3365*11 26938989 26938989 21_26938989_G_A A diff --git a/tests/data/basic.hap.gz b/tests/data/basic.hap.gz new file mode 100644 index 00000000..5285a5c4 Binary files /dev/null and b/tests/data/basic.hap.gz differ diff --git a/tests/data/basic.hap.gz.tbi b/tests/data/basic.hap.gz.tbi new file mode 100644 index 00000000..38ec3e11 Binary files /dev/null and b/tests/data/basic.hap.gz.tbi differ diff --git a/tests/data/example.hap.gz b/tests/data/example.hap.gz new file mode 100644 index 00000000..f234916b Binary files /dev/null and b/tests/data/example.hap.gz differ diff --git a/tests/data/example.hap.gz.tbi b/tests/data/example.hap.gz.tbi new file mode 100644 index 00000000..7147844d Binary files /dev/null and b/tests/data/example.hap.gz.tbi differ diff --git a/tests/data/example_haplotypes.vcf.gz b/tests/data/example.vcf.gz similarity index 100% rename from tests/data/example_haplotypes.vcf.gz rename to tests/data/example.vcf.gz diff --git a/tests/data/example_haplotypes.vcf.gz.tbi b/tests/data/example.vcf.gz.tbi similarity index 100% rename from tests/data/example_haplotypes.vcf.gz.tbi rename to tests/data/example.vcf.gz.tbi diff --git a/tests/data/example_haplotypes.hap.gz b/tests/data/example_haplotypes.hap.gz deleted file mode 100644 index 1baa1587..00000000 Binary files a/tests/data/example_haplotypes.hap.gz and /dev/null differ diff --git a/tests/data/example_haplotypes.hap.gz.tbi b/tests/data/example_haplotypes.hap.gz.tbi deleted file mode 100644 index 34d3d65b..00000000 Binary files a/tests/data/example_haplotypes.hap.gz.tbi and /dev/null differ diff --git a/tests/data/simphenotype.hap b/tests/data/simphenotype.hap new file mode 100644 index 00000000..529aef7a --- /dev/null +++ b/tests/data/simphenotype.hap @@ -0,0 +1,15 @@ +# version 0.0.1 +#H ancestry s Local ancestry +#H beta .2f Effect size in linear model +H 21 26928472 26941960 chr21.q.3365*1 ASW 0.73 +H 21 26938989 26941960 chr21.q.3365*10 CEU 0.30 +H 21 26938353 26938989 chr21.q.3365*11 MXL 0.49 +V chr21.q.3365*1 26928472 26928472 21_26928472_C_A C +V chr21.q.3365*1 26938353 26938353 21_26938353_T_C T +V chr21.q.3365*1 26940815 26940815 21_26940815_T_C C +V chr21.q.3365*1 26941960 26941960 21_26941960_A_G G +V chr21.q.3365*10 26938989 26938989 21_26938989_G_A A +V chr21.q.3365*10 26940815 26940815 21_26940815_T_C T +V chr21.q.3365*10 26941960 26941960 21_26941960_A_G A +V chr21.q.3365*11 26938353 26938353 21_26938353_T_C T +V chr21.q.3365*11 26938989 26938989 21_26938989_G_A A diff --git a/tests/data/simple.hap.gz b/tests/data/simple.hap.gz deleted file mode 100644 index 62093a21..00000000 Binary files a/tests/data/simple.hap.gz and /dev/null differ diff --git a/tests/test_data.py b/tests/test_data.py index 40a03a64..190e3370 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -1,136 +1,212 @@ +import os +from pathlib import Path + import pytest import numpy as np +import numpy.lib.recfunctions as rfn -from pathlib import Path - -from haptools.data import Genotypes, Phenotypes, Covariates +from haptools.haplotype import HaptoolsHaplotype +from haptools.data import ( + Genotypes, + GenotypesRefAlt, + Phenotypes, + Covariates, + Haplotypes, + Variant, + Haplotype, +) DATADIR = Path(__file__).parent.joinpath("data") -def get_expected_genotypes(): - # create a GT matrix with shape: samples x SNPs x (strands+phase) - expected = np.zeros(60).reshape((5, 4, 3)).astype(np.uint8) - expected[:4, 1, 1] = 1 - expected[2:4, 1, 0] = 1 - expected[:, :, 2] = 1 - return expected - - -def test_load_genotypes(caplog): - expected = get_expected_genotypes() - - # can we load the data from the VCF? - gts = Genotypes(DATADIR.joinpath("simple.vcf")) - gts.read() - np.testing.assert_allclose(gts.data, expected) - assert gts.samples == ("HG00096", "HG00097", "HG00099", "HG00100", "HG00101") - - # try loading the data again - it should warn b/c we've already done it - gts.read() - assert len(caplog.records) == 1 and caplog.records[0].levelname == "WARNING" - - # force one of the SNPs to have more than one allele and check that we get an error - gts.data[1, 1, 1] = 2 - with pytest.raises(ValueError) as info: +class TestGenotypes: + def _get_expected_genotypes(self): + # create a GT matrix with shape: samples x SNPs x (strands+phase) + expected = np.zeros(60).reshape((5, 4, 3)).astype(np.uint8) + expected[:4, 1, 1] = 1 + expected[2:4, 1, 0] = 1 + expected[:, :, 2] = 1 + return expected + + def _get_fake_genotypes(self): + gts = Genotypes(fname=None) + gts.data = self._get_expected_genotypes() + gts.variants = np.array( + [ + ("1:10114:T:C", "1", 10114, 0), + ("1:10116:A:G", "1", 10116, 0.6), + ("1:10117:C:A", "1", 10117, 0), + ("1:10122:A:G", "1", 10122, 0), + ], + dtype=[ + ("id", "U50"), + ("chrom", "U10"), + ("pos", np.uint32), + ("aaf", np.float64), + ], + ) + gts.samples = ("HG00096", "HG00097", "HG00099", "HG00100", "HG00101") + gts.check_phase() + return gts + + def _get_fake_genotypes_refalt(self): + base_gts = self._get_fake_genotypes() + # copy all of the fields + gts = GenotypesRefAlt(fname=None) + gts.data = base_gts.data + gts.samples = base_gts.samples + # add additional ref and alt alleles + ref_alt = np.array( + [ + ("T", "C"), + ("A", "G"), + ("C", "A"), + ("A", "G"), + ], + dtype=[ + ("ref", "U100"), + ("alt", "U100"), + ], + ) + # see https://stackoverflow.com/a/5356137 + gts.variants = rfn.merge_arrays((base_gts.variants, ref_alt), flatten=True) + return gts + + def test_load_genotypes(self, caplog): + expected = self._get_expected_genotypes() + + # can we load the data from the VCF? + gts = Genotypes(DATADIR.joinpath("simple.vcf")) + gts.read() + np.testing.assert_allclose(gts.data, expected) + assert gts.samples == ("HG00096", "HG00097", "HG00099", "HG00100", "HG00101") + + # try loading the data again - it should warn b/c we've already done it + caplog.clear() + gts.read() + assert len(caplog.records) > 0 and caplog.records[0].levelname == "WARNING" + + # force one of the samples to have a missing GT and check that we get an error + gts.data[1, 1, 1] = -1 + with pytest.raises(ValueError) as info: + gts.check_missing() + assert ( + str(info.value) + == "Genotype with ID 1:10116:A:G at POS 1:10116 is missing for sample" + " HG00097" + ) + gts.data[1, 1, 1] = 1 + + # force one of the SNPs to have more than one allele and check that we get an error + gts.data[1, 1, 1] = 2 + with pytest.raises(ValueError) as info: + gts.check_biallelic() + assert ( + str(info.value) + == "Variant with ID 1:10116:A:G at POS 1:10116 is multiallelic for sample" + " HG00097" + ) + gts.data[1, 1, 1] = 1 + + # check biallelic-ness and convert to bool_ gts.check_biallelic() - assert ( - str(info.value) - == "Variant with ID 1:10116:A:G at POS 1:10116 is multiallelic for sample" - " HG00097" - ) - gts.data[1, 1, 1] = 1 - - # check biallelic-ness and convert to bool_ - gts.check_biallelic() - expected = expected.astype(np.bool_) - np.testing.assert_allclose(gts.data, expected) - - # force one of the het SNPs to be unphased and check that we get an error message - gts.data[1, 1, 2] = 0 - with pytest.raises(ValueError) as info: + expected = expected.astype(np.bool_) + np.testing.assert_allclose(gts.data, expected) + + # force one of the het SNPs to be unphased and check that we get an error message + gts.data[1, 1, 2] = 0 + with pytest.raises(ValueError) as info: + gts.check_phase() + assert ( + str(info.value) + == "Variant with ID 1:10116:A:G at POS 1:10116 is unphased for sample" + " HG00097" + ) + gts.data[1, 1, 2] = 1 + + # check phase and remove the phase axis gts.check_phase() - assert ( - str(info.value) - == "Variant with ID 1:10116:A:G at POS 1:10116 is unphased for sample HG00097" - ) - gts.data[1, 1, 2] = 1 - - # check phase and remove the phase axis - gts.check_phase() - expected = expected[:, :, :2] - np.testing.assert_allclose(gts.data, expected) - - # try to check phase again - it should warn b/c we've already done it before - gts.check_phase() - assert len(caplog.records) == 2 and caplog.records[1].levelname == "WARNING" - - # convert the matrix of alt allele counts to a matrix of minor allele counts - assert gts.variants["aaf"][1] == 0.6 - gts.to_MAC() - expected[:, 1, :] = ~expected[:, 1, :] - np.testing.assert_allclose(gts.data, expected) - assert gts.variants["maf"][1] == 0.4 - - # try to do the MAC conversion again - it should warn b/c we've already done it - gts.to_MAC() - assert len(caplog.records) == 3 and caplog.records[2].levelname == "WARNING" - - -def test_load_genotypes_iterate(caplog): - expected = get_expected_genotypes().transpose((1, 0, 2)) - samples = ("HG00096", "HG00097", "HG00099", "HG00100", "HG00101") - - # can we load the data from the VCF? - gts = Genotypes(DATADIR.joinpath("simple.vcf")) - for idx, line in enumerate(gts.iterate()): - np.testing.assert_allclose(line.data, expected[idx]) - assert line.samples == samples - - -def test_load_genotypes_discard_multiallelic(): - expected = get_expected_genotypes() + expected = expected[:, :, :2] + np.testing.assert_allclose(gts.data, expected) - # can we load the data from the VCF? - gts = Genotypes(DATADIR.joinpath("simple.vcf")) - gts.read() - - # make a copy for later - data_copy = gts.data.copy().astype(np.bool_) - variant_shape = list(gts.variants.shape) - variant_shape[0] -= 1 - - # force one of the SNPs to have more than one allele and check that it gets dicarded - gts.data[1, 1, 1] = 2 - gts.check_biallelic(discard_also=True) - - data_copy_without_biallelic = np.delete(data_copy, [1], axis=1) - np.testing.assert_equal(gts.data, data_copy_without_biallelic) - assert gts.variants.shape == tuple(variant_shape) - - -def test_load_genotypes_subset(): - expected = get_expected_genotypes() - - # subset for the region we want - expected = expected[:, 1:3] - - # can we load the data from the VCF? - gts = Genotypes(DATADIR.joinpath("simple.vcf.gz")) - gts.read(region="1:10115-10117") - np.testing.assert_allclose(gts.data, expected) - assert gts.samples == ("HG00096", "HG00097", "HG00099", "HG00100", "HG00101") - - # subset for just the samples we want - expected = expected[[1, 3]] - - # can we load the data from the VCF? - gts = Genotypes(DATADIR.joinpath("simple.vcf.gz")) - samples = ["HG00097", "HG00100"] - gts.read(region="1:10115-10117", samples=samples) - np.testing.assert_allclose(gts.data, expected) - assert gts.samples == tuple(samples) + # try to check phase again - it should warn b/c we've already done it before + caplog.clear() + gts.check_phase() + assert len(caplog.records) > 0 and caplog.records[0].levelname == "WARNING" + + # convert the matrix of alt allele counts to a matrix of minor allele counts + assert gts.variants["aaf"][1] == 0.6 + gts.to_MAC() + expected[:, 1, :] = ~expected[:, 1, :] + np.testing.assert_allclose(gts.data, expected) + assert gts.variants["maf"][1] == 0.4 + + # try to do the MAC conversion again - it should warn b/c we've already done it + caplog.clear() + gts.to_MAC() + assert len(caplog.records) > 0 and caplog.records[0].levelname == "WARNING" + + def test_load_genotypes_iterate(self, caplog): + expected = self._get_expected_genotypes().transpose((1, 0, 2)) + samples = ("HG00096", "HG00097", "HG00099", "HG00100", "HG00101") + + # can we load the data from the VCF? + gts = Genotypes(DATADIR.joinpath("simple.vcf")) + for idx, line in enumerate(gts): + np.testing.assert_allclose(line.data, expected[idx]) + assert gts.samples == samples + + def test_load_genotypes_discard_multiallelic(self): + expected = self._get_expected_genotypes() + + # can we load the data from the VCF? + gts = Genotypes(DATADIR.joinpath("simple.vcf")) + gts.read() + + # make a copy for later + data_copy = gts.data.copy().astype(np.bool_) + variant_shape = list(gts.variants.shape) + variant_shape[0] -= 1 + + # force one of the SNPs to have more than one allele and check that it gets dicarded + gts.data[1, 1, 1] = 2 + gts.check_biallelic(discard_also=True) + + data_copy_without_biallelic = np.delete(data_copy, [1], axis=1) + np.testing.assert_equal(gts.data, data_copy_without_biallelic) + assert gts.variants.shape == tuple(variant_shape) + + def test_load_genotypes_subset(self): + expected = self._get_expected_genotypes() + + # subset for the region we want + expected = expected[:, 1:3] + + # can we load the data from the VCF? + gts = Genotypes(DATADIR.joinpath("simple.vcf.gz")) + gts.read(region="1:10115-10117") + np.testing.assert_allclose(gts.data, expected) + assert gts.samples == ("HG00096", "HG00097", "HG00099", "HG00100", "HG00101") + + # subset for just the samples we want + expected = expected[[1, 3]] + + gts = Genotypes(DATADIR.joinpath("simple.vcf.gz")) + samples = ["HG00097", "HG00100"] + gts.read(region="1:10115-10117", samples=samples) + np.testing.assert_allclose(gts.data, expected) + assert gts.samples == tuple(samples) + + # subset to just one of the variants + expected = expected[:, [1]] + + gts = Genotypes(DATADIR.joinpath("simple.vcf.gz")) + samples = ["HG00097", "HG00100"] + variants = {"1:10117:C:A"} + gts.read(region="1:10115-10117", samples=samples, variants=variants) + np.testing.assert_allclose(gts.data, expected) + assert gts.samples == tuple(samples) def test_load_phenotypes(caplog): @@ -159,7 +235,7 @@ def test_load_phenotypes_iterate(): # can we load the data from the phenotype file? phens = Phenotypes(DATADIR.joinpath("simple.tsv")) - for idx, line in enumerate(phens.iterate()): + for idx, line in enumerate(phens): np.testing.assert_allclose(line.data, expected[idx]) assert line.samples == samples[idx] @@ -202,14 +278,14 @@ def test_load_covariates_iterate(): # can we load the data from the covariates file? covars = Covariates(DATADIR.joinpath("covars.tsv")) - for idx, line in enumerate(covars.iterate()): + for idx, line in enumerate(covars): np.testing.assert_allclose(line.data, expected[idx]) assert line.samples == samples[idx] assert line.names == ("sex", "age") def test_load_covariates_subset(): - # create a covriate vector with shape: num_samples x num_covars + # create a covariate vector with shape: num_samples x num_covars expected = np.array([(0, 4), (1, 20), (1, 33), (0, 15), (0, 78)]) # subset for just the samples we want @@ -221,3 +297,206 @@ def test_load_covariates_subset(): covars.read(samples=samples) np.testing.assert_allclose(covars.data, expected) assert covars.samples == tuple(samples) + + +class TestHaplotypes: + def _basic_haps(self): + # what do we expect to see from the basic.hap file? + expected = { + "chr21.q.3365*1": Haplotype("21", 26928472, 26941960, "chr21.q.3365*1"), + "chr21.q.3365*10": Haplotype("21", 26938989, 26941960, "chr21.q.3365*10"), + "chr21.q.3365*11": Haplotype("21", 26938353, 26938989, "chr21.q.3365*11"), + } + expected["chr21.q.3365*1"].variants = ( + Variant(26928472, 26928472, "21_26928472_C_A", "C"), + Variant(26938353, 26938353, "21_26938353_T_C", "T"), + Variant(26940815, 26940815, "21_26940815_T_C", "C"), + Variant(26941960, 26941960, "21_26941960_A_G", "G"), + ) + expected["chr21.q.3365*10"].variants = ( + Variant(26938989, 26938989, "21_26938989_G_A", "A"), + Variant(26940815, 26940815, "21_26940815_T_C", "T"), + Variant(26941960, 26941960, "21_26941960_A_G", "A"), + ) + expected["chr21.q.3365*11"].variants = ( + Variant(26938353, 26938353, "21_26938353_T_C", "T"), + Variant(26938989, 26938989, "21_26938989_G_A", "A"), + ) + return expected + + def _get_dummy_haps(self): + # create three haplotypes + haplotypes = { + "H1": Haplotype(chrom="1", start=10114, end=8, id="H1"), + "H2": Haplotype(chrom="1", start=10114, end=10119, id="H2"), + "H3": Haplotype(chrom="1", start=10116, end=10119, id="H3"), + } + haplotypes["H1"].variants = ( + Variant(start=10114, end=10115, id="1:10114:T:C", allele="T"), + Variant(start=10116, end=10117, id="1:10116:A:G", allele="G"), + ) + haplotypes["H2"].variants = ( + Variant(start=10114, end=10115, id="1:10114:T:C", allele="C"), + Variant(start=10117, end=10118, id="1:10117:C:A", allele="C"), + ) + haplotypes["H3"].variants = ( + Variant(start=10116, end=10117, id="1:10116:A:G", allele="A"), + Variant(start=10117, end=10118, id="1:10117:C:A", allele="A"), + ) + haps = Haplotypes(fname=None) + haps.data = haplotypes + return haps + + def test_load(self): + # can we load this data from the hap file? + haps = Haplotypes.load(DATADIR.joinpath("basic.hap")) + assert self._basic_haps() == haps.data + + def test_read_subset(self): + expected = {} + expected["chr21.q.3365*1"] = self._basic_haps()["chr21.q.3365*1"] + + haps = Haplotypes(DATADIR.joinpath("basic.hap")) + # this should fail with an informative error b/c the file isn't indexed + with pytest.raises(OSError) as info: + haps.read(haplotypes={"chr21.q.3365*1"}) + assert len(str(info.value)) + + haps = Haplotypes(DATADIR.joinpath("basic.hap.gz")) + haps.read(haplotypes={"chr21.q.3365*1"}) + assert expected == haps.data + + haps = Haplotypes(DATADIR.joinpath("basic.hap.gz")) + haps.read(region="21:26928472-26941960", haplotypes={"chr21.q.3365*1"}) + assert expected == haps.data + + expected = self._basic_haps() + + haps = Haplotypes(DATADIR.joinpath("basic.hap.gz")) + haps.read(region="21:26928472-26941960") + assert expected == haps.data + + def test_read_extras(self): + # what do we expect to see from the simphenotype.hap file? + expected = { + "chr21.q.3365*1": HaptoolsHaplotype( + "21", 26928472, 26941960, "chr21.q.3365*1", "ASW", 0.73 + ), + "chr21.q.3365*10": HaptoolsHaplotype( + "21", 26938989, 26941960, "chr21.q.3365*10", "CEU", 0.30 + ), + "chr21.q.3365*11": HaptoolsHaplotype( + "21", 26938353, 26938989, "chr21.q.3365*11", "MXL", 0.49 + ), + } + for hap_id, hap in self._basic_haps().items(): + expected[hap_id].variants = hap.variants + + # can we load this data from the hap file? + haps = Haplotypes(DATADIR.joinpath("simphenotype.hap"), HaptoolsHaplotype) + haps.read() + assert expected == haps.data + + def test_read_extras_large(self): + """ + try reading a large-ish file + """ + haps = Haplotypes(DATADIR.joinpath("example.hap.gz"), HaptoolsHaplotype) + haps.read() + assert len(self._basic_haps().keys() & haps.data.keys()) + + def test_write(self): + haps = Haplotypes(DATADIR.joinpath("test.hap")) + haps.data = self._basic_haps() + haps.write() + + haps.data = None + haps.read() + assert self._basic_haps() == haps.data + + # remove the file + os.remove("tests/data/test.hap") + + def test_write_extras(self): + # what do we expect to see from the simphenotype.hap file? + expected = { + "chr21.q.3365*1": HaptoolsHaplotype( + "21", 26928472, 26941960, "chr21.q.3365*1", "ASW", 0.73 + ), + "chr21.q.3365*10": HaptoolsHaplotype( + "21", 26938989, 26941960, "chr21.q.3365*10", "CEU", 0.30 + ), + "chr21.q.3365*11": HaptoolsHaplotype( + "21", 26938353, 26938989, "chr21.q.3365*11", "MXL", 0.49 + ), + } + for hap_id, hap in self._basic_haps().items(): + expected[hap_id].variants = hap.variants + + haps = Haplotypes(DATADIR.joinpath("test.hap"), HaptoolsHaplotype) + haps.data = expected + haps.write() + + haps.data = None + haps.read() + assert expected == haps.data + + # remove the file + os.remove("tests/data/test.hap") + + def test_hap_transform(self): + expected = np.array( + [ + [0, 1], + [0, 1], + [1, 1], + [1, 1], + [0, 0], + ], + dtype=np.uint8, + ) + + hap = list(self._get_dummy_haps().data.values())[0] + gens = TestGenotypes()._get_fake_genotypes_refalt() + hap_gt = hap.transform(gens) + np.testing.assert_allclose(hap_gt, expected) + + def test_haps_transform(self): + expected = np.array( + [ + [[0, 1], [0, 0], [0, 0]], + [[0, 1], [0, 0], [1, 0]], + [[1, 0], [0, 1], [0, 0]], + [[1, 1], [0, 0], [0, 0]], + [[0, 0], [0, 1], [1, 0]], + ], + dtype=np.uint8, + ) + + haps = self._get_dummy_haps() + gens = TestGenotypes()._get_fake_genotypes_refalt() + gens.data[[2, 4], 0, 1] = 1 + gens.data[[1, 4], 2, 0] = 1 + hap_gt = GenotypesRefAlt(fname=None) + haps.transform(gens, hap_gt) + np.testing.assert_allclose(hap_gt.data, expected) + return hap_gt + + def test_hap_gt_write(self): + fname = DATADIR.joinpath("simple_haps.vcf") + + hap_gt = self.test_haps_transform() + hap_gt.fname = fname + expected_data = hap_gt.data + expected_samples = hap_gt.samples + hap_gt.write() + + hap_gt.data = None + hap_gt.read() + hap_gt.check_phase() + np.testing.assert_allclose(hap_gt.data, expected_data) + assert hap_gt.samples == expected_samples + assert len(hap_gt.variants) == hap_gt.data.shape[1] + + # remove the file + os.remove(str(fname))