Skip to content

Commit

Permalink
Fixes to infer and discover.
Browse files Browse the repository at this point in the history
Infer: now output single most likely call in het calls first in GT.

Discover: build 'regions' from inferred vcf using the GT flag.
Previously was taking the first ALT, systematically- this made incorrect
vcf combining in `build`.

Modified test_discover.py accordingly.
  • Loading branch information
bricoletc committed Mar 25, 2019
1 parent 6a1a1db commit 8b46a86
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 11 deletions.
25 changes: 19 additions & 6 deletions gramtools/commands/discover.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,13 +177,26 @@ def _flag_personalised_reference_regions(base_records, secondary_reference_lengt
inf_ref_pos += non_var_region.length

# Build variant region
var_region = _Region(base_POS = base_ref_pos, inf_POS = inf_ref_pos,
length = len(vcf_record.ALT[0]), vcf_record_REF = vcf_record.REF,
vcf_record_ALT = str(vcf_record.ALT[0]))
all_personalised_ref_regions.append(var_region)
# Note the following 'guarantee' from `infer`: the first element of GT is the single most likely haploid genotype call.
# We need to take that, as that is what gets used to produce the inferred reference fasta- which is the reference used for variant calling.
picked_alleles = vcf_record.samples[0].gt_alleles

if set(picked_alleles) == {None}:
picked_allele = 0 # GT of './.'
else:
picked_allele = int(picked_alleles[0]) # Get the first one. Can be 0 (REF) !

# Only make a var region if the inference procedure did not pick the REF.
if picked_allele != 0:
var_region = _Region(base_POS = base_ref_pos, inf_POS = inf_ref_pos,
length = len(vcf_record.ALT[picked_allele - 1]), vcf_record_REF = vcf_record.REF,
vcf_record_ALT = str(vcf_record.ALT[picked_allele - 1]))

all_personalised_ref_regions.append(var_region)
base_ref_pos += len(vcf_record.REF)
inf_ref_pos += var_region.length


base_ref_pos += len(vcf_record.REF)
inf_ref_pos += var_region.length

# End game: deal with the last non-var region if there is one.
# We test `inf_ref_pos` because we have the inferred reference length (= secondary reference)
Expand Down
22 changes: 18 additions & 4 deletions gramtools/commands/infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,8 @@ def __init__(self,CallData, FORMAT, sample_name):
## Computes and records depth, genotype, coverage, genotype confidence.
# Also removes non-zero coverage alleles if site has been genotyped.
def update_vcf_record(self, new_record, gtyper):
most_likely_single_allele = _extract_allele_index(gtyper)

if '.' in gtyper.genotype: #Case: we had no coverage anywhere in the variant site. We will keep all REF and ALTs in the vcf output.
depth, gt_conf = '0', '0.0'
cov_string = ','.join(['0' for x in range(1 + len(new_record.ALT))])
Expand Down Expand Up @@ -256,13 +258,18 @@ def update_vcf_record(self, new_record, gtyper):
genotype_index = genotype_indexes[0]
genotype = str(genotype_index) + '/' + str(genotype_index)
else:
genotype = '/'.join([str(x) for x in genotype_indexes])
# Make sure we output the **most likely single** allele first
other_genotype_indexes = gtyper.genotype.copy()
other_genotype_indexes.discard(most_likely_single_allele)

ordered_genotyped_indexes = [most_likely_single_allele + sorted(list(other_genotype_indexes))]
genotype = '/'.join([str(x) for x in ordered_genotyped_indexes])

sample = vcf.model._Call(site = new_record, sample = self.sample_name, data=self.CallData(GT=genotype, DP=depth, COV=cov_string,GT_CONF=gt_conf))
new_record.samples = [sample] # Removes pre-existing genotype calls if there were any.
new_record.FORMAT = self.FORMAT

self.allele_indexes.append(_extract_allele_index(gtyper))
self.allele_indexes.append(most_likely_single_allele)


class _PopulationUpdater(_VcfUpdater):
Expand All @@ -272,6 +279,8 @@ def __init__(self,CallData, FORMAT, sample_name):
## Computes and records depth, genotype, coverage, genotype confidence.
# Keeps all alleles at all sites even if they have no coverage.
def update_vcf_record(self, new_record, gtyper):
most_likely_single_allele = _extract_allele_index(gtyper)

if '.' in gtyper.genotype: #Case: we had no coverage anywhere in the variant site.
depth, gt_conf = '0', '0.0'
cov_string = ','.join(['0' for _ in range(1 + len(new_record.ALT))])
Expand All @@ -290,13 +299,18 @@ def update_vcf_record(self, new_record, gtyper):
genotype_index = genotype_indexes[0]
genotype = str(genotype_index) + '/' + str(genotype_index)
else:
genotype = '/'.join([str(x) for x in genotype_indexes])
# Make sure we output the **most likely single** allele first
other_genotype_indexes = gtyper.genotype.copy()
other_genotype_indexes.discard(most_likely_single_allele)

ordered_genotyped_indexes = [most_likely_single_allele + sorted(list(other_genotype_indexes))]
genotype = '/'.join([str(x) for x in ordered_genotyped_indexes])

sample = vcf.model._Call(site = new_record, sample = self.sample_name, data=self.CallData(GT=genotype, DP=depth, COV=cov_string,GT_CONF=gt_conf))
new_record.samples = [sample] # Removes pre-existing genotype calls if there were any.
new_record.FORMAT = self.FORMAT

self.allele_indexes.append(_extract_allele_index(gtyper))
self.allele_indexes.append(most_likely_single_allele)


## Generate max. likelihood call for each allele.
Expand Down
7 changes: 6 additions & 1 deletion gramtools/tests/commands/test_discover.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import unittest
import vcf
import os
import collections

from ...commands import discover
from ... import prg_local_parser
Expand Down Expand Up @@ -337,7 +338,11 @@ def __init__(self, POS, REF, ALT, samples=[]):
self.POS = POS
self.REF = REF
self.ALT = [vcf.model._Substitution(x) for x in ALT]
self.samples = samples

if len(samples) == 0:
self.samples = [_Sample(["1","1"])] # Default to recording a ALT call for a single sample.
else:
self.samples = samples

def __repr__(self):
return str(self.__dict__)
Expand Down

0 comments on commit 8b46a86

Please sign in to comment.