Skip to content

Commit

Permalink
N's are now replaceed with proper reference base (#13).
Browse files Browse the repository at this point in the history
mpileup doesn't output all data, so we had to add a fudge to insert ./.
record in the missing positions. Before we added N as reference base,
however, if reference is provided then appropriate reference base is
added.
  • Loading branch information
Aleksey Jironkin authored and Aleksey Jironkin committed Jun 9, 2016
1 parent 7206951 commit 3f6e702
Show file tree
Hide file tree
Showing 3 changed files with 88 additions and 16 deletions.
96 changes: 83 additions & 13 deletions phe/variant/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
import operator
import pickle

from Bio import SeqIO
from vcf import filters
import vcf
from vcf.model import make_calldata_tuple
Expand Down Expand Up @@ -66,7 +67,7 @@ class VariantSet(object):

_reader = None

def __init__(self, vcf_in, filters=None):
def __init__(self, vcf_in, filters=None, reference=None):
"""Constructor of variant set.
Parameters
Expand Down Expand Up @@ -101,6 +102,8 @@ def __init__(self, vcf_in, filters=None):

self._variants = []

self._read_reference(reference)

def __iter__(self):
'''Iterator over **all** variants'''
return self.variants(only_good=False)
Expand All @@ -120,7 +123,7 @@ def variants(self, only_good=False):
elif not var.FILTER :
yield var

def filter_variants(self, keep_only_snps=True, out_vcf=None):
def filter_variants(self, keep_only_snps=False, out_vcf=None, only_good=False):
"""Filter the VCF records.
Parameters
Expand All @@ -130,6 +133,8 @@ def filter_variants(self, keep_only_snps=True, out_vcf=None):
out_vcf: str, optional
If specified, filtered record will be written to *out_vcf*
instead of being retianed in memory.
only_good: bool, optional
True/False if only SNPs that PASS should output.
Returns
-------
Expand Down Expand Up @@ -195,7 +200,10 @@ def filter_variants(self, keep_only_snps=True, out_vcf=None):
else:
# This is a padding "N" record when records do not follow each other,
# and there is a gap. e,g, 1,2,3,5,6 -> in 4 "N" will be inserted.
_record = vcf.model._Record(record.CHROM, _pos, ".", "N", [None], 0, [], {}, 'GT', None)

_ref = self._get_reference_base(record.CHROM, _pos)

_record = vcf.model._Record(record.CHROM, _pos, ".", _ref, [None], 0, [], {}, 'GT', None)
_calls = []
sorted_samples = sorted(record._sample_indexes.items(), key=operator.itemgetter(1))
for sample, i in sorted_samples:
Expand All @@ -215,16 +223,23 @@ def filter_variants(self, keep_only_snps=True, out_vcf=None):
# After applying all filters, check if FILTER is None.
# If it is, then record PASSED all filters.
if _record.FILTER is None or _record.FILTER == []:
_record.FILTER = []
# FIXME: Does this work for indels?
if keep_only_snps and _record.is_snp:
if _writer is not None:
_writer.write_record(_record)
records += 1
else:
self._variants.append(_record)
elif _writer is None:
self._variants.append(_record)
if not record.is_monomorphic:
_record.FILTER = []

if not keep_only_snps or (_record.is_snp and keep_only_snps):

if _writer is not None:
_writer.write_record(_record)
records += 1
else:
self._variants.append(_record)

elif not only_good:
if _writer is not None:
_writer.write_record(_record)
records += 1
else:
self._variants.append(_record)

_pos += 1
if _chrom is None:
Expand Down Expand Up @@ -263,6 +278,61 @@ def _filter_record(self, record, removed_filters=list()):
# If we got this far, then record is filtered OUT.
record.add_filter(record_filter.filter_name())

def _get_reference_base(self, chrom, pos):
"""Get the reference base at chromosome *chrom* and position
*pos*. If it is not found, **N** is returned.
Parameters
----------
reference: dict
Dictionary of chromosome to sequence map.
chrom: str
Chromosome to access.
pos: int
Position to retrieve.
Returns
-------
_ref: str
Reference base to use.
"""

if self._reference is None:
_ref = "N"
else:
try:
_chrom = self._reference.get(chrom, [])

_ref = _chrom[pos]
except ValueError:
logging.error("Could not retrieve reference base for %s @ %s", chrom, pos)
_ref = "N"

return _ref

def _read_reference(self, reference):
"""Read in the reference from the file into a dictionary.
Parameters
----------
reference: str
Path to the reference.
Returns
-------
reference: dict
Dictionary with chromosome - sequence mapping.
"""

if reference is None:
self._reference = None
else:
self._reference = {}

with open(reference) as reference_fp:
for record in SeqIO.parse(reference_fp, "fasta"):
self._reference[record.id] = list(record.seq)

def add_metadata(self, info):
"""Add metadata to the variant set.
Expand Down
6 changes: 4 additions & 2 deletions scripts/filter_vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ def get_args():

args.add_argument("--output", "-o", required=True, help="Location for filtered VCF to be written.")

args.add_argument("--reference", "-r", help="mpileup version <= 1.3 do not output all positions. This is required to fix rfrence base in VCF.")

args.add_argument("--only-good", action="store_true", default=False, help="Write only variants that PASS all filters (default all variants are written).")

return args
Expand All @@ -62,12 +64,12 @@ def main(args):
logging.error("Either --config or --filters needs to be specified.")
return 1

var_set = VariantSet(args["vcf"], filters=args["filters"])
var_set = VariantSet(args["vcf"], filters=args["filters"], reference=args["reference"])

if args.get("version") is not None:
var_set.add_metadata(OrderedDict({"PHEnix-Version": (args["version"],)}))

var_set.filter_variants(out_vcf=args["output"])
var_set.filter_variants(out_vcf=args["output"], only_good=args["only_good"])

logging.info("Finished filtering")
return 0
Expand Down
2 changes: 1 addition & 1 deletion scripts/run_snp_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,7 @@ def main(args):

if args["filters"] and vcf_file:
logging.info("Applying filters: %s", [str(f) for f in args["filters"]])
var_set = VariantSet(vcf_file, filters=args["filters"])
var_set = VariantSet(vcf_file, filters=args["filters"], reference=args["reference"])

var_set.add_metadata(mapper.get_meta())
var_set.add_metadata(variant.get_meta())
Expand Down

0 comments on commit 3f6e702

Please sign in to comment.