Skip to content

Commit

Permalink
v0.4.0: support for BAMs with multiple references
Browse files Browse the repository at this point in the history
  • Loading branch information
bede committed Aug 13, 2019
1 parent fc1ce7e commit edf9166
Show file tree
Hide file tree
Showing 16 changed files with 23,950 additions and 26 deletions.
26 changes: 16 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
[![PyPI version](https://badge.fury.io/py/kindel.svg)](https://badge.fury.io/py/kindel)
[![Build Status](https://travis-ci.org/bede/kindel.svg?branch=master)](https://travis-ci.org/bede/kindel)

A consensus caller which accepts a headed SAM/BAM input and generates a majority consensus, reconciling small indels described in CIGAR fields so as to maximise read-reference concordance. Kindel also permits recovery of consensus sequence across highly divergent regions (such as those encoding viral envelope proteins) where regions of reads cannot be aligned. With **`--realign`**, Kindel identifies regions of the reference that are clip-dominant (>depth\*0.5) and attempts to assemble a patched consensus using the unaligned sequence context. Existing consensus calling approaches are complicated and often involve a variant calling step. An [elegant streaming approach](https://github.com/karel-brinda/ococo) was recently released but cannot reconcile indels.
A consensus caller that generates majority consensus sequence(s) from a BAM file, reconciling small CIGAR-described indels so as to maximise read-to-reference concordance. Kindel also permits recovery of consensus sequence across highly divergent regions (such as those encoding viral envelope proteins) where regions of reads cannot be aligned. With **`--realign`**, Kindel identifies regions of the reference that are clip-dominant (>depth\*0.5) and attempts to assemble a patched consensus using the unaligned sequence context. Existing consensus calling approaches are complicated and often involve a variant calling step. An [elegant streaming approach](https://github.com/karel-brinda/ococo) was recently released but cannot reconcile indels. Update: [Pilon](https://github.com/broadinstitute/pilon) now exists.



Expand All @@ -26,20 +26,26 @@ A consensus caller which accepts a headed SAM/BAM input and generates a majority


## Features
- [x] Reconciliation of CIGAR described insertions and deletions
- [x] Reconciliation of aligned substititutions, insertions and deletions
- [x] Gap closure (`--realign`) using overlapping soft-clipped alignment context
- [x] Support SAMs from multiple aligners – (currently tested BWA MEM, Segemehl)
- [ ] Accept unmapped fastq input
- [ ] Minimap2 support
- [x] Tested against BAM output from BWA, Minimap2 and Segemehl
- [ ] Support for BAMs with multiple reference contigs, chromosomes
- [x] Frequency based variant calling with `kindel variants` (no VCF output currently)
- [x] Plotting of clip frequencies
- [ ]
- [ ] Customisable threshold weight
- [ ] Support for mutiple reference sequences (untested)
- [ ] C++ version (early stages)
- [x] Plotting



### Todo

- [ ] Fix broken SAM parsing (BAM works fine)

- [ ] Customisable threshold weight

- [ ] Numpy rewrite to handle bacterial long read sequences

- [ ] Parallelism



## Installation

Expand Down
2 changes: 1 addition & 1 deletion kindel/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '0.3.0'
__version__ = '0.4.0'
5 changes: 3 additions & 2 deletions kindel/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import argh
import pandas as pd

from pprint import pprint

from Bio import SeqIO

from kindel import kindel
Expand All @@ -24,8 +26,7 @@ def consensus(bam_path: 'path to SAM/BAM file',
mask_ends,
trim_ends,
uppercase)
print(result.report, file=sys.stderr)
# print(result.consensuses[1].seq, file=sys.stdout)
print('\n'.join([r for r in result.refs_reports.values()]), file=sys.stderr)
SeqIO.write(result.consensuses, sys.stdout,'fasta')


Expand Down
27 changes: 16 additions & 11 deletions kindel/kindel.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,9 +119,9 @@ def parse_bam(bam_path):
if '*' in refs_records:
del refs_records['*']

assert len(refs_records) <= 1, 'Detected primary mappings to more than one reference'
# assert len(refs_records) <= 1, 'Detected primary mappings to more than one reference'
# Use samtools view to extract single contig primary mappings
# Otherwise would make a useful enhancement
# Otherwise would make a useful enhancement

for ref_id, records in refs_records.items():
alignments[ref_id] = parse_records(ref_id, refs_lens[ref_id], records)
Expand Down Expand Up @@ -315,7 +315,9 @@ def consensus_sequence(weights, clip_start_weights, clip_end_weights, insertions
consensus_seq = ''
changes = [None] * len(weights)
skip_positions = 0
for pos, weight in tqdm.tqdm(enumerate(weights), total=len(weights), desc='building consensus'):
for pos, weight in tqdm.tqdm(enumerate(weights),
total=len(weights),
desc='building consensus'):
if skip_positions:
skip_positions -= 1
continue
Expand Down Expand Up @@ -357,7 +359,7 @@ def consensus_seqrecord(consensus, ref_id):
return SeqRecord(Seq(consensus), id=ref_id + '_cns', description='')


def build_report(weights, changes, cdr_patches, bam_path, realign, min_depth, min_overlap,
def build_report(ref_id, weights, changes, cdr_patches, bam_path, realign, min_depth, min_overlap,
clip_decay_threshold, trim_ends, uppercase):
aligned_depth = [sum({nt:w[nt] for nt in list('ACGT')}.values()) for w in weights]
ambiguous_sites = []
Expand All @@ -372,6 +374,7 @@ def build_report(weights, changes, cdr_patches, bam_path, realign, min_depth, mi
elif change == 'D':
deletion_sites.append(str(pos))
report = '========================= REPORT ===========================\n'
report += 'reference: {}\n'.format(ref_id)
report += 'options:\n'
report += '- bam_path: {}\n'.format(bam_path)
report += '- realign: {}\n'.format(realign)
Expand All @@ -387,15 +390,16 @@ def build_report(weights, changes, cdr_patches, bam_path, realign, min_depth, mi
report += '- insertion sites: {}\n'.format(', '.join(insertion_sites))
report += '- deletion sites: {}\n'.format(', '.join(deletion_sites))
report += '- clip-dominant regions: {}\n'.format(', '.join(cdr_patches_fmt))
report += '============================================================\n'

return report


def bam_to_consensus(bam_path, realign=False, min_depth=2, min_overlap=7,
clip_decay_threshold=0.1, mask_ends=10, trim_ends=False, uppercase=False):
refs_consensuses = []
consensuses = []
refs_changes = {}
refs_reports = {}
# for i, (ref_id, aln) in enumerate(parse_bam(bam_path).items()):
for ref_id, aln in parse_bam(bam_path).items():
if realign:
cdrps = cdrp_consensuses(aln.weights, aln.clip_start_weights, aln.clip_end_weights,
Expand All @@ -410,14 +414,15 @@ def bam_to_consensus(bam_path, realign=False, min_depth=2, min_overlap=7,
aln.clip_end_weights, aln.insertions,
aln.deletions, cdr_patches, trim_ends,
min_depth, uppercase)
report = build_report(aln.weights, changes, cdr_patches, bam_path, realign,
report = build_report(ref_id, aln.weights, changes, cdr_patches, bam_path, realign,
min_depth, min_overlap, clip_decay_threshold, trim_ends,
uppercase)
refs_consensuses.append(consensus_seqrecord(consensus, ref_id))
refs_changes[ref_id] = changes
result = namedtuple('result', ['consensuses', 'refs_changes', 'report'])
consensuses.append(consensus_seqrecord(consensus, ref_id))
refs_reports[ref_id] = report
refs_changes[ref_id] = changes
result = namedtuple('result', ['consensuses', 'refs_changes', 'refs_reports'])

return result(refs_consensuses, refs_changes, report)
return result(consensuses, refs_changes, refs_reports)


def weights(bam_path: 'path to SAM/BAM file',
Expand Down
Loading

0 comments on commit edf9166

Please sign in to comment.