CRAM reference support #555

Merged: 9 commits, Dec 8, 2020
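
In short, this PR threads an optional `fasta` argument (exposed as `-f/--fasta` on the CLI) through CNVkit's alignment-file handling so pysam can decode CRAM inputs, which store read sequences relative to a reference genome; plain BAM inputs are unaffected by the default `fasta=None`. A minimal sketch of the pysam behavior this relies on, with placeholder file paths:

```python
import pysam

# CRAM stores reads by reference offset, so pysam needs the FASTA
# to reconstruct sequences; for BAM this argument is simply unused.
aln = pysam.AlignmentFile("sample.cram", "rc", reference_filename="hg19.fa")
for read in aln.head(5):  # peek at the first few alignments
    print(read.query_name, read.query_length)
aln.close()
```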
cnvlib/autobin.py (9 additions, 9 deletions)
@@ -23,7 +23,7 @@ def midsize_file(fnames):

def do_autobin(bam_fname, method, targets=None, access=None,
bp_per_bin=100000., target_min_size=20, target_max_size=50000,
antitarget_min_size=500, antitarget_max_size=1000000):
antitarget_min_size=500, antitarget_max_size=1000000, fasta=None):
"""Quickly calculate reasonable bin sizes from BAM read counts.

Parameters
@@ -70,8 +70,8 @@ def depth2binsize(depth, min_size, max_size):
return bin_size

samutil.ensure_bam_index(bam_fname)
rc_table = samutil.idxstats(bam_fname, drop_unmapped=True)
read_len = samutil.get_read_length(bam_fname)
rc_table = samutil.idxstats(bam_fname, drop_unmapped=True, fasta=fasta)
read_len = samutil.get_read_length(bam_fname, fasta=fasta)
logging.info("Estimated read length %s", read_len)

# Dispatch
@@ -80,11 +80,11 @@ def depth2binsize(depth, min_size, max_size):
# rc_table = update_chrom_length(rc_table, targets)
# tgt_depth = average_depth(rc_table, read_len)
# By sampling
tgt_depth = sample_region_cov(bam_fname, targets)
tgt_depth = sample_region_cov(bam_fname, targets, fasta=fasta)
anti_depth = None
elif method == 'hybrid':
tgt_depth, anti_depth = hybrid(rc_table, read_len, bam_fname, targets,
access)
access, fasta)
elif method == 'wgs':
if access is not None and len(access):
rc_table = update_chrom_length(rc_table, access)
@@ -99,7 +99,7 @@ def depth2binsize(depth, min_size, max_size):
(anti_depth, anti_bin_size))


def hybrid(rc_table, read_len, bam_fname, targets, access=None):
def hybrid(rc_table, read_len, bam_fname, targets, access=None, fasta=None):
"""Hybrid capture sequencing."""
# Identify off-target regions
if access is None:
@@ -111,7 +111,7 @@ def hybrid(rc_table, read_len, bam_fname, targets, access=None):
rc_table, targets, antitargets = shared_chroms(rc_table, targets,
antitargets)
# Deal with targets
target_depth = sample_region_cov(bam_fname, targets)
target_depth = sample_region_cov(bam_fname, targets, fasta=fasta)
# Antitargets: subtract captured reads from total
target_length = region_size_by_chrom(targets)['length']
target_reads = (target_length * target_depth / read_len).values
@@ -142,13 +142,13 @@ def idxstats2ga(table, bam_fname):
meta_dict={'filename': bam_fname})


def sample_region_cov(bam_fname, regions, max_num=100):
def sample_region_cov(bam_fname, regions, max_num=100, fasta=None):
"""Calculate read depth in a randomly sampled subset of regions."""
midsize_regions = sample_midsize_regions(regions, max_num)
with tempfile.NamedTemporaryFile(suffix='.bed', mode='w+t') as f:
tabio.write(regions.as_dataframe(midsize_regions), f, 'bed4')
f.flush()
table = coverage.bedcov(f.name, bam_fname, 0)
table = coverage.bedcov(f.name, bam_fname, 0, fasta)
# Mean read depth across all sampled regions
return table.basecount.sum() / (table.end - table.start).sum()

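With `fasta` threaded through `do_autobin`, bin-size estimation can sample a CRAM directly. A usage sketch under assumed placeholder paths (not taken from the PR):

```python
from cnvlib import autobin
from skgenome import tabio

targets = tabio.read_auto("baits.bed")
# fasta is forwarded to idxstats, read-length sampling, and bedcov.
((tgt_depth, tgt_bin), (anti_depth, anti_bin)) = autobin.do_autobin(
    "sample.cram", "hybrid", targets=targets, fasta="hg19.fa")
print(tgt_bin, anti_bin)
```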
cnvlib/batch.py (8 additions, 8 deletions)
@@ -70,7 +70,7 @@ def batch_make_reference(normal_bams, target_bed, antitarget_bed,
# Choose median-size normal bam or tumor bam
bam_fname = autobin.midsize_file(normal_bams)
(wgs_depth, target_avg_size), _ = autobin.do_autobin(
bam_fname, *autobin_args, bp_per_bin=50000.)
bam_fname, *autobin_args, bp_per_bin=50000., fasta=fasta)
logging.info("WGS average depth %.2f --> using bin size %d",
wgs_depth, target_avg_size)
else:
@@ -129,12 +129,12 @@ def batch_make_reference(normal_bams, target_bed, antitarget_bed,
pool.submit(batch_write_coverage,
target_bed, nbam,
sample_pfx + '.targetcoverage.cnn',
by_count, procs_per_cnn))
by_count, procs_per_cnn, fasta))
anti_futures.append(
pool.submit(batch_write_coverage,
antitarget_bed, nbam,
sample_pfx + '.antitargetcoverage.cnn',
by_count, procs_per_cnn))
by_count, procs_per_cnn, fasta))

target_fnames = [tf.result() for tf in tgt_futures]
antitarget_fnames = [af.result() for af in anti_futures]
@@ -152,29 +152,29 @@ def batch_make_reference(normal_bams, target_bed, antitarget_bed,
return output_reference, target_bed, antitarget_bed


def batch_write_coverage(bed_fname, bam_fname, out_fname, by_count, processes):
def batch_write_coverage(bed_fname, bam_fname, out_fname, by_count, processes, fasta):
"""Run coverage on one sample, write to file."""
cnarr = coverage.do_coverage(bed_fname, bam_fname, by_count, 0, processes)
cnarr = coverage.do_coverage(bed_fname, bam_fname, by_count, 0, processes, fasta)
tabio.write(cnarr, out_fname)
return out_fname


def batch_run_sample(bam_fname, target_bed, antitarget_bed, ref_fname,
output_dir, male_reference, plot_scatter, plot_diagram,
rscript_path, by_count, skip_low, seq_method,
segment_method, processes, do_cluster):
segment_method, processes, do_cluster, fasta=None):
"""Run the pipeline on one BAM file."""
# ENH - return probes, segments (cnarr, segarr)
logging.info("Running the CNVkit pipeline on %s ...", bam_fname)
sample_id = core.fbase(bam_fname)
sample_pfx = os.path.join(output_dir, sample_id)

raw_tgt = coverage.do_coverage(target_bed, bam_fname, by_count, 0,
processes)
processes, fasta)
tabio.write(raw_tgt, sample_pfx + '.targetcoverage.cnn')

raw_anti = coverage.do_coverage(antitarget_bed, bam_fname, by_count, 0,
processes)
processes, fasta)
tabio.write(raw_anti, sample_pfx + '.antitargetcoverage.cnn')

cnarr = fix.do_fix(raw_tgt, raw_anti, read_cna(ref_fname),
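Note that `batch_write_coverage` gained `fasta` as a required positional parameter, so callers must now supply it (None for BAM inputs). A small sketch with illustrative file names:

```python
from cnvlib import batch

# Pass None for BAM inputs; for CRAM, pass the aligned reference FASTA.
out_fname = batch.batch_write_coverage(
    "baits.target.bed", "sample.cram", "sample.targetcoverage.cnn",
    by_count=False, processes=1, fasta="hg19.fa")
```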
cnvlib/commands.py (7 additions, 3 deletions)
@@ -140,7 +140,7 @@ def _cmd_batch(args):
args.output_dir, args.male_reference, args.scatter,
args.diagram, args.rscript_path, args.count_reads,
args.drop_low_coverage, args.seq_method, args.segment_method, procs_per_bam,
args.cluster)
args.cluster, args.fasta)
else:
logging.info("No tumor/test samples (but %d normal/control samples) "
"specified on the command line.",
@@ -370,7 +370,7 @@ def read_regions(bed_fname):
fields = autobin.do_autobin(bam_fname, args.method, tgt_arr, access_arr,
args.bp_per_bin, args.target_min_size,
args.target_max_size, args.antitarget_min_size,
args.antitarget_max_size)
args.antitarget_max_size, args.fasta)
(_tgt_depth, tgt_bin_size), (_anti_depth, anti_bin_size) = fields

# Create & write BED files
@@ -408,6 +408,8 @@ def read_regions(bed_fname):
P_autobin = AP_subparsers.add_parser('autobin', help=_cmd_autobin.__doc__)
P_autobin.add_argument('bams', nargs='+',
help="""Sample BAM file(s) to test for target coverage""")
P_autobin.add_argument('-f', '--fasta', metavar="FILENAME",
help="Reference genome, FASTA format (e.g. UCSC hg19.fa)")
P_autobin.add_argument('-m', '--method',
choices=('hybrid', 'amplicon', 'wgs'), default='hybrid',
help="""Sequencing protocol: hybridization capture ('hybrid'), targeted
@@ -458,7 +460,7 @@ def read_regions(bed_fname):
def _cmd_coverage(args):
"""Calculate coverage in the given regions from BAM read depths."""
pset = coverage.do_coverage(args.interval, args.bam_file, args.count,
args.min_mapq, args.processes)
args.min_mapq, args.processes, args.fasta)
if not args.output:
# Create an informative but unique name for the coverage output file
bambase = core.fbase(args.bam_file)
@@ -476,6 +478,8 @@ def read_regions(bed_fname):
P_coverage = AP_subparsers.add_parser('coverage', help=_cmd_coverage.__doc__)
P_coverage.add_argument('bam_file', help="Mapped sequence reads (.bam)")
P_coverage.add_argument('interval', help="Intervals (.bed or .list)")
P_coverage.add_argument('-f', '--fasta', metavar="FILENAME",
help="Reference genome, FASTA format (e.g. UCSC hg19.fa)")
P_coverage.add_argument('-c', '--count', action='store_true',
help="""Get read depths by counting read midpoints within each bin.
(An alternative algorithm).""")
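Both subcommands now accept the flag on the command line, e.g. `cnvkit.py coverage sample.cram baits.bed -f hg19.fa`, and the same option is reachable through the Python API. A sketch with placeholder paths:

```python
import cnvlib

# fasta is only needed to decode CRAM; BAM works with the default None.
cnarr = cnvlib.do_coverage("baits.bed", "sample.cram", fasta="hg19.fa")
print(len(cnarr), "bins; mean depth:", cnarr['depth'].mean())
```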
cnvlib/coverage.py (19 additions, 17 deletions)
@@ -17,19 +17,19 @@
from .params import NULL_LOG2_COVERAGE


def do_coverage(bed_fname, bam_fname, by_count=False, min_mapq=0, processes=1):
def do_coverage(bed_fname, bam_fname, by_count=False, min_mapq=0, processes=1, fasta=None):
"""Calculate coverage in the given regions from BAM read depths."""
if not samutil.ensure_bam_sorted(bam_fname):
if not samutil.ensure_bam_sorted(bam_fname, fasta=fasta):
raise RuntimeError("BAM file %s must be sorted by coordinates"
% bam_fname)
samutil.ensure_bam_index(bam_fname)
# ENH: count importers.TOO_MANY_NO_COVERAGE & warn
cnarr = interval_coverages(bed_fname, bam_fname, by_count, min_mapq,
processes)
processes, fasta)
return cnarr


def interval_coverages(bed_fname, bam_fname, by_count, min_mapq, processes):
def interval_coverages(bed_fname, bam_fname, by_count, min_mapq, processes, fasta=None):
"""Calculate log2 coverages in the BAM file at each interval."""
meta = {'sample_id': core.fbase(bam_fname)}
start_time = time.time()
@@ -47,16 +47,16 @@ def interval_coverages(bed_fname, bam_fname, by_count, min_mapq, processes):
# Calculate average read depth in each bin
if by_count:
results = interval_coverages_count(bed_fname, bam_fname, min_mapq,
processes)
processes, fasta)
read_counts, cna_rows = zip(*results)
read_counts = pd.Series(read_counts)
cnarr = CNA.from_rows(list(cna_rows),
columns=CNA._required_columns + ('depth',),
meta_dict=meta)
else:
table = interval_coverages_pileup(bed_fname, bam_fname, min_mapq,
processes)
read_len = samutil.get_read_length(bam_fname)
processes, fasta)
read_len = samutil.get_read_length(bam_fname, fasta=fasta)
read_counts = table['basecount'] / read_len
table = table.drop('basecount', axis=1)
cnarr = CNA(table, meta)
@@ -75,7 +75,7 @@ def interval_coverages(bed_fname, bam_fname, by_count, min_mapq, processes):
(tot_reads / len(read_counts)),
read_counts.min(),
read_counts.max())
tot_mapped_reads = samutil.bam_total_reads(bam_fname)
tot_mapped_reads = samutil.bam_total_reads(bam_fname, fasta=fasta)
if tot_mapped_reads:
logging.info("Percent reads in regions: %.3f (of %d mapped)",
100. * tot_reads / tot_mapped_reads,
@@ -86,19 +86,19 @@ def interval_coverages(bed_fname, bam_fname, by_count, min_mapq, processes):
return cnarr


def interval_coverages_count(bed_fname, bam_fname, min_mapq, procs=1):
def interval_coverages_count(bed_fname, bam_fname, min_mapq, procs=1, fasta=None):
"""Calculate log2 coverages in the BAM file at each interval."""
regions = tabio.read_auto(bed_fname)
if procs == 1:
bamfile = pysam.Samfile(bam_fname, 'rb')
bamfile = pysam.Samfile(bam_fname, 'rb', reference_filename=fasta)
for chrom, subregions in regions.by_chromosome():
logging.info("Processing chromosome %s of %s",
chrom, os.path.basename(bam_fname))
for count, row in _rdc_chunk(bamfile, subregions, min_mapq):
yield [count, row]
else:
with futures.ProcessPoolExecutor(procs) as pool:
args_iter = ((bam_fname, subr, min_mapq)
args_iter = ((bam_fname, subr, min_mapq, fasta)
for _c, subr in regions.by_chromosome())
for chunk in pool.map(_rdc, args_iter):
for count, row in chunk:
@@ -110,9 +110,9 @@ def _rdc(args):
return list(_rdc_chunk(*args))


def _rdc_chunk(bamfile, regions, min_mapq):
def _rdc_chunk(bamfile, regions, min_mapq, fasta=None):
if isinstance(bamfile, str):
bamfile = pysam.Samfile(bamfile, 'rb')
bamfile = pysam.Samfile(bamfile, 'rb', reference_filename=fasta)
for chrom, start, end, gene in regions.coords(["gene"]):
yield region_depth_count(bamfile, chrom, start, end, gene, min_mapq)

@@ -152,15 +152,15 @@ def filter_read(read):
return count, row


def interval_coverages_pileup(bed_fname, bam_fname, min_mapq, procs=1):
def interval_coverages_pileup(bed_fname, bam_fname, min_mapq, procs=1, fasta=None):
"""Calculate log2 coverages in the BAM file at each interval."""
logging.info("Processing reads in %s", os.path.basename(bam_fname))
if procs == 1:
table = bedcov(bed_fname, bam_fname, min_mapq)
table = bedcov(bed_fname, bam_fname, min_mapq, fasta)
else:
chunks = []
with futures.ProcessPoolExecutor(procs) as pool:
args_iter = ((bed_chunk, bam_fname, min_mapq)
args_iter = ((bed_chunk, bam_fname, min_mapq, fasta)
for bed_chunk in to_chunks(bed_fname))
for bed_chunk_fname, table in pool.map(_bedcov, args_iter):
chunks.append(table)
@@ -189,7 +189,7 @@ def _bedcov(args):
return bed_fname, table


def bedcov(bed_fname, bam_fname, min_mapq):
def bedcov(bed_fname, bam_fname, min_mapq, fasta=None):
"""Calculate depth of all regions in a BED file via samtools (pysam) bedcov.

i.e. mean pileup depth across each region.
@@ -198,6 +198,8 @@ def bedcov(bed_fname, bam_fname, min_mapq):
cmd = [bed_fname, bam_fname]
if min_mapq and min_mapq > 0:
cmd.extend(['-Q', bytes(min_mapq)])
if fasta:
cmd.extend(['--reference', fasta])
try:
raw = pysam.bedcov(*cmd, split_lines=False)
except pysam.SamtoolsError as exc:
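The `--reference` option appended to the `pysam.bedcov` call mirrors samtools' usual CRAM handling. A standalone sketch of the equivalent direct call, with placeholder paths:

```python
import pysam

# Equivalent to: samtools bedcov baits.bed sample.cram --reference hg19.fa
# Output is the BED's lines with a per-region base count appended.
raw = pysam.bedcov("baits.bed", "sample.cram",
                   "--reference", "hg19.fa", split_lines=False)
print(raw.splitlines()[0])
```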
cnvlib/samutil.py (8 additions, 8 deletions)
@@ -10,26 +10,26 @@
from pathlib import Path, PurePath


def idxstats(bam_fname, drop_unmapped=False):
def idxstats(bam_fname, drop_unmapped=False, fasta=None):
"""Get chromosome names, lengths, and number of mapped/unmapped reads.

Use the BAM index (.bai) to get the number of reads and size of each
chromosome. Contigs with no mapped reads are skipped.
"""
handle = StringIO(pysam.idxstats(bam_fname, split_lines=False))
handle = StringIO(pysam.idxstats(bam_fname, split_lines=False, reference_filename=fasta))
table = pd.read_csv(handle, sep='\t', header=None,
names=['chromosome', 'length', 'mapped', 'unmapped'])
if drop_unmapped:
table = table[table.mapped != 0].drop('unmapped', axis=1)
return table


def bam_total_reads(bam_fname):
def bam_total_reads(bam_fname, fasta=None):
"""Count the total number of mapped reads in a BAM file.

Uses the BAM index to do this quickly.
"""
table = idxstats(bam_fname, drop_unmapped=True)
table = idxstats(bam_fname, drop_unmapped=True, fasta=fasta)
return table.mapped.sum()


@@ -70,7 +70,7 @@ def ensure_bam_index(bam_fname):
return bai_fname


def ensure_bam_sorted(bam_fname, by_name=False, span=50):
def ensure_bam_sorted(bam_fname, by_name=False, span=50, fasta=None):
"""Test if the reads in a BAM file are sorted as expected.

by_name=True: reads are expected to be sorted by query name. Consecutive
@@ -92,7 +92,7 @@ def out_of_order(read, prev):
prev.pos <= read.pos)

# ENH - repeat at 50%, ~99% through the BAM
bam = pysam.Samfile(bam_fname, 'rb')
bam = pysam.Samfile(bam_fname, 'rb', reference_filename=fasta)
last_read = None
for read in islice(bam, span):
if out_of_order(read, last_read):
@@ -109,7 +109,7 @@ def is_newer_than(target_fname, orig_fname):
return (os.stat(target_fname).st_mtime >= os.stat(orig_fname).st_mtime)


def get_read_length(bam, span=1000):
def get_read_length(bam, span=1000, fasta=None):
"""Get (median) read length from first few reads in a BAM file.

Illumina reads all have the same length; other sequencers might not.
@@ -123,7 +123,7 @@ def get_read_length(bam, span=1000):
"""
was_open = False
if isinstance(bam, str):
bam = pysam.Samfile(bam, 'rb')
bam = pysam.Samfile(bam, 'rb', reference_filename=fasta)
else:
was_open = True
lengths = [read.query_length for read in islice(bam, span)
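With `reference_filename` supplied to each `pysam.Samfile` call, the sampling helpers work on CRAM unchanged. A quick check, again with placeholder paths:

```python
from cnvlib import samutil

# Both helpers open the alignment file with reference_filename=fasta.
print(samutil.get_read_length("sample.cram", fasta="hg19.fa"))
print(samutil.bam_total_reads("sample.cram", fasta="hg19.fa"))
```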
scripts/guess_baits.py (5 additions, 3 deletions)
@@ -36,7 +36,7 @@
# ___________________________________________
# Guided method: guess from potential targets

def filter_targets(target_bed, sample_bams, procs):
def filter_targets(target_bed, sample_bams, procs, fasta):
"""Check if each potential target has significant coverage."""
try:
baits = tabio.read(target_bed, 'bed4')
@@ -47,7 +47,7 @@ def filter_targets(target_bed, sample_bams, procs):
total_depths = np.zeros(len(baits), dtype=np.float_)
for bam_fname in sample_bams:
logging.info("Evaluating targets in %s", bam_fname)
sample = cnvlib.do_coverage(target_bed, bam_fname, processes=procs)
sample = cnvlib.do_coverage(target_bed, bam_fname, processes=procs, fasta=fasta)
assert len(sample) == len(baits), \
"%d != %d" % (len(sample), len(baits))
total_depths += sample['depth'].values
@@ -204,6 +204,8 @@ def normalize_depth_log2_filter(baits, min_depth, enrich_ratio=0.1):
help="""Number of subprocesses to segment in parallel.
If given without an argument, use the maximum number
of available CPUs. [Default: use 1 process]""")
P_coverage.add_argument('-f', '--fasta', metavar="FILENAME",
help="Reference genome, FASTA format (e.g. UCSC hg19.fa)")

AP_x = AP.add_mutually_exclusive_group(required=True)
AP_x.add_argument('-t', '--targets', metavar='TARGET_BED',
@@ -241,7 +243,7 @@ def normalize_depth_log2_filter(baits, min_depth, enrich_ratio=0.1):
args.processes = None

if args.targets:
baits = filter_targets(args.targets, args.sample_bams, args.processes)
baits = filter_targets(args.targets, args.sample_bams, args.processes, args.fasta)
else:
baits = scan_targets(args.access, args.sample_bams,
0.5 * args.min_depth, # More sensitive 1st pass
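Finally, guess_baits.py's guided mode forwards the new `-f/--fasta` value into `filter_targets`, which hands it to `cnvlib.do_coverage`. A sketch of the updated call inside the script, with illustrative arguments:

```python
# Within scripts/guess_baits.py (guided mode); values are illustrative.
baits = filter_targets("targets.bed", ["sample.cram"],
                       procs=1, fasta="hg19.fa")
print(len(baits), "candidate baits evaluated")
```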