diff --git a/cnvlib/autobin.py b/cnvlib/autobin.py
index 3c343546..f0a81425 100644
--- a/cnvlib/autobin.py
+++ b/cnvlib/autobin.py
@@ -23,7 +23,7 @@ def midsize_file(fnames):
 
 def do_autobin(bam_fname, method, targets=None, access=None,
                bp_per_bin=100000., target_min_size=20, target_max_size=50000,
-               antitarget_min_size=500, antitarget_max_size=1000000):
+               antitarget_min_size=500, antitarget_max_size=1000000, fasta=None):
     """Quickly calculate reasonable bin sizes from BAM read counts.
 
     Parameters
@@ -70,8 +70,8 @@ def depth2binsize(depth, min_size, max_size):
             return bin_size
 
     samutil.ensure_bam_index(bam_fname)
-    rc_table = samutil.idxstats(bam_fname, drop_unmapped=True)
-    read_len = samutil.get_read_length(bam_fname)
+    rc_table = samutil.idxstats(bam_fname, drop_unmapped=True, fasta=fasta)
+    read_len = samutil.get_read_length(bam_fname, fasta=fasta)
     logging.info("Estimated read length %s", read_len)
 
     # Dispatch
@@ -80,11 +80,11 @@ def depth2binsize(depth, min_size, max_size):
         # rc_table = update_chrom_length(rc_table, targets)
         # tgt_depth = average_depth(rc_table, read_len)
         # By sampling
-        tgt_depth = sample_region_cov(bam_fname, targets)
+        tgt_depth = sample_region_cov(bam_fname, targets, fasta=fasta)
         anti_depth = None
     elif method == 'hybrid':
         tgt_depth, anti_depth = hybrid(rc_table, read_len, bam_fname, targets,
-                                       access)
+                                       access, fasta)
     elif method == 'wgs':
         if access is not None and len(access):
             rc_table = update_chrom_length(rc_table, access)
@@ -99,7 +99,7 @@ def depth2binsize(depth, min_size, max_size):
             (anti_depth, anti_bin_size))
 
 
-def hybrid(rc_table, read_len, bam_fname, targets, access=None):
+def hybrid(rc_table, read_len, bam_fname, targets, access=None, fasta=None):
     """Hybrid capture sequencing."""
     # Identify off-target regions
     if access is None:
@@ -111,7 +111,7 @@ def hybrid(rc_table, read_len, bam_fname, targets, access=None):
     rc_table, targets, antitargets = shared_chroms(rc_table, targets,
                                                    antitargets)
     # Deal with targets
-    target_depth = sample_region_cov(bam_fname, targets)
+    target_depth = sample_region_cov(bam_fname, targets, fasta=fasta)
     # Antitargets: subtract captured reads from total
     target_length = region_size_by_chrom(targets)['length']
     target_reads = (target_length * target_depth / read_len).values
@@ -142,13 +142,13 @@ def idxstats2ga(table, bam_fname):
               meta_dict={'filename': bam_fname})
 
 
-def sample_region_cov(bam_fname, regions, max_num=100):
+def sample_region_cov(bam_fname, regions, max_num=100, fasta=None):
     """Calculate read depth in a randomly sampled subset of regions."""
     midsize_regions = sample_midsize_regions(regions, max_num)
     with tempfile.NamedTemporaryFile(suffix='.bed', mode='w+t') as f:
         tabio.write(regions.as_dataframe(midsize_regions), f, 'bed4')
         f.flush()
-        table = coverage.bedcov(f.name, bam_fname, 0)
+        table = coverage.bedcov(f.name, bam_fname, 0, fasta)
     # Mean read depth across all sampled regions
     return table.basecount.sum() / (table.end - table.start).sum()
 
diff --git a/cnvlib/batch.py b/cnvlib/batch.py
index de2479fb..4e163068 100644
--- a/cnvlib/batch.py
+++ b/cnvlib/batch.py
@@ -70,7 +70,7 @@ def batch_make_reference(normal_bams, target_bed, antitarget_bed,
             # Choose median-size normal bam or tumor bam
             bam_fname = autobin.midsize_file(normal_bams)
             (wgs_depth, target_avg_size), _ = autobin.do_autobin(
-                bam_fname, *autobin_args, bp_per_bin=50000.)
+                bam_fname, *autobin_args, bp_per_bin=50000., fasta=fasta)
             logging.info("WGS average depth %.2f --> using bin size %d",
                          wgs_depth, target_avg_size)
         else:
@@ -129,12 +129,12 @@ def batch_make_reference(normal_bams, target_bed, antitarget_bed,
                     pool.submit(batch_write_coverage,
                                 target_bed, nbam,
                                 sample_pfx + '.targetcoverage.cnn',
-                                by_count, procs_per_cnn))
+                                by_count, procs_per_cnn, fasta))
                 anti_futures.append(
                     pool.submit(batch_write_coverage,
                                 antitarget_bed, nbam,
                                 sample_pfx + '.antitargetcoverage.cnn',
-                                by_count, procs_per_cnn))
+                                by_count, procs_per_cnn, fasta))
             target_fnames = [tf.result() for tf in tgt_futures]
             antitarget_fnames = [af.result() for af in anti_futures]
 
@@ -152,9 +152,9 @@ def batch_make_reference(normal_bams, target_bed, antitarget_bed,
     return output_reference, target_bed, antitarget_bed
 
 
-def batch_write_coverage(bed_fname, bam_fname, out_fname, by_count, processes):
+def batch_write_coverage(bed_fname, bam_fname, out_fname, by_count, processes, fasta):
     """Run coverage on one sample, write to file."""
-    cnarr = coverage.do_coverage(bed_fname, bam_fname, by_count, 0, processes)
+    cnarr = coverage.do_coverage(bed_fname, bam_fname, by_count, 0, processes, fasta)
     tabio.write(cnarr, out_fname)
     return out_fname
 
@@ -162,7 +162,7 @@ def batch_write_coverage(bed_fname, bam_fname, out_fname, by_count, processes):
 def batch_run_sample(bam_fname, target_bed, antitarget_bed, ref_fname,
                      output_dir, male_reference, plot_scatter, plot_diagram,
                      rscript_path, by_count, skip_low, seq_method,
-                     segment_method, processes, do_cluster):
+                     segment_method, processes, do_cluster, fasta=None):
     """Run the pipeline on one BAM file."""
     # ENH - return probes, segments (cnarr, segarr)
     logging.info("Running the CNVkit pipeline on %s ...", bam_fname)
@@ -170,11 +170,11 @@ def batch_run_sample(bam_fname, target_bed, antitarget_bed, ref_fname,
     sample_pfx = os.path.join(output_dir, sample_id)
 
     raw_tgt = coverage.do_coverage(target_bed, bam_fname, by_count, 0,
-                                   processes)
+                                   processes, fasta)
     tabio.write(raw_tgt, sample_pfx + '.targetcoverage.cnn')
 
     raw_anti = coverage.do_coverage(antitarget_bed, bam_fname, by_count, 0,
-                                    processes)
+                                    processes, fasta)
     tabio.write(raw_anti, sample_pfx + '.antitargetcoverage.cnn')
 
     cnarr = fix.do_fix(raw_tgt, raw_anti, read_cna(ref_fname),
diff --git a/cnvlib/commands.py b/cnvlib/commands.py
index 8cf6d599..ca6e9f13 100644
--- a/cnvlib/commands.py
+++ b/cnvlib/commands.py
@@ -140,7 +140,7 @@ def _cmd_batch(args):
                    args.output_dir, args.male_reference, args.scatter, args.diagram,
                    args.rscript_path, args.count_reads, args.drop_low_coverage,
                    args.seq_method, args.segment_method, procs_per_bam,
-                   args.cluster)
+                   args.cluster, args.fasta)
     else:
         logging.info("No tumor/test samples (but %d normal/control samples) "
                      "specified on the command line.",
@@ -370,7 +370,7 @@ def read_regions(bed_fname):
     fields = autobin.do_autobin(bam_fname, args.method, tgt_arr, access_arr,
                                 args.bp_per_bin, args.target_min_size,
                                 args.target_max_size, args.antitarget_min_size,
-                                args.antitarget_max_size)
+                                args.antitarget_max_size, args.fasta)
     (_tgt_depth, tgt_bin_size), (_anti_depth, anti_bin_size) = fields
 
     # Create & write BED files
@@ -408,6 +408,8 @@ def read_regions(bed_fname):
 P_autobin = AP_subparsers.add_parser('autobin', help=_cmd_autobin.__doc__)
 P_autobin.add_argument('bams', nargs='+',
         help="""Sample BAM file(s) to test for target coverage""")
+P_autobin.add_argument('-f', '--fasta', metavar="FILENAME",
+        help="Reference genome, FASTA format (e.g. UCSC hg19.fa)")
 P_autobin.add_argument('-m', '--method',
         choices=('hybrid', 'amplicon', 'wgs'), default='hybrid',
         help="""Sequencing protocol: hybridization capture ('hybrid'), targeted
@@ -458,7 +460,7 @@ def read_regions(bed_fname):
 def _cmd_coverage(args):
     """Calculate coverage in the given regions from BAM read depths."""
     pset = coverage.do_coverage(args.interval, args.bam_file, args.count,
-                                args.min_mapq, args.processes)
+                                args.min_mapq, args.processes, args.fasta)
     if not args.output:
         # Create an informative but unique name for the coverage output file
         bambase = core.fbase(args.bam_file)
@@ -476,6 +478,8 @@ def read_regions(bed_fname):
 P_coverage = AP_subparsers.add_parser('coverage', help=_cmd_coverage.__doc__)
 P_coverage.add_argument('bam_file', help="Mapped sequence reads (.bam)")
 P_coverage.add_argument('interval', help="Intervals (.bed or .list)")
+P_coverage.add_argument('-f', '--fasta', metavar="FILENAME",
+        help="Reference genome, FASTA format (e.g. UCSC hg19.fa)")
 P_coverage.add_argument('-c', '--count', action='store_true',
         help="""Get read depths by counting read midpoints within each bin.
                 (An alternative algorithm).""")
diff --git a/cnvlib/coverage.py b/cnvlib/coverage.py
index 9b5e870b..36afbae3 100644
--- a/cnvlib/coverage.py
+++ b/cnvlib/coverage.py
@@ -17,19 +17,19 @@ from .params import NULL_LOG2_COVERAGE
 
 
-def do_coverage(bed_fname, bam_fname, by_count=False, min_mapq=0, processes=1):
+def do_coverage(bed_fname, bam_fname, by_count=False, min_mapq=0, processes=1, fasta=None):
     """Calculate coverage in the given regions from BAM read depths."""
-    if not samutil.ensure_bam_sorted(bam_fname):
+    if not samutil.ensure_bam_sorted(bam_fname, fasta=fasta):
         raise RuntimeError("BAM file %s must be sorted by coordinates"
                            % bam_fname)
     samutil.ensure_bam_index(bam_fname)
     # ENH: count importers.TOO_MANY_NO_COVERAGE & warn
     cnarr = interval_coverages(bed_fname, bam_fname, by_count, min_mapq,
-                               processes)
+                               processes, fasta)
     return cnarr
 
 
-def interval_coverages(bed_fname, bam_fname, by_count, min_mapq, processes):
+def interval_coverages(bed_fname, bam_fname, by_count, min_mapq, processes, fasta=None):
     """Calculate log2 coverages in the BAM file at each interval."""
     meta = {'sample_id': core.fbase(bam_fname)}
     start_time = time.time()
@@ -47,7 +47,7 @@ def interval_coverages(bed_fname, bam_fname, by_count, min_mapq, processes):
     # Calculate average read depth in each bin
     if by_count:
         results = interval_coverages_count(bed_fname, bam_fname, min_mapq,
-                                           processes)
+                                           processes, fasta)
         read_counts, cna_rows = zip(*results)
         read_counts = pd.Series(read_counts)
         cnarr = CNA.from_rows(list(cna_rows),
@@ -55,8 +55,8 @@ def interval_coverages(bed_fname, bam_fname, by_count, min_mapq, processes):
                               meta_dict=meta)
     else:
         table = interval_coverages_pileup(bed_fname, bam_fname, min_mapq,
-                                          processes)
-        read_len = samutil.get_read_length(bam_fname)
+                                          processes, fasta)
+        read_len = samutil.get_read_length(bam_fname, fasta=fasta)
         read_counts = table['basecount'] / read_len
         table = table.drop('basecount', axis=1)
         cnarr = CNA(table, meta)
@@ -75,7 +75,7 @@ def interval_coverages(bed_fname, bam_fname, by_count, min_mapq, processes):
                  (tot_reads / len(read_counts)),
                  read_counts.min(),
                  read_counts.max())
-    tot_mapped_reads = samutil.bam_total_reads(bam_fname)
+    tot_mapped_reads = samutil.bam_total_reads(bam_fname, fasta=fasta)
    if tot_mapped_reads:
         logging.info("Percent reads in regions: %.3f (of %d mapped)",
                      100. * tot_reads / tot_mapped_reads,
@@ -86,11 +86,11 @@ def interval_coverages(bed_fname, bam_fname, by_count, min_mapq, processes):
     return cnarr
 
 
-def interval_coverages_count(bed_fname, bam_fname, min_mapq, procs=1):
+def interval_coverages_count(bed_fname, bam_fname, min_mapq, procs=1, fasta=None):
     """Calculate log2 coverages in the BAM file at each interval."""
     regions = tabio.read_auto(bed_fname)
     if procs == 1:
-        bamfile = pysam.Samfile(bam_fname, 'rb')
+        bamfile = pysam.Samfile(bam_fname, 'rb', reference_filename=fasta)
         for chrom, subregions in regions.by_chromosome():
             logging.info("Processing chromosome %s of %s",
                          chrom, os.path.basename(bam_fname))
@@ -98,7 +98,7 @@ def interval_coverages_count(bed_fname, bam_fname, min_mapq, procs=1):
                 yield [count, row]
     else:
         with futures.ProcessPoolExecutor(procs) as pool:
-            args_iter = ((bam_fname, subr, min_mapq)
+            args_iter = ((bam_fname, subr, min_mapq, fasta)
                          for _c, subr in regions.by_chromosome())
             for chunk in pool.map(_rdc, args_iter):
                 for count, row in chunk:
@@ -110,9 +110,9 @@ def _rdc(args):
     return list(_rdc_chunk(*args))
 
 
-def _rdc_chunk(bamfile, regions, min_mapq):
+def _rdc_chunk(bamfile, regions, min_mapq, fasta=None):
     if isinstance(bamfile, str):
-        bamfile = pysam.Samfile(bamfile, 'rb')
+        bamfile = pysam.Samfile(bamfile, 'rb', reference_filename=fasta)
     for chrom, start, end, gene in regions.coords(["gene"]):
         yield region_depth_count(bamfile, chrom, start, end, gene, min_mapq)
 
@@ -152,15 +152,15 @@ def filter_read(read):
     return count, row
 
 
-def interval_coverages_pileup(bed_fname, bam_fname, min_mapq, procs=1):
+def interval_coverages_pileup(bed_fname, bam_fname, min_mapq, procs=1, fasta=None):
     """Calculate log2 coverages in the BAM file at each interval."""
     logging.info("Processing reads in %s", os.path.basename(bam_fname))
     if procs == 1:
-        table = bedcov(bed_fname, bam_fname, min_mapq)
+        table = bedcov(bed_fname, bam_fname, min_mapq, fasta)
     else:
         chunks = []
         with futures.ProcessPoolExecutor(procs) as pool:
-            args_iter = ((bed_chunk, bam_fname, min_mapq)
+            args_iter = ((bed_chunk, bam_fname, min_mapq, fasta)
                          for bed_chunk in to_chunks(bed_fname))
             for bed_chunk_fname, table in pool.map(_bedcov, args_iter):
                 chunks.append(table)
@@ -189,7 +189,7 @@ def _bedcov(args):
     return bed_fname, table
 
 
-def bedcov(bed_fname, bam_fname, min_mapq):
+def bedcov(bed_fname, bam_fname, min_mapq, fasta=None):
     """Calculate depth of all regions in a BED file via samtools (pysam) bedcov.
 
     i.e. mean pileup depth across each region.
@@ -198,6 +198,8 @@ def bedcov(bed_fname, bam_fname, min_mapq):
     cmd = [bed_fname, bam_fname]
     if min_mapq and min_mapq > 0:
         cmd.extend(['-Q', bytes(min_mapq)])
+    if fasta:
+        cmd.extend(['--reference', fasta])
     try:
         raw = pysam.bedcov(*cmd, split_lines=False)
     except pysam.SamtoolsError as exc:
diff --git a/cnvlib/samutil.py b/cnvlib/samutil.py
index 13bb7bed..12fbf091 100644
--- a/cnvlib/samutil.py
+++ b/cnvlib/samutil.py
@@ -10,13 +10,13 @@ from pathlib import Path,PurePath
 
 
-def idxstats(bam_fname, drop_unmapped=False):
+def idxstats(bam_fname, drop_unmapped=False, fasta=None):
     """Get chromosome names, lengths, and number of mapped/unmapped reads.
 
     Use the BAM index (.bai) to get the number of reads and size of each
     chromosome. Contigs with no mapped reads are skipped.
""" - handle = StringIO(pysam.idxstats(bam_fname, split_lines=False)) + handle = StringIO(pysam.idxstats(bam_fname, split_lines=False, reference_filename=fasta)) table = pd.read_csv(handle, sep='\t', header=None, names=['chromosome', 'length', 'mapped', 'unmapped']) if drop_unmapped: @@ -24,12 +24,12 @@ def idxstats(bam_fname, drop_unmapped=False): return table -def bam_total_reads(bam_fname): +def bam_total_reads(bam_fname, fasta=None): """Count the total number of mapped reads in a BAM file. Uses the BAM index to do this quickly. """ - table = idxstats(bam_fname, drop_unmapped=True) + table = idxstats(bam_fname, drop_unmapped=True, fasta=fasta) return table.mapped.sum() @@ -70,7 +70,7 @@ def ensure_bam_index(bam_fname): return bai_fname -def ensure_bam_sorted(bam_fname, by_name=False, span=50): +def ensure_bam_sorted(bam_fname, by_name=False, span=50, fasta=None): """Test if the reads in a BAM file are sorted as expected. by_name=True: reads are expected to be sorted by query name. Consecutive @@ -92,7 +92,7 @@ def out_of_order(read, prev): prev.pos <= read.pos) # ENH - repeat at 50%, ~99% through the BAM - bam = pysam.Samfile(bam_fname, 'rb') + bam = pysam.Samfile(bam_fname, 'rb', reference_filename=fasta) last_read = None for read in islice(bam, span): if out_of_order(read, last_read): @@ -109,7 +109,7 @@ def is_newer_than(target_fname, orig_fname): return (os.stat(target_fname).st_mtime >= os.stat(orig_fname).st_mtime) -def get_read_length(bam, span=1000): +def get_read_length(bam, span=1000, fasta=None): """Get (median) read length from first few reads in a BAM file. Illumina reads all have the same length; other sequencers might not. @@ -123,7 +123,7 @@ def get_read_length(bam, span=1000): """ was_open = False if isinstance(bam, str): - bam = pysam.Samfile(bam, 'rb') + bam = pysam.Samfile(bam, 'rb', reference_filename=fasta) else: was_open = True lengths = [read.query_length for read in islice(bam, span) diff --git a/scripts/guess_baits.py b/scripts/guess_baits.py index 86fb5d48..bd1d00a3 100755 --- a/scripts/guess_baits.py +++ b/scripts/guess_baits.py @@ -36,7 +36,7 @@ # ___________________________________________ # Guided method: guess from potential targets -def filter_targets(target_bed, sample_bams, procs): +def filter_targets(target_bed, sample_bams, procs, fasta): """Check if each potential target has significant coverage.""" try: baits = tabio.read(target_bed, 'bed4') @@ -47,7 +47,7 @@ def filter_targets(target_bed, sample_bams, procs): total_depths = np.zeros(len(baits), dtype=np.float_) for bam_fname in sample_bams: logging.info("Evaluating targets in %s", bam_fname) - sample = cnvlib.do_coverage(target_bed, bam_fname, processes=procs) + sample = cnvlib.do_coverage(target_bed, bam_fname, processes=procs, fasta=fasta) assert len(sample) == len(baits), \ "%d != %d" % (len(sample), len(baits)) total_depths += sample['depth'].values @@ -204,6 +204,8 @@ def normalize_depth_log2_filter(baits, min_depth, enrich_ratio=0.1): help="""Number of subprocesses to segment in parallel. If given without an argument, use the maximum number of available CPUs. [Default: use 1 process]""") + P_coverage.add_argument('-f', '--fasta', metavar="FILENAME", + help="Reference genome, FASTA format (e.g. 
UCSC hg19.fa)") AP_x = AP.add_mutually_exclusive_group(required=True) AP_x.add_argument('-t', '--targets', metavar='TARGET_BED', @@ -241,7 +243,7 @@ def normalize_depth_log2_filter(baits, min_depth, enrich_ratio=0.1): args.processes = None if args.targets: - baits = filter_targets(args.targets, args.sample_bams, args.processes) + baits = filter_targets(args.targets, args.sample_bams, args.processes, args.fasta) else: baits = scan_targets(args.access, args.sample_bams, 0.5 * args.min_depth, # More sensitive 1st pass