diff --git a/README.md b/README.md index 150a8e0..14ccc06 100755 --- a/README.md +++ b/README.md @@ -106,7 +106,10 @@ WisecondorX predict test_input.npz reference_input.npz output_id [--optional arg `--beta x` | When beta is given, `--zscore` is ignored. Beta sets a ratio cutoff for aberration calling. It's a number between 0 (liberal) and 1 (conservative) and, when used, is optimally close to the purity (e.g. fetal/tumor fraction) `--blacklist x` | Blacklist for masking additional regions; requires headerless .bed file. This is particularly useful when the reference set is too small to recognize some obvious loci (such as centromeres; examples at `./example.blacklist/`) `--gender x` | Force WisecondorX to analyze this case as male (M) or female (F). Useful when e.g. dealing with a loss of chromosome Y, which causes erroneous gender predictions (choices: x=F or x=M) -`--bed` | Outputs tab-delimited .bed files (trisomy 21 NIPT example at `./example.bed`), containing all necessary information **(\*)** +`--bed` | Outputs tab-delimited .bed files (trisomy 21 NIPT example at `./example.bed`), containing all necessary information **(\*)** +`--vcf` | Outputs the found CNVs in a .vcf file (default: False) +`--fai FAI` | The index of the reference used to align the input files. (Only needed for VCF creation) (default: None) +`--sample SAMPLE` | The sample name to use in the VCF. Defaults to the basename of the out ID. (default: None) `--plot` | Outputs custom .png plots (trisomy 21 NIPT example at `./example.plot`), directly interpretable **(\*)** `--ylim [a,b]` | Force WisecondorX to use y-axis interval [a,b] during plotting, e.g. [-2,2] `--cairo` | Some operating systems require the cairo bitmap type to write plots diff --git a/wisecondorX/main.py b/wisecondorX/main.py index 63e61d5..aee7531 100755 --- a/wisecondorX/main.py +++ b/wisecondorX/main.py @@ -121,9 +121,14 @@ def tool_newref(args): def tool_test(args): logging.info('Starting CNA prediction') - if not args.bed and not args.plot: + if not args.bed and not args.plot and not args.vcf: logging.critical('No output format selected. ' - 'Select at least one of the supported output formats (--bed, --plot)') + 'Select at least one of the supported output formats (--bed, --plot, --vcf)') + sys.exit() + + if args.vcf and not args.fai: + logging.critical('A fasta index file is needed to create a VCF file. ' + 'Please provide one via the --fai flag') sys.exit() if args.zscore <= 0: @@ -239,8 +244,11 @@ def tool_test(args): results['results_c'] = exec_cbs(rem_input, results) - if args.bed: - logging.info('Writing tables ...') + if args.bed or args.vcf: + types = [] + if args.bed: types.append("tables") + if args.vcf: types.append("VCF") + logging.info("Writing {} ...".format("/".join(types))) generate_output_tables(rem_input, results) if args.plot: @@ -390,6 +398,15 @@ def main(): parser_test.add_argument('--bed', action='store_true', help='Outputs tab-delimited .bed files, containing the most important information') + parser_test.add_argument('--vcf', + action='store_true', + help='Outputs the found CNVs in a .vcf file') + parser_test.add_argument('--fai', + type=str, + help='The index of the reference used to align the input files. (Only needed for VCF creation)') + parser_test.add_argument('--sample', + type=str, + help='The sample name to use in the VCF. Defaults to the basename of the out ID.') parser_test.add_argument('--plot', action='store_true', help='Outputs .png plots') diff --git a/wisecondorX/predict_output.py b/wisecondorX/predict_output.py index 4586966..5959fc0 100644 --- a/wisecondorX/predict_output.py +++ b/wisecondorX/predict_output.py @@ -5,6 +5,7 @@ import numpy as np from wisecondorX.overall_tools import exec_R, get_z_score, get_median_segment_variance, get_cpa +from pysam import VariantFile, VariantRecord ''' Writes plots. @@ -44,10 +45,10 @@ def exec_write_plots(rem_input, results): def generate_output_tables(rem_input, results): - _generate_bins_bed(rem_input, results) - _generate_segments_and_aberrations_bed(rem_input, results) + if rem_input['args'].bed: + _generate_bins_bed(rem_input, results) _generate_chr_statistics_file(rem_input, results) - + _generate_segments_and_aberrations(rem_input, results) def _generate_bins_bed(rem_input, results): bins_file = open('{}_bins.bed'.format(rem_input['args'].outid), 'w') @@ -77,11 +78,23 @@ def _generate_bins_bed(rem_input, results): bins_file.close() -def _generate_segments_and_aberrations_bed(rem_input, results): - segments_file = open('{}_segments.bed'.format(rem_input['args'].outid), 'w') - abberations_file = open('{}_aberrations.bed'.format(rem_input['args'].outid), 'w') - segments_file.write('chr\tstart\tend\tratio\tzscore\n') - abberations_file.write('chr\tstart\tend\tratio\tzscore\ttype\n') +def _generate_segments_and_aberrations(rem_input, results): + if rem_input['args'].bed: + segments_file = open('{}_segments.bed'.format(rem_input['args'].outid), 'w') + abberations_file = open('{}_aberrations.bed'.format(rem_input['args'].outid), 'w') + segments_file.write('chr\tstart\tend\tratio\tzscore\n') + abberations_file.write('chr\tstart\tend\tratio\tzscore\ttype\n') + + if rem_input['args'].vcf: + vcf_file = VariantFile("{}.vcf.gz".format(rem_input['args'].outid), "w") + prefix = _add_contigs(rem_input['args'].fai, vcf_file) + _add_info(vcf_file) + sample = rem_input['args'].sample if rem_input['args'].sample else rem_input['args'].outid.split("/")[-1] + vcf_file.header.add_sample(sample) + + dup_count = 0 + del_count = 0 + cnv_count = 0 for segment in results['results_c']: chr_name = str(segment[0] + 1) @@ -89,33 +102,117 @@ def _generate_segments_and_aberrations_bed(rem_input, results): chr_name = 'X' if chr_name == '24': chr_name = 'Y' - row = [chr_name, - int(segment[1] * rem_input['binsize'] + 1), - int(segment[2] * rem_input['binsize']), - segment[4], segment[3]] - segments_file.write('{}\n'.format('\t'.join([str(x) for x in row]))) + start = int(segment[1] * rem_input['binsize'] + 1) + stop = int(segment[2] * rem_input['binsize']) + ratio = segment[4] + zscore = segment[3] + if rem_input['args'].bed: + row = [chr_name, start, stop, ratio, zscore] + segments_file.write('{}\n'.format('\t'.join([str(x) for x in row]))) + + # output segments instead of abberations but add field that annotates which variant it is ploidy = 2 if (chr_name == 'X' or chr_name == 'Y') and rem_input['ref_gender'] == 'M': ploidy = 1 - if rem_input['args'].beta is not None: - if float(segment[4]) > __get_aberration_cutoff(rem_input['args'].beta, ploidy)[1]: - abberations_file.write('{}\tgain\n'.format('\t'.join([str(x) for x in row]))) - elif float(segment[4]) < __get_aberration_cutoff(rem_input['args'].beta, ploidy)[0]: - abberations_file.write('{}\tloss\n'.format('\t'.join([str(x) for x in row]))) - elif isinstance(segment[3], str): - continue - else: - if float(segment[3]) > rem_input['args'].zscore: - abberations_file.write('{}\tgain\n'.format('\t'.join([str(x) for x in row]))) - elif float(segment[3]) < - rem_input['args'].zscore: - abberations_file.write('{}\tloss\n'.format('\t'.join([str(x) for x in row]))) - - segments_file.close() - abberations_file.close() - - -def __get_aberration_cutoff(beta, ploidy): + + gain_or_loss = _define_gain_loss(segment, rem_input, ploidy) + + if rem_input['args'].vcf: + if gain_or_loss == "gain": + cnv_type = "DUP" + dup_count += 1 + type_count = dup_count + elif gain_or_loss == "loss": + cnv_type = "DEL" + del_count += 1 + type_count = del_count + else: + cnv_type = "CNV" + cnv_count += 1 + type_count = cnv_count + + record: VariantRecord = vcf_file.new_record( + contig=prefix + chr_name, + id="WisecondorX_{}_{}".format(cnv_type, type_count), + start=start, + stop=stop, + alleles=('N', "<{}>".format(cnv_type)) + ) + record.info.update({ + 'SVTYPE': 'CNV', + 'SVLEN': record.stop - record.start + 1, + 'ABB': False if not gain_or_loss else True, + 'SM': ratio, + 'ZS': zscore + }) + record.samples[sample]['GT'] = (None,None) if ploidy == 2 else None + vcf_file.write(record) + + if not gain_or_loss: continue + if rem_input['args'].bed: + abberations_file.write('{}\t{}\n'.format('\t'.join([str(x) for x in row]), gain_or_loss)) + + if rem_input['args'].bed: + segments_file.close() + abberations_file.close() + + if rem_input['args'].vcf: + vcf_file.close() + +def _add_contigs(fai:str, vcf:VariantFile) -> str: + """ + Adds the contigs to the VCF file + """ + prefix = "" + with open(fai, "r") as index: + for line in index.readlines(): + if line.startswith("chr"): prefix = "chr" + split_line = line.split("\t") + vcf.header.add_line("##contig=".format(split_line[0], split_line[1])) + + return prefix + +def _add_info(vcf:VariantFile) -> None: + """ + Adds the INFO fields to the VCF file + """ + infos = [ + '##ALT=', + '##ALT=', + '##ALT=', + '##INFO=', + '##INFO=', + '##INFO=', + '##INFO=', + '##INFO=', + '##INFO=', + '##FILTER=', + '##FILTER=', + '##FORMAT=', + ] + for info in infos: + vcf.header.add_line(info) + +def _define_gain_loss(segment, rem_input, ploidy = 2): + """ + Define if the abberation is a gain or a loss + """ + if rem_input['args'].beta is not None: + abberation_cutoff = _get_aberration_cutoff(rem_input['args'].beta, ploidy) + if float(segment[4]) > abberation_cutoff[1]: + return "gain" + elif float(segment[4]) < abberation_cutoff[0]: + return "loss" + elif isinstance(segment[3], str): return False + else: + if float(segment[3]) > rem_input['args'].zscore: + return "gain" + elif float(segment[3]) < - rem_input['args'].zscore: + return "loss" + return False + +def _get_aberration_cutoff(beta, ploidy): loss_cutoff = np.log2((ploidy - (beta / 2)) / ploidy) gain_cutoff = np.log2((ploidy + (beta / 2)) / ploidy) return loss_cutoff, gain_cutoff