CenterForMedicalGeneticsGhent · nvnieuwk · Jun 23, 2023 · Jun 28, 2023 · Jun 28, 2023 · Jun 28, 2023
diff --git a/README.md b/README.md
@@ -106,7 +106,10 @@ WisecondorX predict test_input.npz reference_input.npz output_id [--optional arg
 `--beta x` | When beta is given, `--zscore` is ignored. Beta sets a ratio cutoff for aberration calling. It's a number between 0 (liberal) and 1 (conservative) and, when used, is optimally close to the purity (e.g. fetal/tumor fraction)  
 `--blacklist x` | Blacklist for masking additional regions; requires headerless .bed file. This is particularly useful when the reference set is too small to recognize some obvious loci (such as centromeres; examples at `./example.blacklist/`)  
 `--gender x` | Force WisecondorX to analyze this case as male (M) or female (F). Useful when e.g. dealing with a loss of chromosome Y, which causes erroneous gender predictions (choices: x=F or x=M)
-`--bed` | Outputs tab-delimited .bed files (trisomy 21 NIPT example at `./example.bed`), containing all necessary information  **(\*)**  
+`--bed` | Outputs tab-delimited .bed files (trisomy 21 NIPT example at `./example.bed`), containing all necessary information  **(\*)** 
+`--vcf` | Outputs the found CNVs in a .vcf file (default: False)
+`--fai FAI` | The index of the reference used to align the input files. (Only needed for VCF creation) (default: None)
+`--sample SAMPLE` | The sample name to use in the VCF. Defaults to the basename of the out ID. (default: None) 
 `--plot` | Outputs custom .png plots (trisomy 21 NIPT example at `./example.plot`), directly interpretable  **(\*)**  
 `--ylim [a,b]` | Force WisecondorX to use y-axis interval [a,b] during plotting, e.g. [-2,2]  
 `--cairo` | Some operating systems require the cairo bitmap type to write plots  

diff --git a/wisecondorX/main.py b/wisecondorX/main.py
@@ -121,9 +121,14 @@ def tool_newref(args):
 def tool_test(args):
     logging.info('Starting CNA prediction')
 
-    if not args.bed and not args.plot:
+    if not args.bed and not args.plot and not args.vcf:
         logging.critical('No output format selected. '
-                         'Select at least one of the supported output formats (--bed, --plot)')
+                         'Select at least one of the supported output formats (--bed, --plot, --vcf)')
+        sys.exit()
+
+    if args.vcf and not args.fai:
+        logging.critical('A fasta index file is needed to create a VCF file. '
+                         'Please provide one via the --fai flag')
         sys.exit()
 
     if args.zscore <= 0:
@@ -239,8 +244,11 @@ def tool_test(args):
 
     results['results_c'] = exec_cbs(rem_input, results)
 
-    if args.bed:
-        logging.info('Writing tables ...')
+    if args.bed or args.vcf:
+        types = []
+        if args.bed: types.append("tables")
+        if args.vcf: types.append("VCF")
+        logging.info("Writing {} ...".format("/".join(types)))
         generate_output_tables(rem_input, results)
 
     if args.plot:
@@ -390,6 +398,15 @@ def main():
     parser_test.add_argument('--bed',
                              action='store_true',
                              help='Outputs tab-delimited .bed files, containing the most important information')
+    parser_test.add_argument('--vcf',
+                             action='store_true',
+                             help='Outputs the found CNVs in a .vcf file')
+    parser_test.add_argument('--fai',
+                             type=str,
+                             help='The index of the reference used to align the input files. (Only needed for VCF creation)')
+    parser_test.add_argument('--sample',
+                             type=str,
+                             help='The sample name to use in the VCF. Defaults to the basename of the out ID.')
     parser_test.add_argument('--plot',
                              action='store_true',
                              help='Outputs .png plots')

diff --git a/wisecondorX/predict_output.py b/wisecondorX/predict_output.py
@@ -5,6 +5,7 @@
 import numpy as np
 
 from wisecondorX.overall_tools import exec_R, get_z_score, get_median_segment_variance, get_cpa
+from pysam import VariantFile, VariantRecord
 
 '''
 Writes plots.
@@ -44,10 +45,10 @@ def exec_write_plots(rem_input, results):
 
 
 def generate_output_tables(rem_input, results):
-    _generate_bins_bed(rem_input, results)
-    _generate_segments_and_aberrations_bed(rem_input, results)
+    if rem_input['args'].bed:
+        _generate_bins_bed(rem_input, results)
     _generate_chr_statistics_file(rem_input, results)
-
+    _generate_segments_and_aberrations(rem_input, results)
 
 def _generate_bins_bed(rem_input, results):
     bins_file = open('{}_bins.bed'.format(rem_input['args'].outid), 'w')
@@ -77,45 +78,141 @@ def _generate_bins_bed(rem_input, results):
     bins_file.close()
 
 
-def _generate_segments_and_aberrations_bed(rem_input, results):
-    segments_file = open('{}_segments.bed'.format(rem_input['args'].outid), 'w')
-    abberations_file = open('{}_aberrations.bed'.format(rem_input['args'].outid), 'w')
-    segments_file.write('chr\tstart\tend\tratio\tzscore\n')
-    abberations_file.write('chr\tstart\tend\tratio\tzscore\ttype\n')
+def _generate_segments_and_aberrations(rem_input, results):
+    if rem_input['args'].bed:
+        segments_file = open('{}_segments.bed'.format(rem_input['args'].outid), 'w')
+        abberations_file = open('{}_aberrations.bed'.format(rem_input['args'].outid), 'w')
+        segments_file.write('chr\tstart\tend\tratio\tzscore\n')
+        abberations_file.write('chr\tstart\tend\tratio\tzscore\ttype\n')
+
+    if rem_input['args'].vcf:
+        vcf_file = VariantFile("{}.vcf.gz".format(rem_input['args'].outid), "w")
+        prefix = _add_contigs(rem_input['args'].fai, vcf_file)
+        _add_info(vcf_file)
+        sample = rem_input['args'].sample if rem_input['args'].sample else rem_input['args'].outid.split("/")[-1]
+        vcf_file.header.add_sample(sample)
+
+    dup_count = 0
+    del_count = 0
+    cnv_count = 0
 
     for segment in results['results_c']:
         chr_name = str(segment[0] + 1)
         if chr_name == '23':
             chr_name = 'X'
         if chr_name == '24':
             chr_name = 'Y'
-        row = [chr_name,
-               int(segment[1] * rem_input['binsize'] + 1),
-               int(segment[2] * rem_input['binsize']),
-               segment[4], segment[3]]
-        segments_file.write('{}\n'.format('\t'.join([str(x) for x in row])))
+        start = int(segment[1] * rem_input['binsize'] + 1)
+        stop = int(segment[2] * rem_input['binsize'])
+        ratio = segment[4]
+        zscore = segment[3]
+        if rem_input['args'].bed: 
+            row = [chr_name, start, stop, ratio, zscore]          
+            segments_file.write('{}\n'.format('\t'.join([str(x) for x in row])))
+
+        # output segments instead of abberations but add field that annotates which variant it is
 
         ploidy = 2
         if (chr_name == 'X' or chr_name == 'Y') and rem_input['ref_gender'] == 'M':
             ploidy = 1
-        if rem_input['args'].beta is not None:
-            if float(segment[4]) > __get_aberration_cutoff(rem_input['args'].beta, ploidy)[1]:
-                abberations_file.write('{}\tgain\n'.format('\t'.join([str(x) for x in row])))
-            elif float(segment[4]) < __get_aberration_cutoff(rem_input['args'].beta, ploidy)[0]:
-                abberations_file.write('{}\tloss\n'.format('\t'.join([str(x) for x in row])))
-        elif isinstance(segment[3], str):
-            continue
-        else:
-            if float(segment[3]) > rem_input['args'].zscore:
-                abberations_file.write('{}\tgain\n'.format('\t'.join([str(x) for x in row])))
-            elif float(segment[3]) < - rem_input['args'].zscore:
-                abberations_file.write('{}\tloss\n'.format('\t'.join([str(x) for x in row])))
-
-    segments_file.close()
-    abberations_file.close()
-
-
-def __get_aberration_cutoff(beta, ploidy):
+
+        gain_or_loss = _define_gain_loss(segment, rem_input, ploidy)
+
+        if rem_input['args'].vcf:
+            if gain_or_loss == "gain":
+                cnv_type = "DUP"
+                dup_count += 1
+                type_count = dup_count
+            elif gain_or_loss == "loss":
+                cnv_type = "DEL"
+                del_count += 1
+                type_count = del_count
+            else:
+                cnv_type = "CNV"
+                cnv_count += 1
+                type_count = cnv_count
+
+            record: VariantRecord = vcf_file.new_record(
+                contig=prefix + chr_name, 
+                id="WisecondorX_{}_{}".format(cnv_type, type_count),
+                start=start,
+                stop=stop,
+                alleles=('N', "<{}>".format(cnv_type))
+            )
+            record.info.update({
+                'SVTYPE': 'CNV',
+                'SVLEN': record.stop - record.start + 1,
+                'ABB': False if not gain_or_loss else True,
+                'SM': ratio,
+                'ZS': zscore
+            })
+            record.samples[sample]['GT'] = (None,None) if ploidy == 2 else None
+            vcf_file.write(record)
+
+        if not gain_or_loss: continue
+        if rem_input['args'].bed:
+            abberations_file.write('{}\t{}\n'.format('\t'.join([str(x) for x in row]), gain_or_loss))
+
+    if rem_input['args'].bed:
+        segments_file.close()
+        abberations_file.close()
+
+    if rem_input['args'].vcf:
+        vcf_file.close()
+
+def _add_contigs(fai:str, vcf:VariantFile) -> str:
+    """
+    Adds the contigs to the VCF file
+    """
+    prefix = ""
+    with open(fai, "r") as index:
+        for line in index.readlines():
+            if line.startswith("chr"): prefix = "chr"
+            split_line = line.split("\t")
+            vcf.header.add_line("##contig=<ID={},length={}>".format(split_line[0], split_line[1]))
+
+    return prefix
+
+def _add_info(vcf:VariantFile) -> None:
+    """
+    Adds the INFO fields to the VCF file
+    """
+    infos = [
+        '##ALT=<ID=CNV,Description="Copy number variant region">',
+        '##ALT=<ID=DEL,Description="Deletion relative to the reference">',
+        '##ALT=<ID=DUP,Description="Region of elevated copy number relative to the reference">',
+        '##INFO=<ID=SVLEN,Number=.,Type=Integer,Description="Difference in length between REF and ALT alleles">',
+        '##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">',
+        '##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant described in this record">',
+        '##INFO=<ID=SM,Number=1,Type=Float,Description="Linear copy ratio of the segment mean">',
+        '##INFO=<ID=ZS,Number=1,Type=Float,Description="The z-score calculated for the current CNV">',
+        '##INFO=<ID=ABB,Number=0,Type=Flag,Description="States that the CNV is an abberation">',
+        '##FILTER=<ID=cnvQual,Description="CNV with quality below 10">',
+        '##FILTER=<ID=cnvCopyRatio,Description="CNV with copy ratio within +/- 0.2 of 1.0">',
+        '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
+    ]
+    for info in infos:
+        vcf.header.add_line(info)
+
+def _define_gain_loss(segment, rem_input, ploidy = 2):
+    """
+    Define if the abberation is a gain or a loss
+    """
+    if rem_input['args'].beta is not None:
+        abberation_cutoff = _get_aberration_cutoff(rem_input['args'].beta, ploidy)
+        if float(segment[4]) > abberation_cutoff[1]:
+            return "gain"
+        elif float(segment[4]) < abberation_cutoff[0]:
+            return "loss"
+    elif isinstance(segment[3], str): return False
+    else:
+        if float(segment[3]) > rem_input['args'].zscore:
+            return "gain"
+        elif float(segment[3]) < - rem_input['args'].zscore:
+            return "loss"
+    return False
+
+def _get_aberration_cutoff(beta, ploidy):
     loss_cutoff = np.log2((ploidy - (beta / 2)) / ploidy)
     gain_cutoff = np.log2((ploidy + (beta / 2)) / ploidy)
     return loss_cutoff, gain_cutoff