Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add VCF support #107

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,10 @@ WisecondorX predict test_input.npz reference_input.npz output_id [--optional arg
`--beta x` | When beta is given, `--zscore` is ignored. Beta sets a ratio cutoff for aberration calling. It's a number between 0 (liberal) and 1 (conservative) and, when used, is optimally close to the purity (e.g. fetal/tumor fraction)
`--blacklist x` | Blacklist for masking additional regions; requires headerless .bed file. This is particularly useful when the reference set is too small to recognize some obvious loci (such as centromeres; examples at `./example.blacklist/`)
`--gender x` | Force WisecondorX to analyze this case as male (M) or female (F). Useful when e.g. dealing with a loss of chromosome Y, which causes erroneous gender predictions (choices: x=F or x=M)
`--bed` | Outputs tab-delimited .bed files (trisomy 21 NIPT example at `./example.bed`), containing all necessary information **(\*)**
`--bed` | Outputs tab-delimited .bed files (trisomy 21 NIPT example at `./example.bed`), containing all necessary information **(\*)**
`--vcf` | Outputs the found CNVs in a .vcf file (default: False)
`--fai FAI` | The index of the reference used to align the input files. (Only needed for VCF creation) (default: None)
`--sample SAMPLE` | The sample name to use in the VCF. Defaults to the basename of the out ID. (default: None)
`--plot` | Outputs custom .png plots (trisomy 21 NIPT example at `./example.plot`), directly interpretable **(\*)**
`--ylim [a,b]` | Force WisecondorX to use y-axis interval [a,b] during plotting, e.g. [-2,2]
`--cairo` | Some operating systems require the cairo bitmap type to write plots
Expand Down
25 changes: 21 additions & 4 deletions wisecondorX/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,9 +121,14 @@ def tool_newref(args):
def tool_test(args):
logging.info('Starting CNA prediction')

if not args.bed and not args.plot:
if not args.bed and not args.plot and not args.vcf:
logging.critical('No output format selected. '
'Select at least one of the supported output formats (--bed, --plot)')
'Select at least one of the supported output formats (--bed, --plot, --vcf)')
sys.exit()

if args.vcf and not args.fai:
logging.critical('A fasta index file is needed to create a VCF file. '
'Please provide one via the --fai flag')
sys.exit()

if args.zscore <= 0:
Expand Down Expand Up @@ -239,8 +244,11 @@ def tool_test(args):

results['results_c'] = exec_cbs(rem_input, results)

if args.bed:
logging.info('Writing tables ...')
if args.bed or args.vcf:
types = []
if args.bed: types.append("tables")
if args.vcf: types.append("VCF")
logging.info("Writing {} ...".format("/".join(types)))
generate_output_tables(rem_input, results)

if args.plot:
Expand Down Expand Up @@ -390,6 +398,15 @@ def main():
parser_test.add_argument('--bed',
action='store_true',
help='Outputs tab-delimited .bed files, containing the most important information')
parser_test.add_argument('--vcf',
action='store_true',
help='Outputs the found CNVs in a .vcf file')
parser_test.add_argument('--fai',
type=str,
help='The index of the reference used to align the input files. (Only needed for VCF creation)')
parser_test.add_argument('--sample',
type=str,
help='The sample name to use in the VCF. Defaults to the basename of the out ID.')
parser_test.add_argument('--plot',
action='store_true',
help='Outputs .png plots')
Expand Down
159 changes: 128 additions & 31 deletions wisecondorX/predict_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import numpy as np

from wisecondorX.overall_tools import exec_R, get_z_score, get_median_segment_variance, get_cpa
from pysam import VariantFile, VariantRecord

'''
Writes plots.
Expand Down Expand Up @@ -44,10 +45,10 @@ def exec_write_plots(rem_input, results):


def generate_output_tables(rem_input, results):
_generate_bins_bed(rem_input, results)
_generate_segments_and_aberrations_bed(rem_input, results)
if rem_input['args'].bed:
_generate_bins_bed(rem_input, results)
_generate_chr_statistics_file(rem_input, results)

_generate_segments_and_aberrations(rem_input, results)

def _generate_bins_bed(rem_input, results):
bins_file = open('{}_bins.bed'.format(rem_input['args'].outid), 'w')
Expand Down Expand Up @@ -77,45 +78,141 @@ def _generate_bins_bed(rem_input, results):
bins_file.close()


def _generate_segments_and_aberrations_bed(rem_input, results):
segments_file = open('{}_segments.bed'.format(rem_input['args'].outid), 'w')
abberations_file = open('{}_aberrations.bed'.format(rem_input['args'].outid), 'w')
segments_file.write('chr\tstart\tend\tratio\tzscore\n')
abberations_file.write('chr\tstart\tend\tratio\tzscore\ttype\n')
def _generate_segments_and_aberrations(rem_input, results):
if rem_input['args'].bed:
segments_file = open('{}_segments.bed'.format(rem_input['args'].outid), 'w')
abberations_file = open('{}_aberrations.bed'.format(rem_input['args'].outid), 'w')
segments_file.write('chr\tstart\tend\tratio\tzscore\n')
abberations_file.write('chr\tstart\tend\tratio\tzscore\ttype\n')

if rem_input['args'].vcf:
vcf_file = VariantFile("{}.vcf.gz".format(rem_input['args'].outid), "w")
prefix = _add_contigs(rem_input['args'].fai, vcf_file)
_add_info(vcf_file)
sample = rem_input['args'].sample if rem_input['args'].sample else rem_input['args'].outid.split("/")[-1]
vcf_file.header.add_sample(sample)

dup_count = 0
del_count = 0
cnv_count = 0

for segment in results['results_c']:
chr_name = str(segment[0] + 1)
if chr_name == '23':
chr_name = 'X'
if chr_name == '24':
chr_name = 'Y'
row = [chr_name,
int(segment[1] * rem_input['binsize'] + 1),
int(segment[2] * rem_input['binsize']),
segment[4], segment[3]]
segments_file.write('{}\n'.format('\t'.join([str(x) for x in row])))
start = int(segment[1] * rem_input['binsize'] + 1)
stop = int(segment[2] * rem_input['binsize'])
ratio = segment[4]
zscore = segment[3]
if rem_input['args'].bed:
row = [chr_name, start, stop, ratio, zscore]
segments_file.write('{}\n'.format('\t'.join([str(x) for x in row])))

# output segments instead of abberations but add field that annotates which variant it is

ploidy = 2
if (chr_name == 'X' or chr_name == 'Y') and rem_input['ref_gender'] == 'M':
ploidy = 1
if rem_input['args'].beta is not None:
if float(segment[4]) > __get_aberration_cutoff(rem_input['args'].beta, ploidy)[1]:
abberations_file.write('{}\tgain\n'.format('\t'.join([str(x) for x in row])))
elif float(segment[4]) < __get_aberration_cutoff(rem_input['args'].beta, ploidy)[0]:
abberations_file.write('{}\tloss\n'.format('\t'.join([str(x) for x in row])))
elif isinstance(segment[3], str):
continue
else:
if float(segment[3]) > rem_input['args'].zscore:
abberations_file.write('{}\tgain\n'.format('\t'.join([str(x) for x in row])))
elif float(segment[3]) < - rem_input['args'].zscore:
abberations_file.write('{}\tloss\n'.format('\t'.join([str(x) for x in row])))

segments_file.close()
abberations_file.close()


def __get_aberration_cutoff(beta, ploidy):

gain_or_loss = _define_gain_loss(segment, rem_input, ploidy)

if rem_input['args'].vcf:
if gain_or_loss == "gain":
cnv_type = "DUP"
dup_count += 1
type_count = dup_count
elif gain_or_loss == "loss":
cnv_type = "DEL"
del_count += 1
type_count = del_count
else:
cnv_type = "CNV"
cnv_count += 1
type_count = cnv_count

record: VariantRecord = vcf_file.new_record(
contig=prefix + chr_name,
id="WisecondorX_{}_{}".format(cnv_type, type_count),
start=start,
stop=stop,
alleles=('N', "<{}>".format(cnv_type))
)
record.info.update({
'SVTYPE': 'CNV',
'SVLEN': record.stop - record.start + 1,
'ABB': False if not gain_or_loss else True,
'SM': ratio,
'ZS': zscore
})
record.samples[sample]['GT'] = (None,None) if ploidy == 2 else None
vcf_file.write(record)

if not gain_or_loss: continue
if rem_input['args'].bed:
abberations_file.write('{}\t{}\n'.format('\t'.join([str(x) for x in row]), gain_or_loss))

if rem_input['args'].bed:
segments_file.close()
abberations_file.close()

if rem_input['args'].vcf:
vcf_file.close()

def _add_contigs(fai:str, vcf:VariantFile) -> str:
"""
Adds the contigs to the VCF file
"""
prefix = ""
with open(fai, "r") as index:
for line in index.readlines():
if line.startswith("chr"): prefix = "chr"
split_line = line.split("\t")
vcf.header.add_line("##contig=<ID={},length={}>".format(split_line[0], split_line[1]))

return prefix

def _add_info(vcf:VariantFile) -> None:
"""
Adds the INFO fields to the VCF file
"""
infos = [
'##ALT=<ID=CNV,Description="Copy number variant region">',
'##ALT=<ID=DEL,Description="Deletion relative to the reference">',
'##ALT=<ID=DUP,Description="Region of elevated copy number relative to the reference">',
'##INFO=<ID=SVLEN,Number=.,Type=Integer,Description="Difference in length between REF and ALT alleles">',
'##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">',
'##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant described in this record">',
'##INFO=<ID=SM,Number=1,Type=Float,Description="Linear copy ratio of the segment mean">',
'##INFO=<ID=ZS,Number=1,Type=Float,Description="The z-score calculated for the current CNV">',
'##INFO=<ID=ABB,Number=0,Type=Flag,Description="States that the CNV is an abberation">',
'##FILTER=<ID=cnvQual,Description="CNV with quality below 10">',
'##FILTER=<ID=cnvCopyRatio,Description="CNV with copy ratio within +/- 0.2 of 1.0">',
'##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
]
for info in infos:
vcf.header.add_line(info)

def _define_gain_loss(segment, rem_input, ploidy = 2):
"""
Define if the abberation is a gain or a loss
"""
if rem_input['args'].beta is not None:
abberation_cutoff = _get_aberration_cutoff(rem_input['args'].beta, ploidy)
if float(segment[4]) > abberation_cutoff[1]:
return "gain"
elif float(segment[4]) < abberation_cutoff[0]:
return "loss"
elif isinstance(segment[3], str): return False
else:
if float(segment[3]) > rem_input['args'].zscore:
return "gain"
elif float(segment[3]) < - rem_input['args'].zscore:
return "loss"
return False

def _get_aberration_cutoff(beta, ploidy):
loss_cutoff = np.log2((ploidy - (beta / 2)) / ploidy)
gain_cutoff = np.log2((ploidy + (beta / 2)) / ploidy)
return loss_cutoff, gain_cutoff
Expand Down