Skip to content

Commit

Permalink
v0.5.3 - bug fixes in parsing of ClinVar information
Browse files Browse the repository at this point in the history
Changes include:
- Fixed parseMacPathogenicity() to handle variants with multiple submitters that received both benign and pathogenic classifications, but no conflict is reported (i.e. `isPathogenic == 1 and isBenign == 1 and isConflicted == 0`)

- Fixed bug reported in pull request #19: var.splitHGVSc doesn't consider strand information, so when override = True, the ref and alt for genomic variants would be wrongly changed for minus-strand transcripts. Changed to override = False.

- Fixed bug reported in pull request #19: fixed coordinates in getMacClinVarTSV() to match readVCF().
  • Loading branch information
fernanda-rodrigues authored Sep 30, 2019
1 parent 5a5e006 commit df097ed
Showing 1 changed file with 52 additions and 18 deletions.
70 changes: 52 additions & 18 deletions charger/charger.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
# - Fernanda Martins Rodrigues ([email protected])
# - Jay R. Mashl ([email protected])
# - Kuan-lin Huang ([email protected])
# version: v0.5.2
# version: v0.5.3 - September, 2019

import os
import sys
Expand Down Expand Up @@ -904,26 +904,46 @@ def getMacClinVarTSV( self , tsvfile ):
for line in macFile:
fields = ( line.rstrip( ) ).split( "\t" )
[ description , status ] = self.parseMacPathogenicity( header, fields ) # no need to specify which fields here anymore; parseMacPathogenicity now knows which specific columns to look for
# fixed coordinates for clinvar file (refer to pull request #19)
pos = int(fields[header.index("pos")])
ref = fields[header.index("ref")]
alt = fields[header.index("alt")]
if len(ref) == 1 and len(alt) > 1: # insertion
ref = '-'
alt = alt[1:]
start = pos
stop = pos + 1
elif len(ref) > 1 and len(alt) == 1: # deletion
ref = ref[1:]
alt = '-'
start = pos + 1
stop = pos + len(ref)
else: # snv
start = pos
stop = pos

if len(header) > 27: # if yes, file is in the new format
var = clinvarvariant( chromosome = fields[header.index("chrom")] , \
start = fields[header.index("pos")] , \
reference = fields[header.index("ref")] , \
alternate = fields[header.index("alt")] , \
start = start , \
stop = stop , \
reference = ref , \
alternate = alt , \
uid = fields[header.index("variation_id")], \
gene = fields[header.index("symbol")] , \
clinical = { "description" : description , "review_status" : status } , \
trait = { fields[header.index("xrefs")] : fields[header.index("all_traits")] } )
else: # file in the old format
var = clinvarvariant( chromosome = fields[header.index("chrom")] , \
start = fields[header.index("pos")] , \
reference = fields[header.index("ref")] , \
alternate = fields[header.index("alt")] , \
start = start , \
stop = stop , \
reference = ref , \
alternate = alt , \
uid = fields[header.index("measureset_id")], \
gene = fields[header.index("symbol")] , \
clinical = { "description" : description , "review_status" : status } , \
trait = { fields[-1] : fields[header.index("all_traits")] } )
var.setStopFromReferenceAndAlternate( )
var.splitHGVSc( fields[header.index("hgvs_c")] , override = True )

var.splitHGVSc( fields[header.index("hgvs_c")] , override = False ) # refer to pull request #19
var.splitHGVSp( fields[header.index("hgvs_p")] )
#var.printVariant( "," )
#print( var.proteogenomicVar( ) )
Expand Down Expand Up @@ -974,21 +994,35 @@ def parseMacPathogenicity( header, fields ): # addded header argument, so can re
else:
splitChar="/" # new macarthur format

if isBenign == 1:
for desc in named.split( splitChar ):
if re.match( "likely", desc.lower( ) ) and desc != chargervariant.benign:
# fixed parsing of conflicting ClinVar classification
if isBenign == 1 and isPathogenic == 1 and int(isConflicted) == 0:
for desc in named.split(splitChar):
if re.match("likely", desc.lower() ) and desc != chargervariant.benign:
desc = chargervariant.likelyBenign
elif re.match( "benign", desc.lower( ) ):
elif re.match( "likely", desc.lower( ) ) and desc != chargervariant.pathogenic:
desc = chargervariant.likelyPathogenic
elif re.match( "benign", desc.lower() ):
desc = chargervariant.benign
break

if isPathogenic == 1:
for desc in named.split( splitChar ):
if re.match( "likely", desc.lower( ) ) and desc != chargervariant.pathogenic:
desc = chargervariant.likelyPathogenic
elif re.match( "pathog", desc.lower( ) ):
desc = chargervariant.pathogenic
break
else:
if isBenign == 1:
for desc in named.split( splitChar ):
if re.match( "likely", desc.lower( ) ) and desc != chargervariant.benign:
desc = chargervariant.likelyBenign
elif re.match( "benign", desc.lower( ) ):
desc = chargervariant.benign
break

if isPathogenic == 1:
for desc in named.split( splitChar ):
if re.match( "likely", desc.lower( ) ) and desc != chargervariant.pathogenic:
desc = chargervariant.likelyPathogenic
elif re.match( "pathog", desc.lower( ) ):
desc = chargervariant.pathogenic
break
return [ desc , status ]

def getMacClinVarVCF( self , vcffile ):
Expand Down

0 comments on commit df097ed

Please sign in to comment.