-
Notifications
You must be signed in to change notification settings - Fork 37
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
v0.5.3 - bug fixes in parsing of ClinVar information
Changes include: - Fixed parseMacPathogenicity() to handle variants with multiple submitters that received both benign and pathogenic classifications, but no conflict is reported (i.e. `isPathogenic == 1 and isBenign == 1 and isConflicted == 0`) - Fixed bug reported in pull request #19: var.splitHGVSc, which doesn't consider strand information. When override = True, the ref and alt for genomic variants would be wrongly changed for minus strand transcripts. Changed to override = False - Fixed bug reported in pull request #19: fixed corrdinates in getMacClinVarTSV() to match readVCF().
- Loading branch information
1 parent
5a5e006
commit df097ed
Showing
1 changed file
with
52 additions
and
18 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,7 +5,7 @@ | |
# - Fernanda Martins Rodrigues ([email protected]) | ||
# - Jay R. Mashl ([email protected]) | ||
# - Kuan-lin Huang ([email protected]) | ||
# version: v0.5.2 | ||
# version: v0.5.3 - September, 2019 | ||
|
||
import os | ||
import sys | ||
|
@@ -904,26 +904,46 @@ def getMacClinVarTSV( self , tsvfile ): | |
for line in macFile: | ||
fields = ( line.rstrip( ) ).split( "\t" ) | ||
[ description , status ] = self.parseMacPathogenicity( header, fields ) # no need to specify which fields here anymore; parseMacPathogenicity now knows which specific columns to look for | ||
# fixed coordinates for clinvar file (refer to pull request #19) | ||
pos = int(fields[header.index("pos")]) | ||
ref = fields[header.index("ref")] | ||
alt = fields[header.index("alt")] | ||
if len(ref) == 1 and len(alt) > 1: # insertion | ||
ref = '-' | ||
alt = alt[1:] | ||
start = pos | ||
stop = pos + 1 | ||
elif len(ref) > 1 and len(alt) == 1: # deletion | ||
ref = ref[1:] | ||
alt = '-' | ||
start = pos + 1 | ||
stop = pos + len(ref) | ||
else: # snv | ||
start = pos | ||
stop = pos | ||
|
||
if len(header) > 27: # if yes, file is in the new format | ||
var = clinvarvariant( chromosome = fields[header.index("chrom")] , \ | ||
start = fields[header.index("pos")] , \ | ||
reference = fields[header.index("ref")] , \ | ||
alternate = fields[header.index("alt")] , \ | ||
start = start , \ | ||
stop = stop , \ | ||
reference = ref , \ | ||
alternate = alt , \ | ||
uid = fields[header.index("variation_id")], \ | ||
gene = fields[header.index("symbol")] , \ | ||
clinical = { "description" : description , "review_status" : status } , \ | ||
trait = { fields[header.index("xrefs")] : fields[header.index("all_traits")] } ) | ||
else: # file in the old format | ||
var = clinvarvariant( chromosome = fields[header.index("chrom")] , \ | ||
start = fields[header.index("pos")] , \ | ||
reference = fields[header.index("ref")] , \ | ||
alternate = fields[header.index("alt")] , \ | ||
start = start , \ | ||
stop = stop , \ | ||
reference = ref , \ | ||
alternate = alt , \ | ||
uid = fields[header.index("measureset_id")], \ | ||
gene = fields[header.index("symbol")] , \ | ||
clinical = { "description" : description , "review_status" : status } , \ | ||
trait = { fields[-1] : fields[header.index("all_traits")] } ) | ||
var.setStopFromReferenceAndAlternate( ) | ||
var.splitHGVSc( fields[header.index("hgvs_c")] , override = True ) | ||
|
||
var.splitHGVSc( fields[header.index("hgvs_c")] , override = False ) # refer to pull request #19 | ||
var.splitHGVSp( fields[header.index("hgvs_p")] ) | ||
#var.printVariant( "," ) | ||
#print( var.proteogenomicVar( ) ) | ||
|
@@ -974,21 +994,35 @@ def parseMacPathogenicity( header, fields ): # addded header argument, so can re | |
else: | ||
splitChar="/" # new macarthur format | ||
|
||
if isBenign == 1: | ||
for desc in named.split( splitChar ): | ||
if re.match( "likely", desc.lower( ) ) and desc != chargervariant.benign: | ||
# fixed parsing of conflicting ClinVar classification | ||
if isBenign == 1 and isPathogenic == 1 and int(isConflicted) == 0: | ||
for desc in named.split(splitChar): | ||
if re.match("likely", desc.lower() ) and desc != chargervariant.benign: | ||
desc = chargervariant.likelyBenign | ||
elif re.match( "benign", desc.lower( ) ): | ||
elif re.match( "likely", desc.lower( ) ) and desc != chargervariant.pathogenic: | ||
desc = chargervariant.likelyPathogenic | ||
elif re.match( "benign", desc.lower() ): | ||
desc = chargervariant.benign | ||
break | ||
|
||
if isPathogenic == 1: | ||
for desc in named.split( splitChar ): | ||
if re.match( "likely", desc.lower( ) ) and desc != chargervariant.pathogenic: | ||
desc = chargervariant.likelyPathogenic | ||
elif re.match( "pathog", desc.lower( ) ): | ||
desc = chargervariant.pathogenic | ||
break | ||
else: | ||
if isBenign == 1: | ||
for desc in named.split( splitChar ): | ||
if re.match( "likely", desc.lower( ) ) and desc != chargervariant.benign: | ||
desc = chargervariant.likelyBenign | ||
elif re.match( "benign", desc.lower( ) ): | ||
desc = chargervariant.benign | ||
break | ||
|
||
if isPathogenic == 1: | ||
for desc in named.split( splitChar ): | ||
if re.match( "likely", desc.lower( ) ) and desc != chargervariant.pathogenic: | ||
desc = chargervariant.likelyPathogenic | ||
elif re.match( "pathog", desc.lower( ) ): | ||
desc = chargervariant.pathogenic | ||
break | ||
return [ desc , status ] | ||
|
||
def getMacClinVarVCF( self , vcffile ): | ||
|