v0.5.3 - bug fixes in parsing of ClinVar information

Changes include: - Fixed parseMacPathogenicity() to handle variants with multiple submitters that received both benign and pathogenic classifications, but no conflict is reported (i.e. `isPathogenic == 1 and isBenign == 1 and isConflicted == 0`) - Fixed bug reported in pull request #19: var.splitHGVSc, which doesn't consider strand information. When override = True, the ref and alt for genomic variants would be wrongly changed for minus strand transcripts. Changed to override = False - Fixed bug reported in pull request #19: fixed corrdinates in getMacClinVarTSV() to match readVCF().
ding-lab · Sep 30, 2019 · df097ed · df097ed
1 parent 5a5e006
commit df097ed
Showing 1 changed file with 52 additions and 18 deletions.
diff --git a/charger/charger.py b/charger/charger.py
@@ -5,7 +5,7 @@
 #	- Fernanda Martins Rodrigues ([email protected])
 #	- Jay R. Mashl ([email protected])
 #	- Kuan-lin Huang ([email protected])
-# version: v0.5.2
+# version: v0.5.3 - September, 2019
 
 import os
 import sys
@@ -904,26 +904,46 @@ def getMacClinVarTSV( self , tsvfile ):
 			for line in macFile:
 				fields = ( line.rstrip( ) ).split( "\t" )
 				[ description , status ] = self.parseMacPathogenicity( header, fields ) # no need to specify which fields here anymore; parseMacPathogenicity now knows which specific columns to look for
+				# fixed coordinates for clinvar file (refer to pull request #19)
+				pos = int(fields[header.index("pos")])
+				ref = fields[header.index("ref")]
+				alt = fields[header.index("alt")]
+				if len(ref) == 1 and len(alt) > 1: # insertion
+					ref = '-'
+					alt = alt[1:]
+					start = pos
+					stop = pos + 1
+				elif len(ref) > 1 and len(alt) == 1: # deletion
+					ref = ref[1:]
+					alt = '-'
+					start = pos + 1
+					stop = pos + len(ref)
+				else: # snv
+					start = pos
+					stop = pos
+
 				if len(header) > 27: # if yes, file is in the new format
 					var = clinvarvariant( chromosome = fields[header.index("chrom")] , \
-										  start = fields[header.index("pos")] , \
-										  reference = fields[header.index("ref")] , \
-										  alternate = fields[header.index("alt")] , \
+										  start = start , \
+										  stop = stop , \
+										  reference = ref , \
+										  alternate = alt , \
 										  uid = fields[header.index("variation_id")], \
 										  gene = fields[header.index("symbol")] , \
 										  clinical = { "description" : description , "review_status" : status } , \
 										  trait = { fields[header.index("xrefs")] : fields[header.index("all_traits")] } )
 				else: # file in the old format
 					var = clinvarvariant( chromosome = fields[header.index("chrom")] , \
-										  start = fields[header.index("pos")] , \
-										  reference = fields[header.index("ref")] , \
-										  alternate = fields[header.index("alt")] , \
+										  start = start , \
+										  stop = stop , \
+										  reference = ref , \
+										  alternate = alt , \
 										  uid = fields[header.index("measureset_id")], \
 										  gene = fields[header.index("symbol")] , \
 										  clinical = { "description" : description , "review_status" : status } , \
 										  trait = { fields[-1] : fields[header.index("all_traits")] } )
-				var.setStopFromReferenceAndAlternate( )
-				var.splitHGVSc( fields[header.index("hgvs_c")] , override = True )
+
+				var.splitHGVSc( fields[header.index("hgvs_c")] , override = False ) # refer to pull request #19
 				var.splitHGVSp( fields[header.index("hgvs_p")] )
 				#var.printVariant( "," )
 				#print( var.proteogenomicVar( ) )
@@ -974,21 +994,35 @@ def parseMacPathogenicity( header, fields ): # addded header argument, so can re
 		else:
 			splitChar="/" # new macarthur format
 
-		if isBenign == 1:
-			for desc in named.split( splitChar ):
-				if re.match( "likely", desc.lower( ) ) and desc != chargervariant.benign:
+		# fixed parsing of conflicting ClinVar classification
+		if isBenign == 1 and isPathogenic == 1 and int(isConflicted) == 0:
+			for desc in named.split(splitChar):
+				if re.match("likely", desc.lower() ) and desc != chargervariant.benign:
 					desc = chargervariant.likelyBenign
-				elif re.match( "benign", desc.lower( ) ):
+				elif re.match( "likely", desc.lower( ) ) and desc != chargervariant.pathogenic:
+					desc = chargervariant.likelyPathogenic
+				elif re.match( "benign", desc.lower() ):
 					desc = chargervariant.benign
 					break
-
-		if isPathogenic == 1:
-			for desc in named.split( splitChar ):
-				if re.match( "likely", desc.lower( ) ) and desc != chargervariant.pathogenic:
-					desc = chargervariant.likelyPathogenic
 				elif re.match( "pathog", desc.lower( ) ):
 					desc = chargervariant.pathogenic
 					break
+		else:
+			if isBenign == 1:
+				for desc in named.split( splitChar ):
+					if re.match( "likely", desc.lower( ) ) and desc != chargervariant.benign:
+						desc = chargervariant.likelyBenign
+					elif re.match( "benign", desc.lower( ) ):
+						desc = chargervariant.benign
+						break
+
+			if isPathogenic == 1:
+				for desc in named.split( splitChar ):
+					if re.match( "likely", desc.lower( ) ) and desc != chargervariant.pathogenic:
+						desc = chargervariant.likelyPathogenic
+					elif re.match( "pathog", desc.lower( ) ):
+						desc = chargervariant.pathogenic
+						break
 		return [ desc , status ]
 
 	def getMacClinVarVCF( self , vcffile ):