Skip to content

Commit

Permalink
Add protein name to the GTF ID.
Browse files Browse the repository at this point in the history
This prevents ID collisions when the outputs of multiple miniprot runs
are merged into one file. This can happen when, e.g., multiple
separate protein FASTA files are supplied to GALBA.
  • Loading branch information
tomasbruna committed Mar 26, 2023
1 parent 5691166 commit a38f300
Showing 1 changed file with 8 additions and 5 deletions.
13 changes: 8 additions & 5 deletions scorer2gtf.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ def loadStopCodons(scorerFile):
proteinEnd = extractAttribute(row, "proteinEnd")
if proteinEnd == "1":
parent = extractAttribute(row, "Parent")
validStops.add(parent)
prot = extractAttribute(row, "prot")
validStops.add(f'{parent}_{prot}')
return validStops


Expand All @@ -37,25 +38,27 @@ def convert(scorerFile):
for row in csv.reader(open(scorerFile), delimiter='\t'):
if row[2] == "mRNA":
ID = extractAttribute(row, "ID")
prot = extractAttribute(row, "prot")
row[2] = "transcript"
row[8] = f'transcript_id "{ID}"; gene_id "{ID}";'
row[8] = f'transcript_id "{ID}_{prot}"; gene_id "{ID}_{prot}";'

if ID in validStops:
if f'{ID}_{prot}' in validStops:
if row[6] == '+':
row[4] = str(int(row[4]) + 3)
else:
row[3] = str(int(row[3]) - 3)

gene = row.copy()
gene[2] = "gene"
gene[8] = f'gene_id "{ID}";'
gene[8] = f'gene_id "{ID}_{prot}";'
print("\t".join(gene))
print("\t".join(row))
elif row[2] == "CDS":
score = extractAttribute(row, "eScore")
parent = extractAttribute(row, "Parent")
prot = extractAttribute(row, "prot")
row[5] = score
row[8] = f'transcript_id "{parent}"; gene_id "{parent}";'
row[8] = f'transcript_id "{parent}_{prot}"; gene_id "{parent}_{prot}";'
exon = row.copy()
exon[2] = "exon"
exon[7] = "."
Expand Down

0 comments on commit a38f300

Please sign in to comment.