forked from NAL-i5K/GFF3toolkit
-
Notifications
You must be signed in to change notification settings - Fork 0
/
function4gff.py
executable file
·124 lines (114 loc) · 4.23 KB
/
function4gff.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
QC functions for processing multiple features between models (inter-model) in GFF3 file.
"""
import sys
import re
import logging
import string
import random
# Module-wide logger: INFO level, records rendered as "<LEVEL>   <message>".
# The stream handler is attached only when the logger has no handler yet, so
# running this setup again does not stack a second handler on top.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
if not logger.handlers:
    lh = logging.StreamHandler()
    logger.addHandler(lh)
    lh.setFormatter(logging.Formatter('%(levelname)-8s %(message)s'))
def randomID(size=32, chars=string.ascii_uppercase + string.digits):
    """Return a random string of ``size`` characters drawn from ``chars``.

    Every character is picked independently with ``random.choice``. With the
    default arguments the result is 32 characters long and made of ASCII
    uppercase letters and decimal digits.
    """
    picked = []
    for _ in range(size):
        picked.append(random.choice(chars))
    return ''.join(picked)
def FIX_MISSING_ATTR(gff, logger=None):
    """Fill in missing ``owner`` and ``ID`` attributes on the feature lines of ``gff``.

    Every entry of ``gff.lines`` whose ``line_type`` is ``'feature'`` is
    updated in place:

    * a missing ``owner`` attribute is set to ``'Unassigned'``;
    * a missing ``ID`` on a gene, pseudogene, mRNA or pseudogenic_transcript
      is reported as an error (such IDs cannot be invented);
    * a missing ``ID`` on any other feature type is replaced by a value from
      ``randomID()`` that is not yet a key of ``gff.features``, and the line
      is appended to ``gff.features[<new ID>]``.

    A ``KeyError`` raised while handling a line (for instance a line without
    an ``attributes`` entry) is reported as a warning and processing moves on
    to the next line.

    Args:
        gff: object exposing ``lines`` (iterable of line dicts) and
            ``features`` (mapping from ID to a list of line dicts).
            NOTE(review): indexing ``gff.features`` with a brand-new ID must
            yield a list, i.e. presumably a ``defaultdict(list)`` -- confirm
            against the caller.
        logger: logger used for the reports. When omitted, the logger named
            after this module is used.

    Raises:
        SystemExit: with status 1, once all lines have been examined, if at
            least one model-level feature has no ID.
    """
    if logger is None:
        # The parameter shadows the module-level logger, so without this
        # fallback the first report would fail on ``None.error``.
        logger = logging.getLogger(__name__)

    id_required = ('gene', 'pseudogene', 'mRNA', 'pseudogenic_transcript')
    features = [line for line in gff.lines if line['line_type'] == 'feature']
    missing_model_ids = 0
    for f in features:
        try:
            attributes = f['attributes']
            if 'owner' not in attributes:
                attributes['owner'] = 'Unassigned'
            if 'ID' in attributes:
                continue
            if f['type'] in id_required:
                logger.error('[Missing ID] A model needs to have a unique ID, but this feature does not. Please fix it before running the program.\n\t\t- Line {0:s}: {1:s}'.format(str(f['line_index']+1), f['line_raw']))
                missing_model_ids += 1
            else:
                # Draw again until the generated ID does not clash with an
                # ID that is already registered.
                tid = randomID()
                while tid in gff.features:
                    tid = randomID()
                attributes['ID'] = tid
                gff.features[tid].append(f)
        except KeyError:
            logger.warning('[Missing Attributes] Program failed.\n\t\t- Line {0:s}: {1:s}'.format(str(f['line_index']+1), f['line_raw']))
    if missing_model_ids:
        # Non-zero status so that calling scripts can detect the failure.
        sys.exit(1)
def featureSort(linelist, reverse=False):
    """Sort GFF3 feature lines by sequence number, position and feature rank.

    Used by replace_OGS.py and gff3_to_fasta.py

    Lines are grouped by the first run of digits found after the first
    character of their ``seqid`` (``chr10`` gives ``10``). Groups are emitted
    in ascending numeric order whatever the value of ``reverse``; seqids that
    carry the same number (``chr1`` and ``scaffold1``) share one group.

    Inside a group, lines are ordered by ``start`` and then by feature rank
    (gene, then transcript, then exon, then CDS, then every other type). With
    ``reverse`` set, ``end`` is used instead of ``start`` and the order is
    descending, while a parent still precedes its children at equal ``end``.

    Lines are identified by ``str(line['line_raw'])``: lines with identical
    raw text collapse into a single entry (the last one seen).

    Lines whose seqid has no digit after its first character are left out of
    the result, and so is a whole group whose coordinates cannot be converted
    to integers. Both situations are reported as warnings on the logger named
    after this module.

    Args:
        linelist: iterable of line dicts providing ``seqid``, ``type``,
            ``start``, ``end`` and ``line_raw``.
        reverse: sort each group in descending order of ``end`` when true.

    Returns:
        A new list holding the sorted line dicts.
    """
    FEATURECODE = {
        'gene': 0,
        'pseudogene': 0,
        'mRNA': 1,
        'rRNA': 1,
        'tRNA': 1,
        'miRNA': 1,
        'snRNA': 1,
        'pseudogenic_transcript': 1,
        'transcript': 1,
        'exon': 2,
        'pseudogenic_exon': 2,
        'CDS': 3,
    }
    log = logging.getLogger(__name__)
    descending = bool(reverse)
    # The rank is negated for a descending sort so that a parent feature
    # still comes out before its children when their positions are equal.
    rank_sign = -1 if descending else 1
    position_key = 'end' if descending else 'start'
    # Raw string: '\d' inside a plain literal is an invalid escape sequence.
    seqnum_pattern = re.compile(r'(.+?)(\d+)')

    id2line = {}
    id2index = {}
    seq2id = {}
    skipped = {}
    for line in linelist:
        raw = str(line['line_raw'])
        id2line[raw] = line
        id2index[raw] = (line[position_key], rank_sign * FEATURECODE.get(line['type'], 99))
        match = seqnum_pattern.search(line['seqid'])
        if match is None:
            skipped[line['seqid']] = skipped.get(line['seqid'], 0) + 1
            continue
        seq2id.setdefault(match.group(2), []).append(raw)

    for seqid, count in skipped.items():
        log.warning('[Missing SeqID] %d line(s) with seqid %r were left out of the sorted output: no digit found after the first character of the seqid.', count, seqid)

    newlinelist = []
    for seqnum in sorted(seq2id, key=int):
        # dict.fromkeys drops repeated raw lines while keeping first-seen order.
        unique_ids = dict.fromkeys(seq2id[seqnum])
        try:
            ordered = sorted(
                unique_ids,
                key=lambda raw: (int(id2index[raw][0]), int(id2index[raw][1])),
                reverse=descending,
            )
        except (TypeError, ValueError) as exc:
            # A non-numeric coordinate makes the whole group unsortable.
            log.warning('[Sorting failed] %d line(s) of sequence group %s were left out of the sorted output: %s', len(unique_ids), seqnum, exc)
            continue
        newlinelist.extend(id2line[raw] for raw in ordered)
    return newlinelist
def extract_internal_detected_errors(gff):
    """Turn the errors already recorded on the lines of ``gff`` into report entries.

    Every item of a line's ``line_errors`` becomes one dict with the keys:

    * ``ID``: one-element list holding the line's ``ID`` attribute, or
      ``['NA']`` when the line has no usable ``attributes``/``ID``;
    * ``line_num``: one-element list ``['Line <line_index + 1>']``;
    * ``eCode``: the error's ``eCode`` value;
    * ``eLines``: one-element list holding the line dict itself;
    * ``eTag``: the error's ``message`` value.

    If an error item cannot be converted, the line's raw text is logged as an
    error and the remaining items of that line are skipped; entries already
    produced are kept.

    Args:
        gff: object exposing ``lines``, an iterable of line dicts that each
            provide ``line_errors``.

    Returns:
        The list of entries in line order, or ``None`` when there is none.
    """
    error_lines = [line for line in gff.lines if line['line_errors']]
    eSet = []
    for line in error_lines:
        # The ID does not depend on the individual error, so look it up once.
        try:
            feature_id = line['attributes']['ID']
        except (KeyError, TypeError):
            feature_id = 'NA'
        try:
            for e in line['line_errors']:
                eSet.append({
                    'ID': [feature_id],
                    'line_num': ['Line {0:s}'.format(str(line['line_index'] + 1))],
                    'eCode': e['eCode'],
                    'eLines': [line],
                    'eTag': e['message'],
                })
        except Exception:
            # Best effort: report the offending line and carry on, without
            # trapping KeyboardInterrupt/SystemExit as a bare except would.
            logger.error(line['line_raw'])
    return eSet or None