-
Notifications
You must be signed in to change notification settings - Fork 2
/
gtf-re.py
35 lines (26 loc) · 802 Bytes
/
gtf-re.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
#!/usr/bin/env python
import re
# Verbose-mode (re.X) pattern for a single GTF record: eight tab-separated
# fixed fields followed by a list of `key value;` attributes.
#
# In verbose mode unescaped whitespace inside the pattern is ignored, so every
# literal space that must be matched is written as `\ `.  The score field
# previously used `' '*`, which verbose mode collapses to `''*` (a mandatory
# apostrophe), so plain numeric scores such as `50` or `0.5` never matched.
#
# NOTE: `[^\t]*` also matches newlines, so a match may begin on a preceding
# tab-free line (e.g. a comment line) and extend into the record after it.
patstr = r"""^[^\t]*\t (?# seqname )
[^\t]*\t (?# source )
[^\t]*\t (?# feature )
\d+\t (?# start )
\d+\t (?# end )
(\.|[+\-]? (\ * \d+ (\. \d*)? | \. \d+) ([eE] [+\-]? \d+)?)\t (?# score)
[+\-\.]\t (?# strand )
[012\.]\t (?# frame)
(\ * [A-Za-z][A-Za-z0-9_]* \ +
(\"\" | \" [^\"\n\r\t]* [^\\] \" | [^\"\s]*) \ * ;)*
(\ * [A-Za-z][A-Za-z0-9_]* \ +
(\"\" | \" [^\"\n\r\t]* [^\\] \" | [^\"\s]*) \ *)?$"""

# re.MULTILINE lets `^` and `$` anchor at every line boundary, so the pattern
# can be run over the whole file contents at once.
pat = re.compile(patstr, re.X | re.MULTILINE)


def count_gtf_records(text):
    """Return the number of non-overlapping matches of ``pat`` in *text*.

    Args:
        text: Contents of a GTF file as a single string.

    Returns:
        The number of matches found by ``pat.finditer``; 0 for empty input.
    """
    return sum(1 for _ in pat.finditer(text))


def main(path="sample.gtf"):
    """Read the file at *path* and print how many records match ``pat``.

    Args:
        path: File to scan; defaults to ``sample.gtf`` in the working
            directory.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even if reading fails.
    with open(path) as handle:
        data = handle.read()
    print(count_gtf_records(data))


if __name__ == "__main__":
    main()