-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathlong_read_analysis_2
66 lines (62 loc) · 3 KB
/
long_read_analysis_2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
# initialise some variables to count short, medium length and long LTR5_Hs alignments.
long = 0
mid = 0
short = 0
count = 0
zero = 0
# specify the input path, this will be a directory containing multiple outputs from repeatmasker for a given tool.
input_path = filtered_ervcaller_path
# loop through each file in the directory.
for filename in os.listdir(input_path):
count = count + 1
curr_len = 0
file = open(input_path + filename) # open the file and loop through it.
for line in file:
if len(line.split()) > 10: # ensure that the line is not empty.
if line.split()[0] != 'bit' and line.split()[0] != 'score': # check that the current line is not the title line.
if line.split()[9] == 'LTR5_Hs': # check if the line contains the LTR5_Hs identifier.
if line.split()[8] == '+': # find the length of the LTR5_Hs, we need to differentiate between forward and reverse orientation.
len_insert = int(line.split()[12]) - int(line.split()[11])
elif line.split()[8] == 'C':
len_insert = int(line.split()[12]) - int(line.split()[13])
if len_insert > curr_len: # if the current LTR5_Hs length is longer than any previous LTR5_Hs from this file, update curr_len with this new length.
curr_len = len_insert
if curr_len > 850: # curr_len = the longest LTR5_Hs alignment from this file, fit it into one of the three length categories (or zero if there were no LTR5_Hs).
long += 1
if curr_len > 400:
mid += 1
if curr_len > 0:
short += 1
if curr_len == 0:
zero += 1
print('LTR5_Hs results:', long/count, mid/count, short/count, zero/count, count)
# the following script is identical to the above except it looks for lines which have any kind of LTR or ERV element, as opposed to jsut LTR5_Hs.
long = 0
mid = 0
short = 0
count = 0
zero = 0
for filename in os.listdir(input_path):
count = count + 1
curr_len = 0
file = open(input_path + filename)
for line in file:
if len(line.split()) > 10:
if line.split()[0] != 'bit' and line.split()[0] != 'score':
if line.split()[10] == 'LTR/ERVK' or line.split()[9] == 'HERVK-int'\
or 'LTR5' in line.split()[9] or 'ERV' in line.split()[10]:
if line.split()[8] == '+':
len_insert = int(line.split()[12]) - int(line.split()[11])
elif line.split()[8] == 'C':
len_insert = int(line.split()[12]) - int(line.split()[13])
if len_insert > curr_len:
curr_len = len_insert
if curr_len > 850:
long += 1
if curr_len > 400:
mid += 1
if curr_len > 0:
short += 1
if curr_len == 0:
zero += 1
print('genreal ERV results:', long/count, mid/count, short/count, zero/count, count)