-
Notifications
You must be signed in to change notification settings - Fork 23
/
Copy pathExtractSequence.py
executable file
·101 lines (73 loc) · 2.7 KB
/
ExtractSequence.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
##########################################################################################
## ##
## Muyang Xu Sam ([email protected]) ##
## Computational Biology Core ##
## Institute for Systems Genomics ##
## University of Connecticut ##
## Copyright (2018) ##
## ##
## The script will generate FASTA sequcnes using a given FASTA file according to a ##
## Gfold output file ##
## ##
## The script takes three arguments, ##
## 1) A diff file generated from GFOLD ##
## 2) A fasta file containning either protein sequences or nucleotide sequences ##
## 3) A number n indicating first n gene IDs users want to select ##
## ##
##########################################################################################
'''
The script takes three arguments, a diff file generated from GFOLD, a fasta file
containning either protein sequences or nucleotide sequences, and a number n
indicating first n gene IDs users want to select.
'''
import sys
import csv
from operator import itemgetter
diffFile = sys.argv[1] # .diff file generated by GFOLD
fastaFile = sys.argv[2] # fasta file containning GeneIDs and sequence
n = int(sys.argv[3]) # number of gene IDs to be extracted
'''
diffFile = "/home/sam/workspace/python/Tutorial/cotreated_VS_elevated.diff"
fastaFile = "centroids.fasta.transdecoder.cds"
n = 3
'''
'''
load diff file as a table
'''
with open(diffFile) as diff:
reader = csv.reader(diff, delimiter="\t")
geneList = list(reader)
geneList = geneList[11:len(geneList)] ##exclude header
'''
sort genes based on GFOLD value
'''
for row in range(0, len(geneList)):
geneList[row][2] = abs(float(geneList[row][2])) ##convert GFOLD value to float
geneList = sorted(geneList, key=itemgetter(2), reverse=True) ##sort genes descending order
geneIDs = []
for i in range(n):
geneIDs.append(geneList[0 + i][0].split(":")[0]) ##store first n gene IDs into a list
#print geneIDs
'''
load fasta file
'''
cds = open(fastaFile, 'r')
lines = cds.readlines()
oneLine = '\t'.join([line.strip() for line in lines])
sequence = oneLine.split(">") ##seperate file into a lists individual sequence
'''
search and extract seqences
'''
sqList = []
output = open("ExtractedSq.fasta", 'w') ##create a output fasta file
for item in sequence:
for id in geneIDs:
if item.startswith(id):
sq = item.split('\t', 1)[1]
sq = sq.replace('\t', '') ##reformat sequence to one line
geneIDs.remove(id)
sqList.append(sq)
output.write(">" + id+ "\n" + sq + "\n" + "\n") ##write sequnces in fasta format
else:
pass
print "Number of Sequences extracted:", len(sqList)