forked from ashutoshkpandey/SimplePrograms
-
Notifications
You must be signed in to change notification settings - Fork 0
/
psuedogenome_SNPs.py
103 lines (79 loc) · 2.59 KB
/
psuedogenome_SNPs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
## This script takes reference fasta and a SNP vcf file and creates strain-specific fasta by substituting for SNPs.
## Use of strain-specific genome removes allelic bias during read alignment.
## This script is written for creating strain-specific reference file using Sanger MGP vcf file consiting of 18 strains. You can give the index number of strain for which SNPs need to be substituted.
import sys,os,re,fileinput
Argument = []
Argument = sys.argv[1:]
filename = Argument[0] #Reference fasta
snpfile = Argument[1] #SNP file
strain = int(Argument[2])
def insert_newlines(string, every=60):
return '\n'.join(string[i:i+every] for i in range(0, len(string), every))
def substitute_snp(header,genome,SNP):
name = ""
name = header
genomearray = []
genomearray = list(genome.rstrip(" "))
for snp in SNP:
#print snp
#print genomearray[int(snp)-1]
#print SNP[snp][0]
if (genomearray[int(snp)-1]).lower() == (SNP[snp][0]).lower():
genomearray[int(snp)-1] = SNP[snp][1]
return header+"\n"+insert_newlines("".join(genomearray))+"\n"
def nosnp_format(header, genome):
name = ""
name = header
genomearray = []
genomearray = list(genome.rstrip(" "))
return header+"\n"+insert_newlines("".join(genomearray))+"\n"
SNP = {}
def trim(list):
#listn = []
#listn = list
newlist = []
for a in list:
newlist.append((a.rstrip()).lstrip())
return newlist
for line in fileinput.input([snpfile]):
if line.startswith("#") or not line.strip():
continue
else:
info1 = []
info1 = line.rstrip("\n").split("\t")
info = []
info = trim(info1)
if info[strain] != "." and info[strain].startswith("1/1:1"):
if "," not in info[4]:
if info[0] not in SNP:
SNP[info[0]] = {}
SNP[info[0]][info[1]] = []
SNP[info[0]][info[1]].append(info[3])
SNP[info[0]][info[1]].append(info[4])
else:
SNP[info[0]][info[1]] = []
SNP[info[0]][info[1]].append(info[3])
SNP[info[0]][info[1]].append(info[4])
#print SNP
mousefile = open("CN57_CAST_assembly.fa", "w") #output file
genomefile = open(filename, 'r').read() #reading whole file in one string
genomeb = []
dbafasta = ""
genomeb = genomefile.split(">")
for chr in genomeb[1:]:
fasta = ""
dbfasta = ""
header = ""
genome = []
genome = chr.split("\n")
header = ">"+str(genome[0])
print(genome[0])
if (genome[0].rstrip(" ")).lstrip("chr") in SNP:
fasta = "".join(genome[1:])
dbafasta = substitute_snp(header, fasta, SNP[(genome[0].rstrip(" ")).lstrip("chr")])
mousefile.write(str(dbafasta))
else:
fasta = "".join(genome[1:])
dbafasta = nosnp_format(header, fasta)
mousefile.write(str(dbafasta))
mousefile.close()