-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathroc_simulation.py
243 lines (214 loc) · 10.1 KB
/
roc_simulation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
# coding=utf-8
"""
Calculates :math:`\\rho` and Tajima's D for neutral and non-neutral data simulated with MSMS.
Simulations are conditioned either on :math:`\\theta` or on the number of segregating sites.
The data is intended to be used to plot roc curves (see relevant Jupyter notebook).
The script invokes a Java program (msms.jar) and must be run in a directory containing the msms.jar file.
A sample run statement is:
nohup python3 /Users/helmutsimon/repos/NeutralityTest/roc_simulation.py 2s00a 10 0 -t 10 -sho 2 -sf 1e-2 > rs_2s00a.txt &
"""
import numpy as np
import os, sys
import gzip, pickle
from selectiontest import selectiontest
from time import time
from collections import Counter
import msprime
import subprocess
import click
import pandas as pd
import scipy
from scitrack import CachingLogger, get_file_hexdigest
LOGGER = CachingLogger(create_dir=True)
def compute_sfs(variant_array):
    """
    Compute the site frequency spectrum from a variant array output by MSMS.

    Each row of ``variant_array`` corresponds to a segregating site and each
    column to a sampled haplotype, with 0/1 entries marking the derived
    allele. Returns a numpy array of length n - 1 (n = number of samples)
    whose (i - 1)-th element is the number of sites at which the derived
    allele occurs in exactly i samples.
    """
    num_samples = variant_array.shape[1]
    # Count how many samples carry the derived allele at each site,
    # then tally how often each count occurs.
    derived_counts = Counter(np.sum(variant_array, axis=1))
    return np.array([derived_counts[k] for k in range(1, num_samples)])
def read_demography_file(filename, pop_size):
    """
    Read demographic history file and translate into MSMS commands.

    The file uses absolute population size and exponential growth rate per
    generation. These are scaled by 4N_0 (pop_size) for MSMS. The input file
    format used is compatible with msprime, and msprime event objects are
    also returned so the caller can print a description of the demographic
    history.

    Parameters
    ----------
    filename : str, pandas.DataFrame or None
        Path to a .csv file (extension appended if missing) with columns
        'time', 'size' and 'rate', or an equivalent DataFrame. If None,
        no demographic change events are generated.
    pop_size : float
        Effective population size N_0 used for rescaling.

    Returns
    -------
    tuple(list of str, list of msprime.PopulationParametersChange)
        MSMS command fragments and the corresponding msprime events.

    Raises
    ------
    ValueError
        If filename is neither None, a string, nor a DataFrame.
    """
    if filename is None:
        # Return a pair so callers can always unpack a 2-tuple (previously
        # this path returned a bare list, inconsistent with the other paths).
        return list(), list()
    if isinstance(filename, str):
        if not filename.endswith('.csv'):
            filename = filename + '.csv'
        # Open in a context manager (always closed) purely to log the input.
        with open(filename, 'r') as infile:
            LOGGER.input_file(infile.name)
        demo_parameters = pd.read_csv(filename)
    elif isinstance(filename, pd.DataFrame):
        demo_parameters = filename
    else:
        raise ValueError('Events_file parameter wrong type: ' + str(type(filename)))
    dem_cmds = list()
    demographic_events = list()
    for index, row in demo_parameters.iterrows():
        # MSMS expects time in units of 4N generations, sizes relative to N_0
        # and growth rates scaled by 4N.
        t = str(row['time'] / (4 * pop_size))
        size_change = str(row['size'] / pop_size)
        growth_rate = str(row['rate'] * 4 * pop_size)
        ppc = msprime.PopulationParametersChange(time=row['time'],
                initial_size=row['size'], growth_rate=row['rate'])
        demographic_events.append(ppc)
        event = 'time: ' + str(row['time']) + ', size: ' + str(row['size']) +\
                ', rate: ' + str(row['rate'])
        LOGGER.log_message(event, label='Demographic change event'.ljust(50))
        dem_cmds = dem_cmds + ["-eN", t, size_change, "-eG", t, growth_rate]
    return dem_cmds, demographic_events
def run_simulations(reps, pop_size, n, theta, seg_sites, growth_rate, SAA, SaA, SF, events_file, recombination_rate):
    """
    Run coalescent simulations with MSMS and return its raw (bytes) output.

    Simulations are conditioned either on theta or on the number of
    segregating sites (exactly one of ``theta`` / ``seg_sites`` may be set).
    Selection parameters (SAA, SaA, SF) cannot be combined with demographic
    change (growth or an events file).

    Raises
    ------
    ValueError
        If both seg_sites and theta are given, or selection is combined
        with demographic change.
    subprocess.CalledProcessError
        If the MSMS java process exits with a non-zero status.
    """
    # Construct MSMS parameters for demographic history and print a summary using msprime.
    if events_file is None:
        # BUG FIX: demographic_events was previously left unassigned on this
        # path, causing a NameError at the DemographyDebugger call below.
        events_commands, demographic_events = list(), list()
    else:
        events_commands, demographic_events = read_demography_file(events_file, pop_size)
    population_configurations = [msprime.PopulationConfiguration(initial_size=pop_size,
            sample_size=n, growth_rate=growth_rate)]
    demographic_hist = msprime.DemographyDebugger(demographic_events=demographic_events,
            population_configurations=population_configurations)
    demographic_hist.print_history()
    sys.stdout.flush()
    # Build the MSMS command line, conditioning either on theta or seg_sites and including selection if required.
    msms_cmd = ["java", "-jar", "msms.jar", str(n), str(reps), "-N", str(pop_size)] + events_commands
    if growth_rate:
        # MSMS expects the growth rate scaled by 4N.
        msms_cmd = msms_cmd + ["-G", str(growth_rate * 4 * pop_size)]
    if seg_sites is not None and theta is not None:
        raise ValueError('seg_sites and theta parameters cannot both be valid.')
    if seg_sites is not None:
        msms_cmd = msms_cmd + ["-s", str(seg_sites)]
    if theta is not None:
        msms_cmd = msms_cmd + ["-t", str(theta)]
    if SAA is not None:
        if events_file is not None or growth_rate:
            raise ValueError('Cannot combine demographic change and selection.')
        if SaA is None:
            # Default to additive (genic) selection for the heterozygote.
            SaA = SAA / 2
        msms_cmd = msms_cmd + ["-SAA", str(SAA), "-SaA", str(SaA), "-SF", str(SF)]
    if recombination_rate:
        msms_cmd = msms_cmd + ["-r", str(recombination_rate)]
    print(msms_cmd)
    # Run MSMS command and return its stdout as bytes.
    return subprocess.check_output(msms_cmd)
def process_simulation_output(msms_out, variates0, variates1, reps):
    """
    Process MSMS output line by line (each line corresponding to a single simulation).
    We use a common set of variates for all calls to test_neutrality.

    Returns a tuple (trs, taj_D, sfs_list): rho values, Tajima's D values and
    the site frequency spectra of the retained replicates.
    """
    sfs_list, trs, taj_D = list(), list(), list()
    # Counters for replicates discarded for too few segregating sites or a
    # nan value of rho, logged at the end.
    not_enough, err_nan = 0, 0
    ms_lines = msms_out.splitlines()
    variant_array = list()
    # Skip the MSMS header (first 6 lines), then parse replicate by replicate.
    for line in ms_lines[6:]:
        line = line.decode('unicode_escape')
        if line[:2] in ['//', 'se', 'po']:
            # Replicate header lines ('//', 'segsites:', 'positions:'):
            # reset the accumulator for a new replicate.
            variant_array = list()
            continue
        if line != "":
            # A haplotype row: split the 0/1 string into single characters.
            variant_array.append([*line])
        else:
            # Blank line ends a replicate: transpose so rows are sites and
            # columns are samples, then compute statistics.
            variant_array = np.array(variant_array, dtype=int).T
            if variant_array.shape[0] < 4:
                # Too few segregating sites for a meaningful test; the next
                # replicate's header line resets variant_array.
                not_enough += 1
                continue
            sfs = compute_sfs(variant_array)
            sfs_list.append(sfs)
            tr = selectiontest.test_neutrality(sfs, variates0=variates0, variates1=variates1, reps=reps)
            if np.isnan(tr):
                err_nan += 1
            else:
                trs.append(tr)
                # NOTE(review): taj_D is collected only when rho is valid,
                # so trs and taj_D cover the same replicates — confirm this
                # pairing is intended (source indentation was ambiguous).
                taj_D.append(selectiontest.calculate_D(sfs))
            variant_array = list()
    LOGGER.log_message(str(not_enough), label='# Discarded for insufficient segregating sites: ')
    LOGGER.log_message(str(err_nan), label='# Discarded for returning nan value for rho : ')
    return trs, taj_D, sfs_list
def _sample_variates(n, reps):
    """Draw fresh variates for test_neutrality: ``reps`` samples from the
    Wright-Fisher (null) distribution and from the uniform (alternative)
    distribution, for sample size ``n``."""
    variates0 = np.empty((reps, n - 1), dtype=float)
    for i, q in enumerate(selectiontest.sample_wf_distribution(n, reps)):
        variates0[i] = q
    variates1 = selectiontest.sample_uniform_distribution(n, reps)
    return variates0, variates1


def _dump_and_log(obj, outfile_name):
    """Pickle ``obj`` to a gzipped file and register it with the logger."""
    with gzip.open(outfile_name, 'wb') as outfile:
        pickle.dump(obj, outfile)
    # Pass the path directly; previously the file was re-opened in text mode
    # just to obtain .name and was never closed.
    LOGGER.output_file(outfile_name)


@click.command()
@click.argument('job_no')
@click.argument('n', type=int)
@click.argument('growth_rate', type=float)
@click.option('-seg', 'seg_sites', type=int, default=None)
@click.option('-t', 'theta', type=float, default=None)
@click.option('-N', '--pop_size', type=float, default=100000)
@click.option('-rep', '--reps', type=int, default=10000)
@click.option('-sho', '--sho', type=float, default=None)
@click.option('-she', '--she', type=float, default=None)
@click.option('-sf', '--sf', type=float, default=None)
@click.option('-e', '--events_file', default=None)
@click.option('-r', '--recomb_rate', default=0, type=float)
@click.option('-d', '--dirx', default='data', type=click.Path(),
              help='Directory name for data and log files. Default is data')
def main(reps, job_no, pop_size, n, seg_sites, theta, growth_rate, sho, she, sf, events_file, recomb_rate, dirx):
    """Simulate neutral and non-neutral data with MSMS, compute rho and
    Tajima's D for each replicate, and pickle the results for ROC plotting."""
    start_time = time()
    np.set_printoptions(precision=3)
    if not os.path.exists(dirx):
        os.makedirs(dirx)
    LOGGER.log_file_path = dirx + "/" + str(os.path.basename(__file__)) + '_' + job_no + ".log"
    LOGGER.log_args()
    LOGGER.log_message(get_file_hexdigest(__file__), label="Hex digest of script.".ljust(17))
    try:
        LOGGER.log_message(str(os.environ['CONDA_DEFAULT_ENV']), label="Conda environment.".ljust(17))
    except KeyError:
        # Not running inside a conda environment; nothing to log.
        pass
    label = "Imported module".ljust(30)
    for module in (np, pd, scipy, msprime, selectiontest):
        LOGGER.log_message('Name = ' + module.__name__ + ', version = ' + module.__version__, label=label)
    results = dict()

    # Neutral (null hypothesis) simulations: no growth, no selection, no events.
    print('Neutral population')
    msms_out = run_simulations(reps, pop_size, n, theta, seg_sites, 0, None, None, None, None, recomb_rate)
    variates0, variates1 = _sample_variates(n, reps)
    trs, taj_D, sfs_list = process_simulation_output(msms_out, variates0, variates1, reps)
    results['rho_false'] = trs
    results['taj_false'] = taj_D
    sfs_list = np.array(sfs_list)
    # BUG FIX: write into dirx; this path was previously hard-coded to
    # 'data/', silently ignoring the -d/--dirx option.
    _dump_and_log(sfs_list, dirx + '/sfs_neutral_' + job_no + '.pklz')
    LOGGER.log_message(str(np.mean(sfs_list, axis=0)), label='Mean sfs constant population'.ljust(50))

    # Non-neutral simulations using the user-supplied growth / selection /
    # demographic-event parameters.
    print('Non-neutral population')
    msms_out = run_simulations(reps, pop_size, n, theta, seg_sites, growth_rate, sho, she, sf, events_file, recomb_rate)
    variates0, variates1 = _sample_variates(n, reps)
    trs, taj_D, sfs_list = process_simulation_output(msms_out, variates0, variates1, reps)
    results['rho_true'] = trs
    results['taj_true'] = taj_D
    sfs_list = np.array(sfs_list)
    _dump_and_log(sfs_list, dirx + '/sfs_non_neutral_' + job_no + '.pklz')
    LOGGER.log_message(str(np.mean(sfs_list, axis=0)), label='Mean sfs non-neutral population'.ljust(50))

    # Combined results for the ROC notebook.
    _dump_and_log(results, dirx + '/roc_data_' + job_no + '.pklz')
    duration = time() - start_time
    LOGGER.log_message("%.2f" % (duration / 60.), label="Run duration (minutes)".ljust(50))


if __name__ == "__main__":
    main()