# synthetic_fit_funcs.py
import numpy as np
import networkx as nx
import scipy.linalg as la
from tqdm import tqdm
import pandas as pd
import scipy.stats as stats
from numpy.random import dirichlet
from joblib import Parallel, delayed
def cov_to_corr(cov_matrix):
"""Computes correlation matrix from covariance matrix
:param cov_matrix: covariance matrix
:type cov_matrix: np.array
:return: correlation matrix
:rtype: np.array
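    Example (illustrative)::

        C = np.array([[4., 2.],
                      [2., 9.]])
        cov_to_corr(C)  # -> [[1., 1/3], [1/3, 1.]]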
"""
Dinv = np.diag(1 / np.sqrt(np.diag(cov_matrix)))
return Dinv @ cov_matrix @ Dinv
def dir_fun(x,pcount):
"""Diriclet function with parameter scaled by desired pseudocount
:param x: underlying parameters of distribution (with 0s)
:type x: np.array
:param pcount: pseudocount to add to distribution to avoid 0s. Default 1.
:type pcount: int
:return: samples
:rtype: np.array
"""
a = x+pcount
f = dirichlet(a)
return f
def geo_mean(vec):
"""Computes the geometric mean of a vector
:param vec: vector of numbers
:type vec: np.array
:return: geometric mean of the vector
:rtype: float
"""
return np.exp(np.log(vec).mean())
def geo_mean_nz(vec):
"""Computes the geometric mean of a vector, ignoring 0 entries
:param vec: vector of numbers
:type vec: np.array
:return: geometric mean of the vector with 0 entries removed
:rtype: float
"""
return np.exp(np.log(vec[vec != 0]).mean())
def robust_clr(data):
"""Computes the Robust Centered Log Ratio, which leaves 0s in the data and does not use them in computing CLR
:param data: Data matrix
:type data: np.array
:return: Robust CLR of the data
:rtype: np.array
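    Example (illustrative)::

        X = np.array([[1., 0., 4.],
                      [2., 8., 0.]])
        R = robust_clr(X)  # zeros remain zero; per-column geometric means ignore them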
"""
rgeos = np.apply_along_axis(geo_mean_nz,0,data)
dpls = data.copy()
whr0 = np.where(data == 0)
dpls[whr0] = 1
rclr = np.log(dpls/rgeos)
rclr[whr0] = 0
return rclr
def safelog(x,sval = 0):
"""Computes log or alternative return value for 0
:param x: Argument of log
:type x: float
:param sval: return value for log(0)
:type sval: float
:return: log(x) or sval
:rtype: float
"""
if x > 0:
return np.log(x)
else:
return sval
def proj(v1,v2):
"""Projection of vector v1 onto the direction of vector v2
:param v1: vector to be projected
:type v1: np.array
:param v2: direction of resulting projection
:type v2: np.array
:return: Vector in the direction of v2 (component of v1 in direction v2)
:rtype: np.array
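    Example (illustrative)::

        proj(np.array([1., 1.]), np.array([2., 0.]))  # -> array([1., 0.])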
"""
if sum(abs(v2)):
return v2*(np.dot(v2,v1)/np.dot(v2,v2))
else:
return v1*0
def rand_in_perp_old(S,mxvars = 1,minvars = 0.5,bias = 0,pos = None,neg = None):
"""Find a random vector perpendicular to a given set of vectors. L2 norm of the vector will be chosen randomly in (minvars,mxvars). Additionally can
choose in or near the dual cone to a set of vectors.
:param S: Vectors (as rows) that new vector should be perpendicular to
:type S: np.array
:param mxvars: maximum scale of vector
:type mxvars: float
:param minvars: minimum scale of vector
:type minvars: float
    :param bias: Extent to bias towards a dual cone. 1 chooses from the dual cone, <1 chooses "close to" the dual cone, 0 does not use the dual cone.
:type bias: float
:param pos: set of vectors for which result will have positive dot product (or biased towards)
:type pos: np.array
:param neg: set of vectors for which result will have negative dot product (or biased towards)
:type neg: np.array
:return: random vector
:rtype: np.array
"""
if (hasattr(pos,"__len__") or hasattr(neg,"__len__")) and (bias !=0):
if (hasattr(pos,"__len__") and hasattr(neg,"__len__")):
# print(pos.shape)
# print(neg.shape)
K = np.concatenate([pos,-neg],axis = 0)
elif hasattr(pos,"__len__"):
# print("pos",pos.shape)
K = pos
else:
# print("neg",neg.shape)
K = -neg
### Rows of K is the cone of vectors we want to biased towards positive dot product with. First we (may) need
#project on the null space of the S.
#need an orthonormal basis perpendicular to the rows of S, i.e. ker(S)
if S.shape[0]:
NS = la.null_space(S)
#now project onto there....
#So we need a basis for S
SRng = la.orth(S.T)
SBasis = np.concatenate([SRng,NS],axis=1)
else:
NS = np.eye(S.shape[1])
SRng = np.array([[]])
SBasis = NS
### And finally the projection:
K_S = la.solve(SBasis,K.T)
if K_S.shape[1]:
K_SPerp = K_S[SRng.shape[1]:]#the final N-S components are the coordinates in S^Perp
####Next we need a vector from K dual.
### This is the vectors y such that Ky >= 0
## so choose a positive vector (or close to it if the bias is not 1)
x = (np.random.rand(K_S.shape[1]) - 0.5) + bias/2
u = la.lstsq(K_SPerp.T,x)[0]
            ### Add a random vector from the null space of K_SPerp.T (likely redundant, but harmless)
            NSK = la.null_space(K_SPerp.T)
            u = u + np.dot(NSK,np.random.rand(NSK.shape[1]))
r = np.dot(NS,u)
r = (minvars + (mxvars-minvars)*np.random.rand())*r/np.linalg.norm(r)
else:
#then pick a random vector from there.
r = np.dot(NS,np.random.rand(NS.shape[1])-0.5)
r = (minvars + (mxvars-minvars)*np.random.rand())*r/np.linalg.norm(r)
else:
if S.shape[0]:
#need an orthonormal basis perpendicular to the rows of S, i.e. ker(S)
NS = la.null_space(S)
#then pick a random vector from there.
r = np.dot(NS,np.random.rand(NS.shape[1])-0.5)
r = (minvars + (mxvars-minvars)*np.random.rand())*r/np.linalg.norm(r)
else:
r = np.random.rand(S.shape[1]) - 0.5
r = (minvars + (mxvars-minvars)*np.random.rand())*r/np.linalg.norm(r)
return r
def rand_in_perp(S,mxvars = 1,minvars = 0.5,bias = 0,pos = None,neg = None,maxit = 0):
"""Find a random vector perpendicular to a given set of vectors. L2 norm of the vector will be chosen randomly in (minvars,mxvars). Additionally can
choose in or near the dual cone to a set of vectors.
:param S: Vectors (as rows) that new vector should be perpendicular to
:type S: np.array
:param mxvars: maximum scale of vector
:type mxvars: float
:param minvars: minimum scale of vector
:type minvars: float
    :param bias: Extent to bias towards a dual cone. 1 chooses from the dual cone, <1 chooses "close to" the dual cone, 0 does not use the dual cone.
:type bias: float
:param pos: set of vectors for which result will have positive dot product (or biased towards)
:type pos: np.array
:param neg: set of vectors for which result will have negative dot product (or biased towards)
:type neg: np.array
:param maxit: Number of iterations to attempt to find vector in cone before giving up.
:type maxit: int
:return: random vector
:rtype: np.array
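    Example (illustrative)::

        S = np.array([[1., 0., 0.]])
        r = rand_in_perp(S, mxvars=1.0, minvars=0.5)
        # r is orthogonal to [1,0,0] (r[0] ~ 0) with 0.5 <= np.linalg.norm(r) <= 1.0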
"""
if (hasattr(pos,"__len__") or hasattr(neg,"__len__")) and (bias !=0):
if (hasattr(pos,"__len__") and hasattr(neg,"__len__")):
# print(pos.shape)
# print(neg.shape)
K = np.concatenate([pos,-neg],axis = 0)
elif hasattr(pos,"__len__"):
# print("pos",pos.shape)
K = pos
else:
# print("neg",neg.shape)
K = -neg
### Rows of K is the cone of vectors we want to biased towards positive dot product with. First we (may) need
#project on the null space of the S.
#need an orthonormal basis perpendicular to the rows of S, i.e. ker(S)
if S.shape[0]:
NS = la.null_space(S)
else:
NS = np.eye(S.shape[1])
KNS = np.dot(K,NS)
KNS_nullspace = la.null_space(KNS)
if KNS_nullspace.shape[1]:
x = 2*np.random.rand(K.shape[0]) - 1 + bias #if bias == 1, nonnegative. If bias = 0, in [-1,1]
u = la.lstsq(KNS,x)[0]#
stop = 0
while la.norm(np.dot(KNS,u) - x) > 10**-10:
x = 2*np.random.rand(K.shape[0]) - 1 + bias #if bias == 1, nonnegative. If bias = 0, in [-1,1]
u = la.lstsq(KNS,x)[0]#
stop += 1
if stop > maxit:
u = 2*np.random.rand(NS.shape[1]) - 1
break
ubar = np.dot(KNS_nullspace,np.random.rand(KNS_nullspace.shape[1])-0.5) # add on a vector in the null space of KNS. Resulting u should satisfy (K*NS)u >= 0.
if la.norm(np.dot(NS,ubar)):
u = u + ubar#otherwise its not doing anything anyway
        else:#if there is only going to be one or no solution in the projected cone
            u = 2*np.random.rand(NS.shape[1]) - 1
r = np.dot(NS,u)
r = (minvars + (mxvars-minvars)*np.random.rand())*r/np.linalg.norm(r)
else:
if S.shape[0]:
#need an orthonormal basis perpendicular to the rows of S, i.e. ker(S)
NS = la.null_space(S)
#then pick a random vector from there.
r = np.dot(NS,np.random.rand(NS.shape[1])-0.5)# in [-1/2,1/2]
r = (minvars + (mxvars-minvars)*np.random.rand())*r/np.linalg.norm(r)
else:
r = np.random.rand(S.shape[1]) - 0.5
r = (minvars + (mxvars-minvars)*np.random.rand())*r/np.linalg.norm(r)
return r
def adjust_sparsity(sparse_adjust,shffld,spl_i):
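    """Adjusts the sparsity of one off-diagonal (cross-kingdom) block of an adjacency matrix so that its edge density
    becomes a chosen multiple of the mean within-kingdom edge density. Edges are removed or added at random, and the
    symmetric block is updated to match.

    :param sparse_adjust: dict with keys ``Kingdoms`` (pair of block indices (i,j)) and ``SparsityRatio`` (desired ratio of block density to mean within-block density)
    :type sparse_adjust: dict
    :param shffld: 0/1 adjacency matrix to adjust
    :type shffld: np.array
    :param spl_i: indices splitting the taxa into blocks
    :type spl_i: np.array
    :return: adjusted adjacency matrix
    :rtype: np.array
    """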
inter_king = [shffld[spl_i[i]:spl_i[i+1],spl_i[i]:spl_i[i+1]].sum()/((spl_i[i+1] - spl_i[i])*((spl_i[i+1] - spl_i[i])-1)) for i in range(len(spl_i)-1)]
blk = shffld[spl_i[sparse_adjust["Kingdoms"][0]]:spl_i[sparse_adjust["Kingdoms"][0]+1],spl_i[sparse_adjust["Kingdoms"][1]]:spl_i[sparse_adjust["Kingdoms"][1]+1]]
blk_size = blk.size
desired_sparsity = np.mean(inter_king)*sparse_adjust["SparsityRatio"]
desired_edges = blk_size*desired_sparsity
current_edges = blk.sum()
if sparse_adjust["SparsityRatio"] < 1:
nz = np.where(blk != 0)
keep = np.random.choice(nz[0].size,size = int(desired_edges),replace = False)
keep_coords = (nz[0][keep],nz[1][keep])
newblk = np.zeros_like(blk)
newblk[keep_coords] = 1.0
shffld[spl_i[sparse_adjust["Kingdoms"][0]]:spl_i[sparse_adjust["Kingdoms"][0]+1],spl_i[sparse_adjust["Kingdoms"][1]]:spl_i[sparse_adjust["Kingdoms"][1]+1]] = newblk
shffld[spl_i[sparse_adjust["Kingdoms"][1]]:spl_i[sparse_adjust["Kingdoms"][1]+1],spl_i[sparse_adjust["Kingdoms"][0]]:spl_i[sparse_adjust["Kingdoms"][0]+1]] = newblk.T
if sparse_adjust["SparsityRatio"] > 1:
zers = np.where(blk == 0)
add_in = np.random.choice(zers[0].size,size = int(desired_edges-current_edges),replace = False)
add_in_coords = (zers[0][add_in],zers[1][add_in])
newblk = blk.copy()
newblk[add_in_coords] = 1
shffld[spl_i[sparse_adjust["Kingdoms"][0]]:spl_i[sparse_adjust["Kingdoms"][0]+1],spl_i[sparse_adjust["Kingdoms"][1]]:spl_i[sparse_adjust["Kingdoms"][1]+1]] = newblk
shffld[spl_i[sparse_adjust["Kingdoms"][1]]:spl_i[sparse_adjust["Kingdoms"][1]+1],spl_i[sparse_adjust["Kingdoms"][0]]:spl_i[sparse_adjust["Kingdoms"][0]+1]] = newblk.T
return shffld
def create_groundtruth_covariance(N,graphtype,sparsity,spl_i,mxvars = 0.5,minvars = 0.4,sparse_adjust = None,bias_str = 0, bias_blocks = None,ranksafe = 0):
"""Creates a randomly chosen covariance matrix (symmetric, positive definite, full rank) with the same non-zero entry structure as the adjacency matrix of a random graph. Uses networkX graph generators.
:param N: size of the matrix (result is NxN)
:type N: int
    :param graphtype: Type of random graph to base the non-zero structure of the matrix on. Choices are ``WS`` for Watts-Strogatz, ``BA`` for Barabási-Albert, ``ER`` for Erdős-Rényi, and ``PL`` for powerlaw cluster.
:type graphtype: str
:param sparsity: approximate proportion of the matrix to be 0
:type sparsity: float
    :param spl_i: indices that split the taxa into data-type blocks
    :type spl_i: np.array
:param mxvars: maximum range of variance (diagonal entries). Variances are chosen uniformly at random in the interval defined by minvars,mxvars.
:type mxvars: float
:param minvars: minimum range of variance (diagonal entries)
:type minvars: float
:param sparse_adjust: How to adjust sparsity of different blocks
:type sparse_adjust: dict
:param bias_blocks: Details of how to bias the interactions by blocks. List of [(i,j,+/-1)] with i<j
:type bias_blocks: list
:param bias_str: strength of biases. 1 is always true, 0 is no bias.
:type bias_str: float
    :param ranksafe: weight to add in vectors from the space perpendicular to previous vectors to ensure the matrix is full rank.
:type ranksafe: float
    :return: covariance matrix and the actual sparsity of the generated graph
    :rtype: tuple(np.array, float)
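    Example (illustrative; two blocks of 10 taxa)::

        spl_i = np.array([0, 10, 20])
        cov, sp = create_groundtruth_covariance(20, "ER", 0.1, spl_i)
        corr = cov_to_corr(cov)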
"""
nedges = sparsity*(N**2)
if graphtype == "WS":
K = int(2*nedges/N)
p = 0.3
graph = nx.watts_strogatz_graph(N,K,p)
elif graphtype == "BA":
K = int(nedges/N)
graph = nx.barabasi_albert_graph(N, K)
elif graphtype == "ER":
p = 2*nedges/(N*(N-1))
graph = nx.fast_gnp_random_graph(N, p)
elif graphtype == "PL":
p = 0.3
K = max(int(nedges/(N*(1+p))),1)
graph = nx.powerlaw_cluster_graph(N, K, p)
Adj = nx.to_numpy_array(graph)
sh = np.random.permutation(N)
shffld = Adj[sh][:,sh]
if isinstance(sparse_adjust,dict):
shffld = adjust_sparsity(sparse_adjust,shffld,spl_i)
Sp = shffld.sum()/(N*(N-1))
    Q = (minvars + (mxvars-minvars)*np.random.rand())*np.eye(N)
    print("[create_groundtruth_covariance] Generating Cholesky decomposition")
if not hasattr(bias_blocks,"__len__"):
bias_blocks = []
for j in range(0,len(spl_i)-1):
pos_blks = [Q[spl_i[k]:spl_i[k+1]] for k in range(j) if (k,j,1) in bias_blocks]
if len(pos_blks):
full_pos = np.concatenate(pos_blks,axis = 0)
pos_pattern = np.concatenate([shffld[:,spl_i[k]:spl_i[k+1]] for k in range(j) if (k,j,1) in bias_blocks],axis = 1)
else:
full_pos = np.array([])
neg_blks = [Q[spl_i[k]:spl_i[k+1]] for k in range(j) if (k,j,-1) in bias_blocks]
if len(neg_blks):
full_neg = np.concatenate(neg_blks,axis = 0)
neg_pattern = np.concatenate([shffld[:,spl_i[k]:spl_i[k+1]] for k in range(j) if (k,j,-1) in bias_blocks],axis = 1)
else:
full_neg = np.array([])
for i in tqdm(range(spl_i[j],spl_i[j+1])):
if (j,j,1) in bias_blocks:
if len(pos_blks):
full_pos = np.concatenate([full_pos,Q[spl_i[j]:i]])
pos_pattern = np.concatenate([pos_pattern,shffld[:,spl_i[j]:i]],axis = 1)
else:
full_pos = Q[spl_i[j]:i]
pos_pattern = shffld[:,spl_i[j]:i]
elif (j,j,-1) in bias_blocks:
if len(neg_blks):
full_neg = np.concatenate([full_neg,Q[spl_i[j]:i]])
neg_pattern = np.concatenate([neg_pattern,shffld[:,spl_i[j]:i]],axis = 1)
else:
full_neg = Q[spl_i[j]:i]
neg_pattern = shffld[:,spl_i[j]:i]
if full_pos.shape[0]:
pcone = full_pos[pos_pattern[i].astype(bool)]
else:
pcone = None
if full_neg.shape[0]:
ncone = full_neg[neg_pattern[i].astype(bool)]
else:
ncone = None
Q[i] = rand_in_perp(Q[:i][~shffld[i,:i].astype(bool)],mxvars = mxvars,minvars = minvars,bias = bias_str, pos = pcone,neg = ncone)
if ranksafe:
r2 = rand_in_perp(Q[:i],mxvars = mxvars,minvars = minvars,bias = bias_str, pos = pcone,neg = ncone)
Q[i] = Q[i] + ranksafe*r2
if Q.shape[0]!=Q.shape[1]:
print("[create_groundtruth_covariance] ERROR: Did not choose enough vectors in cholesky decomposition. Size is {}".format(Q.shape))
covar = np.dot(Q,Q.T)
return covar,Sp
def generate_synthetic_samples(NumSmp,covar,mean_log_range = 1,mean_log_center = 0):
"""Generates random "true" synthetic absolute samples by drawing from log-normal distribution
:param NumSmp: Number of samples to generate
:type NumSmp: int
:param covar: Covariance matrix used to define log-normal distribution
:type covar: np.array
    :param mean_log_range: width of the interval from which the log-scale means of the underlying normal distribution are chosen uniformly at random
    :type mean_log_range: float
    :param mean_log_center: center of the interval from which the log-scale means of the underlying normal distribution are chosen uniformly at random
    :type mean_log_center: float
:return: Set of absolute samples
:rtype: pd.DataFrame
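    Example (illustrative)::

        cov, _ = create_groundtruth_covariance(10, "ER", 0.2, np.array([0, 10]))
        samples, ok = generate_synthetic_samples(25, cov)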
"""
N = covar.shape[0]
mean_abundance = mean_log_center + (mean_log_range*np.random.rand(N)- mean_log_range/2)
try:
rv = stats.multivariate_normal(mean = mean_abundance,cov = covar)
except Exception as e:
print(e.__str__())
return None,False
synthetic_truth = pd.DataFrame(index = range(N),columns = ["S{}".format(i) for i in range(NumSmp)])
print("[generate_synthetic_samples] Drawing {} Samples for {} taxa".format(NumSmp,N))
for i in tqdm(range(NumSmp)):
synthetic_truth["S{}".format(i)] = np.exp(rv.rvs())
return synthetic_truth,True
def simulate_reads(sample,depth):
"""Simulate generating amplicon sequencing reads (error-free) from a absolute sample.
:param sample: Known absolute abundances of synthetic sample
:type sample: np.array
:param depth: Number of reads to generate (sequencing depth)
:type depth: int
:return: array of counts indicating number of simulated reads for each synthetic taxa
:rtype: np.array
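    Example (illustrative)::

        counts = simulate_reads(np.array([5., 1., 4.]), depth=1000)  # counts.sum() == 1000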
"""
relative_sample = sample/np.sum(sample)
    # a single multinomial draw is equivalent to `depth` independent categorical draws
    readcounts = np.random.multinomial(depth, relative_sample)
return readcounts
def simulate_all_reads(samples,depths,nj = 1):
"""Simulate generating amplicon sequencing reads (error-free) from a set of absolute samples.
    :param samples: Known absolute abundances of synthetic samples
    :type samples: pd.DataFrame
    :param depths: Number of reads to generate (sequencing depth) for each sample
    :type depths: np.array[int]
:param nj: number of parallel jobs to run for joblib.Parallel
:type nj: int
:return: table of counts indicating number of simulated reads for each synthetic taxa for each sample
:rtype: pd.DataFrame
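    Example (illustrative)::

        samples, _ = generate_synthetic_samples(25, np.eye(10))
        reads = simulate_all_reads(samples, np.full(25, 2000, dtype=int), nj=2)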
"""
print("[simulate_all_reads] Drawing Read Assignments for {} taxa, {} samples.".format(samples.shape[0],samples.shape[1]))
all_reads = Parallel(n_jobs=nj)(delayed(simulate_reads)(samples[samples.columns[i]].values,depths[i]) for i in range(len(samples.columns)))
simulated_counts = pd.DataFrame(np.array(all_reads).T,columns = samples.columns, index=samples.index)
return simulated_counts.astype(int)
def generate_synthetic_data(num_taxa,num_samples,**kwargs):
"""Main function to generate a sythetic dataset from a log-normal distribution with error-free simulated sequencing.
:param num_taxa: Number of synthetic taxa in the dataset
:type num_taxa: int
:param num_samples: Number of synthetic samples to generate
:type num_samples: int
:param sparsity: approximate proportion of the covariance matrix to be 0
    :param graph_model: Type of random graph to base the non-zero structure of the matrix on. Choices are ``WS`` for Watts-Strogatz, ``BA`` for Barabási-Albert, ``ER`` for Erdős-Rényi, and ``PL`` for powerlaw cluster.
:param max_variance: maximum range of variance (diagonal entries). Variances are chosen uniformly at random in the interval defined by min_variance,max_variance.
:param min_variance: minimum range of variance (diagonal entries). Variances are chosen uniformly at random in the interval defined by min_variance,max_variance.
    :param mean_log_range: width of the interval from which the log-scale means of the underlying normal distribution are chosen uniformly at random
    :param mean_log_center: center of the interval from which the log-scale means of the underlying normal distribution are chosen uniformly at random
    :param data_types: Number of separate "data types" to simulate - reads will be generated separately for each
:param average_read_depth: Average sequencing depth
:param std_of_read_depth: Std. of sequencing depth
:param simulate_reads_tasks: number of parallel jobs for simulating reads
:param truth_sparsity_adjustment: how to adjust sparsity in blocks (in kingdoms or across)
:param truth_bias_strength: strength of sign bias
:param truth_bias_structure: structure of sign bias in blocks
:type sparsity: float
:type graph_model: str
:type max_variance: float
:type min_variance: float
:type mean_log_range: float
:type mean_log_center: float
:type data_types: int
:type average_read_depth: int
:type std_of_read_depth: int
:type simulate_reads_tasks: int
:return: Synthetic data set including the following fields: "GroundTruthCovariance","GraphSparsity","AbsoluteSamples","CombinedReads","SeparatedReads","GroupOrder","ReadDepths","TotalRatioVariance","DataSplit"
:rtype: Dict
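    Example (illustrative; parameter values are arbitrary demo choices)::

        data = generate_synthetic_data(50, 100, sparsity=0.05, graph_model="PL",
                                       data_types=2, average_read_depth=2000,
                                       std_of_read_depth=100)
        cov = data["GroundTruthCovariance"]
        reads = data["CombinedReads"]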
"""
sparsity = kwargs.get("sparsity",0.05)
graphtype = kwargs.get("graph_model","PL")
mxvar = kwargs.get("max_variance",0.5)
minvar = kwargs.get("min_variance",0.4)
mean_lr = kwargs.get("mean_log_range",1)
mean_lc = kwargs.get("mean_log_center",0)
sparse_adjust = kwargs.get("truth_sparsity_adjustment",None)
b_str = kwargs.get("truth_bias_strength",0)
b_blocks = kwargs.get("truth_bias_structure",None)
num_types = kwargs.get("data_types",1)
minsplit = int(0.2*num_taxa)
maxsplit = int(0.8*num_taxa)
splits = np.sort(np.random.choice(range(minsplit,maxsplit),size = num_types-1,replace = False))
spl_i = np.concatenate([[0],splits,[num_taxa]])
nmtry = 0
giveup = 10
covar_matrix = np.zeros((num_taxa,num_taxa))
covar_matrix_done = False
while not covar_matrix_done:
covar_matrix, actual_sparsity = create_groundtruth_covariance(num_taxa,graphtype,sparsity,spl_i,mxvars=mxvar,minvars=minvar,bias_blocks=b_blocks,bias_str=b_str,sparse_adjust = sparse_adjust,ranksafe=nmtry)
if np.min(np.linalg.svd(covar_matrix)[1]).round(5):
absolute_samples,covar_matrix_done = generate_synthetic_samples(num_samples,covar_matrix,mean_log_range=mean_lr,mean_log_center=mean_lc)
else:
print("Singular Matrix. Matrix smallest singular value is {}.".format(np.min(np.linalg.svd(covar_matrix)[1])))
nmtry += 1
if nmtry == giveup:
print("[generate_synthetic_data] Can't draw synthetic after {} tries. Possible ill-conditioned matrix. Matrix smallest singular value is {}.".format(giveup,np.min(np.linalg.svd(covar_matrix)[1])))
return None
if not isinstance(absolute_samples,pd.DataFrame):
return None
avg_dpth = kwargs.get("average_read_depth",20000)
std_dpth = kwargs.get("std_of_read_depth",1000)
simr_nj = kwargs.get("simulate_reads_tasks",1)
read_depths = {}
sep_reads = {}
reads_order = []
combined_reads = pd.DataFrame()
for j in range(1,num_types+1):
dpths = np.random.normal(avg_dpth,std_dpth,size = num_samples).astype(int)
read_depths["Set{}".format(j)] = dpths
rd = simulate_all_reads(absolute_samples.iloc[spl_i[j-1]:spl_i[j]],dpths,nj = simr_nj)
sep_reads["Set{}".format(j)] = rd
reads_order += [(spl_i[j-1],spl_i[j])]
combined_reads = pd.concat([combined_reads,rd])
if num_types == 2:
spl_at = spl_i[1]
q = (absolute_samples.iloc[:spl_at]/(sep_reads["Set1"]/sep_reads["Set1"].sum())).replace([np.inf, -np.inf], np.nan).mean()
r = (absolute_samples.iloc[spl_at:]/(sep_reads["Set2"]/sep_reads["Set2"].sum())).replace([np.inf, -np.inf], np.nan).mean()
vqr = np.var(q/r)
else:
vqr = 0
return {"GroundTruthCovariance":covar_matrix,"GraphSparsity":actual_sparsity,"AbsoluteSamples":absolute_samples,"CombinedReads":combined_reads,"SeparatedReads":sep_reads,"GroupOrder":reads_order,"ReadDepths":read_depths,"TotalRatioVariance":vqr,"DataSplit":spl_i}
def normalize_table(otu_table,how = "dirichlet",pcount = 1,totcount = 10**5):
"""
    Add pseudocounts to data so that we can use CLR
:param otu_table: the count or relative abundance table (taxa x sample)
:type otu_table: np.array
:param how: method for adding pseudocounts. Options are ``add_pseudocounts``,``dirichlet`` and ``normalize``
:type how: str
:param pcount: pseudocount to add or use for dirichlet distribution. Default 1.
:type pcount: int
:param totcount: approximate total count to assume if given relative abundance data (data is expanded to behave like count data). Default 10**5
:type totcount: float
    :return: normalized data (``add_pseudocounts`` and ``dirichlet`` also remove 0s)
:rtype: np.array
    method options are
    - ``add_pseudocounts`` which adds ``pcount`` to each entry (relative abundance data is first scaled by ``totcount`` so that it behaves like count data)
    - ``dirichlet`` which, for a vector of counts C, draws the fractions from Dirichlet(C+pcount), or for relative abundances c, from Dirichlet(c*totcount+pcount)
    - ``normalize`` which simply divides each column by its sum
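    Example (illustrative)::

        counts = np.array([[0, 10], [5, 5], [20, 1]])
        fracs = normalize_table(counts, how="dirichlet", pcount=1)  # columns sum to 1, no 0s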
"""
### find out if the data looks like counts or relative abundance
if np.all(otu_table == otu_table.astype(int)):
tablekind = 'count'
else:
tablekind = 'relative'
if how.lower() == "add_pseudocounts":
if tablekind == "count":
otu_table = otu_table + pcount
return otu_table/otu_table.sum(axis = 0)
else:
otu_table = totcount*otu_table + pcount
return otu_table/otu_table.sum(axis = 0)
elif how.lower() == "dirichlet":
if tablekind == "count":
### Method is the same as used in SparCC (friedman et al. 2012)
otu_table = np.apply_along_axis(lambda x:dir_fun(x,pcount),0,otu_table)
return otu_table
else:
otu_table = np.apply_along_axis(lambda x:dir_fun(x,pcount),0,totcount*otu_table)
return otu_table
elif how.lower() == "normalize":
return otu_table/otu_table.sum(axis = 0)
def log_cov(data,normalize = "robust",pcount = 1,totcount = 10**5):
"""Compute the covariance of the log of data
:param data: Table of data vectors (as columns)
:type data: np.array
:param normalize: how the data should be normalized (how 0 entries should be dealt with). Choices are ``robust``, ``add_pseudocounts``,``dirichlet`` and ``normalize``
    :type normalize: str
:param pcount: pseudocount to add or use for dirichlet distribution. Default 1.
:type pcount: int
:param totcount: approximate total count to assume if given relative abundance data (data is expanded to behave like count data). Default 10**5
:type totcount: float
:return: covariance matrix of log-transformed data
:rtype: np.array
    method options are
    - ``robust`` which ignores 0s
    - ``add_pseudocounts`` which adds ``pcount`` to each entry (relative abundance data is first scaled by ``totcount`` so that it behaves like count data)
    - ``dirichlet`` which, for a vector of counts C, draws the fractions from Dirichlet(C+pcount), or for relative abundances c, from Dirichlet(c*totcount+pcount)
    - ``normalize`` which simply divides each column by its sum
"""
if normalize == "robust":
log_data = np.vectorize(safelog)(data)#data.applymap(safelog)
else:
normed_data = normalize_table(data,how = normalize,pcount = pcount,totcount = totcount)
log_data = np.vectorize(safelog)(normed_data)#.applymap(safelog)
niave_cov = np.cov(log_data)
return niave_cov
def compute_clr(otu_table, norm = "dirichlet",pcount = 1,totcount = 10**5):
"""
:param otu_table: the count or relative abundance table (taxa x sample)
:type otu_table: np.array
    :param norm: method for adding pseudocounts. Options are ``robust``, ``add_pseudocounts``, ``dirichlet`` and ``normalize``. See :py:func:`normalize_table <compute_clr.normalize_table>`
:type norm: str
:param pcount: See :py:func:`normalize_table <compute_clr.normalize_table>`
:type pcount: int
:param totcount: See :py:func:`normalize_table <compute_clr.normalize_table>`
:type totcount: float
:return: CLR of the data (by column)
:rtype: np.array
"""
if norm == "robust":
print("[compute_clr] computing robust clr transform")
clr_table = robust_clr(otu_table)
else:
print("[compute_clr] normalizing data using {} method".format(norm))
normalized_data = normalize_table(otu_table, how=norm, pcount = pcount, totcount = totcount)
print("[compute_clr] computing clr transform")
clr_table = np.log(normalized_data/(np.apply_along_axis(geo_mean,0,normalized_data)))
return clr_table
def clr_cov(otu_table,**kwargs):
"""
    Make a covariance matrix from an otu_table. Can handle data from mixed datatypes with the split_indices parameter, which indicates which rows in the table belong to each data type. The result is the matrix :math:`\\Gamma` from
    Kurtz et al.
:param otu_table: OTU table given as counts or relative abundance
    :type otu_table: pd.DataFrame
    :param split_indices: tuple with indices corresponding to each data type, so that CLR can be computed separately (see Tipton et al. 2018)
:type split_indices: tuple
:param kwargs: kwargs for :py:func:`compute_clr<make_cov_table.compute_clr>`
:return: CLR covariance matrix :math:`\\Gamma`
:rtype: np.array
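    Example (illustrative; random counts, two data types)::

        table = pd.DataFrame(np.random.randint(0, 50, size=(12, 30)))
        gamma = clr_cov(table, norm="dirichlet", split_indices=(range(6), range(6, 12)))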
"""
otu_table = otu_table.values
norm = kwargs.get("norm","dirichlet")
pcount = kwargs.get("pcount",1)
totcount = kwargs.get("totcount",10**5)
split_indices = kwargs.get("split_indices",())
if not len(split_indices):
split_indices = (range(otu_table.shape[0]),)
clrd_data = np.zeros(otu_table.shape)
for smptype in split_indices:
        clrd_data[smptype] = compute_clr(otu_table[smptype],norm=norm,pcount=pcount,totcount=totcount)
return np.cov(clrd_data)
def sparcc(otu_table,normtype = "dirichlet"):
"""Uses SparCC3 to infer covariance of data vectors.
:param otu_table: Data table with columns corresponding to samples.
:type otu_table: np.array or pd.DataFrame
    :param normtype: method for adding pseudocounts. Options are ``robust``, ``add_pseudocounts``, ``dirichlet`` and ``normalize``. See :py:func:`normalize_table <compute_clr.normalize_table>`
:type normtype: str
:return: inferred covariance matrix
:rtype: np.array
"""
import SparCC3 as spc
print("[sparcc] computing covariance.")
if normtype == "robust":
if not (otu_table == 0).any().any():
res = spc.main(otu_table.T,oprint = False,norm= "normalize")
else:
print("SparCC does not support robust CLR for tables with 0s. Using pseudocount")
res = spc.main(otu_table.T,oprint = False,norm= "pseudo")
elif normtype == "add_pseudocounts":
res = spc.main(otu_table.T,oprint = False,norm= "pseudo")
elif normtype == "dirichlet":
res = spc.main(otu_table.T,oprint = False,norm= "dirichlet")
elif normtype == "normalize":
res = spc.main(otu_table.T,oprint = False,norm= "normalize")
return res[1]