#! python3
# -*- coding: utf-8 -*-
########################################################################################################
########################################################################################################
#
# %%%% !!! IMPORTANT NOTE !!! %%%%
# At the end of this file, a demo presents how this python code can be used. Running this file (python dr_missing_data.py) will run the demo. Importing this module will not run the demo. The demo takes a few minutes.
# %%%% !!! !!! %%%%
# dr_missing_data.py
# This python code implements a framework to deal with missing data in dimensionality reduction (DR).
# The methodology which is implemented is described in the article "Nonlinear Dimensionality Reduction with Missing Data using Parametric Multiple Imputations", from Cyril de Bodt, Dounia Mulders, Michel Verleysen and John A. Lee, published in IEEE Transactions on Neural Networks and Learning Systems, in 2019.
# Link to retrieve the article: https://ieeexplore.ieee.org/abstract/document/8447227
# The tested versions of the imported packages are specified at the end of the header.
# If you use this code or the article, please cite as:
# - de Bodt, C., Mulders, D., Verleysen, M., & Lee, J. A. (2019). Nonlinear Dimensionality Reduction With Missing Data Using Parametric Multiple Imputations. IEEE transactions on neural networks and learning systems, 30(4), 1166-1179.
# - BibTeX entry:
#@article{cdb2019drnap,
# title={{N}onlinear {D}imensionality {R}eduction with {M}issing {D}ata using {P}arametric {M}ultiple {I}mputations},
# author={de Bodt, C. and Mulders, D. and Verleysen, M. and Lee, J. A.},
# journal={{IEEE} Trans. Neural Netw. Learn. Syst.},
# volume={30},
# number={4},
# pages={1166--1179},
# year={2019}
#}
# The main functions of this file are:
# - 'mssne_implem': nonlinear dimensionality reduction through multi-scale SNE (Ms SNE), as presented in the reference [2] below and summarized in [1]. This function enables reducing the dimension of a complete data set.
# - 'gmm_fit_K', 'gmm_sev_em_fitting' and 'gmm_sev_sampling': Gaussian mixture modeling of a complete or incomplete data set, as presented in [7, 8, 9] and summarized in [1]. These functions respectively enable one to:
# ---> 'gmm_fit_K': fit a Gaussian mixture model on a complete or incomplete data set. The number of mixture components is automatically determined and tuned as detailed in [1].
# ---> 'gmm_sev_em_fitting': fit a Gaussian mixture model with K components on a complete or incomplete data set. The number K of components is a parameter of the function. Setting it to 1 fits a single multivariate Gaussian on the data set, while setting K to 2 fits two Gaussian components on the data set, etc.
# ---> 'gmm_sev_sampling': draw samples from a Gaussian mixture model.
# - 'icknni_implem': implementation of the ICkNNI method as proposed in [15] and employed in the experiments of [1] for the comparison of the performances of the methods. This function enables performing a single imputation of the missing entries in a data set.
# - 'mssne_na_mmg': nonlinear dimensionality reduction through multi-scale SNE of an incomplete data set, using the methodology presented in [1]. This function enables applying multi-scale SNE on a database with missing values by first fitting a Gaussian mixture model on the data set and then dealing with the missing entries either thanks to multiple imputations or conditional mean imputation.
# - 'eval_dr_quality': unsupervised evaluation of the quality of a low-dimensional embedding, as introduced in [3, 4] and employed and summarized in [1, 2, 5]. This function enables computing quality assessment criteria measuring the neighborhood preservation from the high-dimensional space to the low-dimensional one. The documentation of the function explains the meaning of the criteria and how to interpret them.
# - 'knngain': supervised evaluation of the quality of a low-dimensional embedding, as introduced in [6]. This function enables computing criteria related to the accuracy of a KNN classifier in the low-dimensional space. The documentation of the function explains the meaning of the criteria and how to interpret them.
# - 'viz_2d_emb' and 'viz_qa': visualization of a 2-D embedding and of the quality criteria. These functions respectively enable one to:
# ---> 'viz_2d_emb': plot a 2-D embedding.
# ---> 'viz_qa': depict the quality criteria computed by 'eval_dr_quality' and 'knngain'.
# The documentation of each function describes its parameters. The demo shows how the functions can be used.
# Notations:
# - DR: dimensionality reduction.
# - HD: high-dimensional.
# - LD: low-dimensional.
# - HDS: HD space.
# - LDS: LD space.
# - NA: Not Available, a synonym of missing data, missing values and missing entries.
# - NAN: Not A Number, a synonym of missing data, missing values and missing entries.
# - Ms SNE: multi-scale stochastic neighbor embedding.
# References:
# [1] de Bodt, C., Mulders, D., Verleysen, M., & Lee, J. A. (2019). Nonlinear Dimensionality Reduction With Missing Data Using Parametric Multiple Imputations. IEEE transactions on neural networks and learning systems, 30(4), 1166-1179.
# [2] Lee, J. A., Peluffo-Ordóñez, D. H., & Verleysen, M. (2015). Multi-scale similarities in stochastic neighbour embedding: Reducing dimensionality while preserving both local and global structure. Neurocomputing, 169, 246-261.
# [3] Lee, J. A., & Verleysen, M. (2009). Quality assessment of dimensionality reduction: Rank-based criteria. Neurocomputing, 72(7-9), 1431-1443.
# [4] Lee, J. A., & Verleysen, M. (2010). Scale-independent quality criteria for dimensionality reduction. Pattern Recognition Letters, 31(14), 2248-2257.
# [5] Lee, J. A., Renard, E., Bernard, G., Dupont, P., & Verleysen, M. (2013). Type 1 and 2 mixtures of Kullback–Leibler divergences as cost functions in dimensionality reduction based on similarity preservation. Neurocomputing, 112, 92-108.
# [6] de Bodt, C., Mulders, D., López-Sánchez, D., Verleysen, M., & Lee, J. A. (2019). Class-aware t-SNE: cat-SNE. In ESANN (pp. 409-414).
# [7] Eirola, E., Doquire, G., Verleysen, M., & Lendasse, A. (2013). Distance estimation in numerical data sets with missing values. Information Sciences, 240, 115-128.
# [8] Eirola, E., Lendasse, A., Vandewalle, V., & Biernacki, C. (2014). Mixture of gaussians for distance estimation with missing data. Neurocomputing, 131, 32-42.
# [9] Sovilj, D., Eirola, E., Miche, Y., Björk, K. M., Nian, R., Akusok, A., & Lendasse, A. (2016). Extreme learning machine for missing data using multiple imputations. Neurocomputing, 174, 220-231.
# [10] Bouveyron, C., Girard, S., & Schmid, C. (2007). High-dimensional data clustering. Computational Statistics & Data Analysis, 52(1), 502-519.
# [11] Ghahramani, Z., & Jordan, M. I. (1995). Learning from incomplete data.
# [12] Rubin, D. B. (2004). Multiple imputation for nonresponse in surveys (Vol. 81). John Wiley & Sons.
# [13] Little, R. J., & Rubin, D. B. (2014). Statistical analysis with missing data. John Wiley & Sons.
# [14] Cattell, R. B. (1966). The scree test for the number of factors. Multivariate behavioral research, 1(2), 245-276.
# [15] Van Hulse, J., & Khoshgoftaar, T. M. (2014). Incomplete-case nearest neighbor imputation in software measurement data. Information Sciences, 259, 596-610.
# author: Cyril de Bodt (ICTEAM - UCLouvain)
# @email: cyril __dot__ debodt __at__ uclouvain.be
# Last modification date: January 7th, 2020
# Copyright (c) 2020 Universite catholique de Louvain (UCLouvain), ICTEAM. All rights reserved.
# This code was tested with Python 3.7.5 (Anaconda distribution, Continuum Analytics, Inc.). It uses the following modules:
# - numpy: version 1.17.4 tested
# - numba: version 0.46.0 tested
# - scipy: version 1.3.2 tested
# - matplotlib: version 3.1.1 tested
# - scikit-learn: version 0.22 tested
# - pandas: version 0.25.3 tested
# You can use, modify and redistribute this software freely, but not for commercial purposes.
# The use of this software is at your own risk; the authors are not responsible for any damage as a result from errors in the software.
########################################################################################################
########################################################################################################
import numpy as np, numba, sklearn.decomposition, scipy.spatial.distance, matplotlib.pyplot as plt, scipy.optimize, time, os, pandas, scipy.stats.mstats, scipy.special
# Name of this file
module_name = "dr_missing_data.py"
##############################
##############################
# General functions used by others in the code.
####################
@numba.jit(nopython=True)
def close_to_zero(v):
"""
Check whether v is close to zero or not.
In:
- v: a scalar or numpy array.
Out:
A boolean or a numpy array of booleans with the same shape as v, with True where the entry is close to 0 and False otherwise.
"""
return np.absolute(v) <= 10.0**(-8.0)
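# Illustrative example: close_to_zero(np.array([0.0, 1e-9, 1e-3])) returns
# array([ True,  True, False]), since only entries with |v| <= 10**(-8) are treated as zero.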
@numba.jit(nopython=True)
def arange_except_i(N, i):
"""
Create a 1-D numpy array of integers from 0 to N-1 with step 1, except i.
In:
- N: a strictly positive integer.
- i: a positive integer which is strictly smaller than N.
Out:
A 1-D numpy array of integers from 0 to N-1 with step 1, except i.
"""
arr = np.arange(N)
return np.hstack((arr[:i], arr[i+1:]))
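# Illustrative example: arange_except_i(N=5, i=2) returns array([0, 1, 3, 4]).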
@numba.jit(nopython=True)
def fill_diago(M, v):
"""
Replace the elements on the diagonal of a square matrix M with some value v.
In:
- M: a 2-D numpy array storing a square matrix.
- v: some value.
Out:
M, but in which the diagonal elements have been replaced with v.
"""
for i in range(M.shape[0]):
M[i,i] = v
return M
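# Illustrative example: fill_diago(np.zeros(shape=(2, 2)), 7.0) returns array([[7., 0.], [0., 7.]]).
# Note that M is modified in place, in addition to being returned.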
@numba.jit(nopython=True)
def contains_ident_ex(X):
"""
Returns True if the data set contains two identical samples, False otherwise.
In:
- X: a 2-D numpy array with one example per row and one feature per column.
Out:
A boolean being True if and only if X contains two identical rows.
"""
# Number of samples and of features
N, M = X.shape
# Tolerance
atol = 10.0**(-8.0)
# For each sample
for i in range(N):
if np.any(np.absolute(np.dot((np.absolute(X[i,:]-X[i+1:,:]) > atol).astype(np.float64), np.ones(shape=M, dtype=np.float64)))<=atol):
return True
return False
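# Illustrative example: contains_ident_ex(np.array([[1.0, 2.0], [3.0, 4.0], [1.0, 2.0]])) returns
# True, as the first and last rows are identical up to the tolerance atol.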
def eucl_dist_matr(X):
"""
Compute the pairwise Euclidean distances in a data set.
In:
- X: a 2-D np.ndarray with shape (N,M) containing one example per row and one feature per column.
Out:
A 2-D np.ndarray dm with shape (N,N) containing the pairwise Euclidean distances between the data points in X, such that dm[i,j] stores the Euclidean distance between X[i,:] and X[j,:].
"""
return scipy.spatial.distance.squareform(X=scipy.spatial.distance.pdist(X=X, metric='euclidean'), force='tomatrix')
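# Illustrative example: eucl_dist_matr(np.array([[0.0, 0.0], [3.0, 4.0]])) returns
# array([[0., 5.], [5., 0.]]), the distance between the two points being sqrt(3**2 + 4**2) = 5.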
##############################
##############################
# Nonlinear dimensionality reduction through multi-scale SNE (Ms SNE) [2].
# The main function is 'mssne_implem'.
# See its documentation for details.
# The demo at the end of this file and the 'mssne_na_mmg' function present how to use it.
####################
# Default random seed for Ms SNE. Only used if seed_mssne is set to None in mssne_implem and mssne_sev_implem.
seed_MsSNE_def = 40
# Maximum number of iterations in L-BFGS. It has been chosen large enough to ensure that it is almost always possible for the optimization algorithms to find a solution with a gradient which has a close to zero norm before the maximum number of iterations is reached.
dr_nitmax = 100000
# The iterations of L-BFGS stop when max{|g_i | i = 1, ..., n} <= gtol where g_i is the i-th component of the gradient.
dr_gtol = 10**(-5)
# The iterations of L-BFGS stop when (f^k - f^{k+1})/max{|f^k|,|f^{k+1}|,1} <= ftol.
dr_ftol = 2.2204460492503131e-09
# Maximum number of line search steps per L-BFGS-B iteration.
dr_maxls = 50
# The maximum number of variable metric corrections used to define the limited memory matrix of L-BFGS.
dr_maxcor = 10
n_eps_np_float64 = np.finfo(dtype=np.float64).eps
@numba.jit(nopython=True)
def ms_perplexities(N, K_star=2, L_min=-1, L_max=-1):
"""
Define exponentially growing multi-scale perplexities, as in [2].
In:
- N: number of data points.
- K_star: K_{*} as defined in [2], to set the multi-scale perplexities.
- L_min: if -1, set as in [2].
- L_max: if -1, set as in [2].
Out:
A tuple with:
- L: as defined in [2]
- K_h: 1-D numpy array, with the perplexities in increasing order.
"""
if L_min == -1:
L_min = 1
if L_max == -1:
L_max = int(round(np.log2(np.float64(N)/np.float64(K_star))))
L = L_max-L_min+1
K_h = (np.float64(2.0)**(np.linspace(L_min-1, L_max-1, L).astype(np.float64)))*np.float64(K_star)
return L, K_h
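# Illustrative example: with N=1000 and the default K_star=2, L_max = round(log2(1000/2)) = 9, so
# ms_perplexities(N=1000) returns L=9 and K_h = array([2., 4., 8., 16., 32., 64., 128., 256., 512.]),
# i.e. perplexities growing exponentially from K_star to roughly N/K_star.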
def init_lds(X_hds, N, init='pca', n_components=2, rand_state=None):
"""
Initialize the LD embedding.
In:
- X_hds: numpy.ndarray with shape (N, M), containing the HD data set, with one example per row and one dimension per column, or None. If X_hds is set to None, init cannot be equal to 'pca', otherwise an error is raised.
- N: number of examples in the data set. If X_hds is not None, N must be equal to X_hds.shape[0].
- init: determines the initialization of the LD embedding.
---> If isinstance(init, str) is True:
------> If init is equal to 'pca', the LD embedding is initialized with the first n_components principal components of X_hds. X_hds cannot be None in this case, otherwise an error is raised.
------> If init is equal to 'random', the LD embedding is initialized randomly, using an isotropic Gaussian distribution with small variance. X_hds may be set to None in this case.
------> Otherwise an error is raised.
---> If isinstance(init, np.ndarray) is True:
------> init must in this case be a 2-D numpy array, with N rows and n_components columns. It stores the LD positions to use for the initialization, with one example per row and one LD dimension per column. init[i,:] contains the initial LD coordinates for the HD sample X_hds[i,:]. X_hds may be set to None in this case. If init.ndim != 2 or init.shape[0] != N or init.shape[1] != n_components, an error is raised.
---> Otherwise, an error is raised.
- n_components: number of dimensions in the LD space.
- rand_state: random state. If it is None, it is set to np.random.
Out:
A numpy ndarray with shape (N, n_components), containing the initialization of the LD data set, with one example per row and one LD dimension per column.
"""
global module_name
if rand_state is None:
rand_state = np.random
if isinstance(init, str):
if init == "pca":
if X_hds is None:
raise ValueError("Error in function init_lds of module {module_name}: init cannot be set to 'pca' if X_hds is None.".format(module_name=module_name))
return sklearn.decomposition.PCA(n_components=n_components, whiten=False, copy=True, svd_solver='auto', iterated_power='auto', tol=0.0, random_state=rand_state).fit_transform(X_hds)
elif init == 'random':
return 10.0**(-4) * rand_state.randn(N, n_components)
else:
raise ValueError("Error in function init_lds of module {module_name}: unknown value '{init}' for init parameter.".format(module_name=module_name, init=init))
elif isinstance(init, np.ndarray):
if init.ndim != 2:
raise ValueError("Error in function init_lds of module {module_name}: init must be 2-D.".format(module_name=module_name))
if init.shape[0] != N:
raise ValueError("Error in function init_lds of module {module_name}: init must have {N} rows, but init.shape[0] = {v}.".format(module_name=module_name, N=N, v=init.shape[0]))
if init.shape[1] != n_components:
raise ValueError("Error in function init_lds of module {module_name}: init must have {n_components} columns, but init.shape[1] = {v}.".format(module_name=module_name, n_components=n_components, v=init.shape[1]))
return init
else:
raise ValueError("Error in function init_lds of module {module_name}: unknown type value '{v}' for init parameter.".format(module_name=module_name, v=type(init)))
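# Illustrative example (with hypothetical data): for X = np.random.randn(100, 10),
# init_lds(X_hds=X, N=100, init='pca', n_components=2) returns the (100, 2) array of the first two
# principal components of X, while init='random' would return small random LD coordinates instead.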
@numba.jit(nopython=True)
def ms_ld_bandwidths(tau_hi, K_h, N, n_components, X_lds, fit_U=True):
"""
Compute the multi-scale LD bandwidths and precisions.
In:
- tau_hi: a 2-D numpy array in which element tau_hi[h,i] contains tau_{hi} = 2/pi_{hi}, as defined in [2].
- K_h: 1-D numpy array, with the perplexities at each scale in increasing order.
- N: number of data points.
- n_components: number of components of the LDS.
- X_lds: 2-D numpy array with shape (N, n_components) containing the current value of the LD embedding.
- fit_U: boolean. If True, U is computed as defined in [2]. Otherwise, it is fixed to 1 and tau_hi is not used and its value does not matter.
Out:
A tuple with:
- D_h: if fit_U is True, a 1-D numpy array containing D_{h} at the different scales as defined in [2]. Otherwise, equal to np.empty(shape=tau_hi.shape[0]-1, dtype=np.float64).
- U: if fit_U is True, U as defined in [2]. Otherwise, equal to 1.0.
- p_h: 1-D numpy array with the LD precisions at each scale defined by K_h.
- t_h: 1-D numpy array with the LD bandwidths at each scale defined by K_h.
"""
global n_eps_np_float64
N_f = np.float64(N)
n_c_f = np.float64(n_components)
# Fitting U as defined in [2].
if fit_U:
log_tau_diff = np.log2(tau_hi[1:,:])-np.log2(tau_hi[:-1,:])
log_tau_diff_neq0 = np.nonzero(log_tau_diff!=0)[0]
log_tau_diff[log_tau_diff_neq0] = 1.0/log_tau_diff[log_tau_diff_neq0]
D_h = np.dot(log_tau_diff, np.ones(shape=N, dtype=np.float64))*2.0/N_f
U = np.float64(min(2, max(1, D_h.max()/n_c_f)))
else:
D_h = np.empty(shape=K_h.size-1, dtype=np.float64)
U = 1.0
# Computing the mean variance of the LD dimensions.
mean_var_X_lds = np.float64(0.0)
N_1_f = N_f/np.float64(N-1)
for k in range(n_components):
mean_var_X_lds += np.var(X_lds[:,k])*N_1_f
mean_var_X_lds /= n_c_f
# Computing the LD precisions
p_h = K_h**(U*2.0/n_c_f)
p_h = ((2.0**(1.0+2.0/n_c_f))*p_h.max()/np.maximum(n_eps_np_float64, p_h*mean_var_X_lds)).astype(np.float64)
# Computing the LD bandwidths. We take the maximum with n_eps_np_float64 as t_h will divide some other quantities.
t_h = np.maximum(n_eps_np_float64, 2.0/np.maximum(n_eps_np_float64, p_h))
return D_h, U, p_h, t_h
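# Illustrative note: up to the clamping by n_eps_np_float64, the returned LD bandwidths satisfy
# t_h = 2.0/p_h, mirroring the HD convention tau_{hi} = 2/pi_{hi} of [2].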
@numba.jit(nopython=True)
def sne_sim(dsi, vi, i, compute_log=True):
"""
Compute the SNE asymmetric similarities, as well as their log.
N refers to the number of data points.
In:
- dsi: numpy 1-D array of floats with N squared distances with respect to data point i. Element k is the squared distance between data points k and i.
- vi: bandwidth of the exponentials in the similarities with respect to i.
- i: index of the data point with respect to which the similarities are computed, between 0 and N-1.
- compute_log: boolean. If True, the logarithms of the similarities are also computed, and otherwise not.
Out:
A tuple with two elements:
- A 1-D numpy array of floats with N elements. Element k is the SNE similarity between data points i and k. By convention, element i is set to 0.
- If compute_log is True, a 1-D numpy array of floats with N elements. Element k is the log of the SNE similarity between data points i and k. By convention, element i is set to 0. If compute_log is False, it is set to np.empty(shape=N, dtype=np.float64).
"""
N = dsi.size
si = np.empty(shape=N, dtype=np.float64)
si[i] = 0.0
log_si = np.empty(shape=N, dtype=np.float64)
indj = arange_except_i(N=N, i=i)
dsij = dsi[indj]
log_num_sij = (dsij.min()-dsij)/vi
si[indj] = np.exp(log_num_sij)
den_si = si.sum()
si /= den_si
if compute_log:
log_si[i] = 0.0
log_si[indj] = log_num_sij - np.log(den_si)
return si, log_si
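# Illustrative note: subtracting dsij.min() in log_num_sij is the usual log-sum-exp shift; it
# cancels out in the normalization by den_si, while guaranteeing that the largest exponential
# equals 1, hence avoiding underflow when evaluating np.exp.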
@numba.jit(nopython=True)
def sne_bsf(dsi, vi, i, log_perp):
"""
Function on which a binary search is performed to find the HD bandwidth of the i^th data point in SNE.
In:
- dsi, vi, i: same as in sne_sim function.
- log_perp: logarithm of the targeted perplexity.
Out:
A float corresponding to the current value of the entropy of the similarities with respect to i, minus log_perp.
"""
si, log_si = sne_sim(dsi=dsi, vi=vi, i=i, compute_log=True)
return -np.dot(si, log_si) - log_perp
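# Illustrative note: sne_bsf returns H_i - log(perp), where H_i = -sum_k si[k]*log(si[k]) is the
# Shannon entropy of the similarities with respect to i. Finding its root over vi hence yields a
# perplexity exp(H_i) equal to the target perp, which is the quantity sne_bs searches for below.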
@numba.jit(nopython=True)
def sne_bs(dsi, i, log_perp, x0=1.0):
"""
Binary search to find the root of sne_bsf over vi.
In:
- dsi, i, log_perp: same as in sne_bsf function.
- x0: starting point for the binary search. Must be strictly positive.
Out:
A strictly positive float vi such that sne_bsf(dsi, vi, i, log_perp) is close to zero.
"""
fx0 = sne_bsf(dsi=dsi, vi=x0, i=i, log_perp=log_perp)
if close_to_zero(v=fx0):
return x0
elif not np.isfinite(fx0):
raise ValueError("Error in function sne_bs: fx0 is not finite.")
elif fx0 > 0:
x_up, x_low = x0, x0/2.0
fx_low = sne_bsf(dsi=dsi, vi=x_low, i=i, log_perp=log_perp)
if close_to_zero(v=fx_low):
return x_low
elif not np.isfinite(fx_low):
# WARNING: cannot find a valid root!
return x_up
while fx_low > 0:
x_up, x_low = x_low, x_low/2.0
fx_low = sne_bsf(dsi=dsi, vi=x_low, i=i, log_perp=log_perp)
if close_to_zero(v=fx_low):
return x_low
if not np.isfinite(fx_low):
return x_up
else:
x_up, x_low = x0*2.0, x0
fx_up = sne_bsf(dsi=dsi, vi=x_up, i=i, log_perp=log_perp)
if close_to_zero(v=fx_up):
return x_up
elif not np.isfinite(fx_up):
return x_low
while fx_up < 0:
x_up, x_low = 2.0*x_up, x_up
fx_up = sne_bsf(dsi=dsi, vi=x_up, i=i, log_perp=log_perp)
if close_to_zero(v=fx_up):
return x_up
while True:
x = (x_up+x_low)/2.0
fx = sne_bsf(dsi=dsi, vi=x, i=i, log_perp=log_perp)
if close_to_zero(v=fx):
return x
elif fx > 0:
x_up = x
else:
x_low = x
@numba.jit(nopython=True)
def sne_hd_similarities(dsm_hds, perp, compute_log=True, start_bs=np.ones(shape=1, dtype=np.float64)):
"""
Computes the matrix of SNE asymmetric HD similarities, as well as their log.
In:
- dsm_hds: 2-D numpy array with shape (N, N), where N is the number of data points. Element [i,j] must be the squared HD distance between i and j.
- perp: perplexity. Must be > 1.
- compute_log: boolean. If True, the logarithms of the similarities are also computed. Otherwise not.
- start_bs: 1-D numpy array with N elements. Element at index i is the starting point of the binary search for the ith data point. If start_bs has only one element, it will be set to np.ones(shape=N, dtype=np.float64).
Out:
A tuple with three elements:
- A 2-D numpy array with shape (N, N) and in which element [i,j] = the HD similarity between i and j. The similarity between i and i is set to 0.
- If compute_log is True, 2-D numpy array with shape (N, N) and in which element [i,j] = the log of the HD similarity between i and j. By convention, log(0) is set to 0. If compute_log is False, it is set to np.empty(shape=(N,N), dtype=np.float64).
- A 1-D numpy array with N elements, where element i is the denominator of the exponentials of the HD similarities with respect to data point i.
"""
if perp <= 1:
raise ValueError("""Error in function sne_hd_similarities of module dr_missing_data.py: the perplexity should be >1.""")
N = dsm_hds.shape[0]
if start_bs.size == 1:
start_bs = np.ones(shape=N, dtype=np.float64)
log_perp = np.log(min(np.float64(perp), np.floor(0.99*np.float64(N))))
# Computing the N**2 HD similarities, for i, j = 0, ..., N-1.
si = np.empty(shape=(N,N), dtype=np.float64)
# Even when compute_log is False, we cannot set log_si to None. We need to define it as an array, to be compatible with numba.
log_si = np.empty(shape=(N,N), dtype=np.float64)
arr_vi = np.empty(shape=N, dtype=np.float64)
for i in range(N):
# Computing the denominator of the exponentials of the HD similarities with respect to data point i.
vi = sne_bs(dsi=dsm_hds[i,:], i=i, log_perp=log_perp, x0=start_bs[i])
# Computing the HD similarities between i and j for j=0, ..., N-1.
tmp = sne_sim(dsi=dsm_hds[i,:], vi=vi, i=i, compute_log=compute_log)
si[i,:] = tmp[0]
if compute_log:
log_si[i,:] = tmp[1]
arr_vi[i] = vi
return si, log_si, arr_vi
@numba.jit(nopython=True)
def ms_hd_similarities(dsm_hds, arr_perp):
"""
Compute the matrix of multi-scale HD similarities sigma_{ij}, as defined in [2].
In:
- dsm_hds: 2-D numpy array with shape (N, N), where N is the number of data points. Element [i,j] must be the squared HD distance between i and j.
- arr_perp: numpy 1-D array containing the perplexities for all scales. All the perplexities must be > 1.
Out:
A tuple with:
- A 2-D numpy array with shape (N, N) and in which element [i,j] = the multi-scale HD similarity sigma_{ij}.
- A 2-D numpy array with shape (arr_perp.size, N) and in which element [h,i] = tau_{hi} = 2/pi_{hi}, following the notations of [2].
- sim_hij: 3-D numpy array with shape (arr_perp.size, N, N) where sim_hij[h,:,:] contains the HD similarities at scale arr_perp[h].
"""
# Number of data points
N = dsm_hds.shape[0]
# Number of perplexities
L = arr_perp.size
# Matrix storing the multi-scale HD similarities sigma_{ij}. Element [i,j] contains sigma_{ij}. sigma_{ii} is set to 0.
sigma_ij = np.zeros(shape=(N,N), dtype=np.float64)
# Matrix storing the HD similarities sigma_{hij} at each scale.
sim_hij = np.empty(shape=(L,N,N), dtype=np.float64)
# Matrix storing the HD tau_{hi}. Element [h,i] contains tau_{hi}.
tau_hi = np.empty(shape=(L,N), dtype=np.float64)
# For each perplexity
for h, perp in enumerate(arr_perp):
# Using the bandwidths found at the previous scale to initialize the binary search at the current scale.
if h > 0:
start_bs = tau_hi[h-1,:]
else:
start_bs = np.ones(shape=N, dtype=np.float64)
# Computing the N**2 HD similarities sigma_{hij}
sim_hij[h,:,:], dum, tau_hi[h,:] = sne_hd_similarities(dsm_hds=dsm_hds, perp=perp, compute_log=False, start_bs=start_bs)
# Updating the multi-scale HD similarities
sigma_ij += sim_hij[h,:,:]
# Scaling the multi-scale HD similarities
sigma_ij /= np.float64(L)
# Returning
return sigma_ij, tau_hi, sim_hij
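# Illustrative note: the multi-scale HD similarities are the average of the single-scale ones,
# sigma_ij = (1/L) * sum_h sim_hij[h,:,:]; e.g. with arr_perp = np.array([2.0, 4.0]), the returned
# sigma_ij equals (sim_hij[0,:,:] + sim_hij[1,:,:])/2.0.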
@numba.jit(nopython=True)
def ms_hld_bandwidths(dsm_hds, K_h, N, n_components, X_lds, fit_U=True):
"""
Compute the multi-scale HD and LD bandwidths and precisions.
In:
- dsm_hds: 2-D numpy array with the squared pairwise HD distances between the data points.
- K_h: 1-D numpy array, with the perplexities at each scale in increasing order.
- N: number of data points.
- n_components: number of components of the LDS.
- X_lds: 2-D numpy array with shape (N, n_components) containing the current value of the LD embedding.
- fit_U: same as for ms_ld_bandwidths.
Out:
A tuple with:
- tau_hi: a 2-D numpy array in which element tau_hi[h,i] contains tau_{hi} = 2/pi_{hi}, as defined in [2].
- D_h: if fit_U is True, a 1-D numpy array containing D_{h} at the different scales as defined in [2]. Otherwise, a dummy 1-D numpy array of the same size as when fit_U is True.
- U: U as defined in [2] if fit_U is True, or 1 otherwise.
- p_h: 1-D numpy array with the LD precisions at each scale defined by K_h.
- t_h: 1-D numpy array with the LD bandwidths at each scale defined by K_h.
"""
# Computing the multi-scale HD similarities. Element sigma_ij[i,j] contains sigma_{ij}. sigma_{ii} is set to 0. Element tau_hi[h,i] contains tau_{hi} = 2/pi_{hi}.
sigma_ij, tau_hi = ms_hd_similarities(dsm_hds=dsm_hds, arr_perp=K_h)[:2]
# Computing the LD bandwidths and precisions
D_h, U, p_h, t_h = ms_ld_bandwidths(tau_hi=tau_hi, K_h=K_h, N=N, n_components=n_components, X_lds=X_lds, fit_U=fit_U)
# Returning
return tau_hi, D_h, U, p_h, t_h
@numba.jit(nopython=True)
def mssne_lds_similarities_h(arr_den_s_i, np_eps, arr_ones, dsm_lds_min_row_dsm_lds_t):
"""
Compute the matrix of Ms SNE asymmetric similarities in the LDS at some scale.
We denote by
- dsm_lds: 2-D numpy array with shape (N, N), where N is the number of data points. Element [i,j] must be the squared LD distance between i and j. The diagonal elements are assumed to be equal to np.inf.
- dsm_lds_min_row: equal to dsm_lds.min(axis=1).
- N: number of data points.
In:
- arr_den_s_i: numpy 1-D array with N elements, where element i is the denominator of the exponentials of the LD similarities between i and j for j=0, ..., N-1. It is assumed that arr_den_s_i >= np_eps.
- np_eps: equal to np.finfo(dtype=np.float64).eps.
- arr_ones: equal to np.ones(shape=N, dtype=np.float64).
- dsm_lds_min_row_dsm_lds_t: equal to dsm_lds_min_row-dsm_lds.T.
Out:
A 2-D numpy array with shape (N, N) and in which element [i,j] = the LD similarity between i and j. The LD similarity between i and i is set to 0.
"""
# Numerators of the similarities
s_i = np.exp(dsm_lds_min_row_dsm_lds_t/arr_den_s_i)
# Correcting the diagonal of the similarities.
s_i = fill_diago(s_i, 0.0).astype(np.float64)
# Computing the N**2 LD similarities, for i, j = 0, ..., N-1, and returning.
return (s_i/np.maximum(np_eps, np.dot(arr_ones.astype(np.float64), s_i))).T
@numba.jit(nopython=True)
def mssne_sev_lds_similarities_h(arr_den_s_i, np_eps, arr_ones, dsm_lds_min_row_dsm_lds_t):
"""
Similar to mssne_lds_similarities_h, but for several data sets.
In:
- np_eps, arr_ones: same as for mssne_lds_similarities_h.
- arr_den_s_i: 2-D numpy array with shape (n_samp, N). For each k in range(n_samp), arr_den_s_i[k,:] must be a valid arr_den_s_i argument for mssne_lds_similarities_h.
- dsm_lds_min_row_dsm_lds_t: 3-D numpy array with shape (n_samp, N, N). For each k in range(n_samp), dsm_lds_min_row_dsm_lds_t[k,:,:] must be a valid dsm_lds_min_row_dsm_lds_t argument for mssne_lds_similarities_h.
Out:
A 3-D numpy array ret_v with shape (n_samp, N, N). For each k in range(n_samp), ret_v[k,:,:] is equal to mssne_lds_similarities_h(arr_den_s_i=arr_den_s_i[k,:], np_eps=np_eps, arr_ones=arr_ones, dsm_lds_min_row_dsm_lds_t=dsm_lds_min_row_dsm_lds_t[k,:,:]).
"""
# Number of data sets and number of examples per data set
n_samp, N = dsm_lds_min_row_dsm_lds_t.shape[:2]
# Initializing the array to return
ret_v = np.empty(shape=(n_samp, N, N), dtype=np.float64)
# For each data set
for k in range(n_samp):
ret_v[k,:,:] = mssne_lds_similarities_h(arr_den_s_i=arr_den_s_i[k,:], np_eps=np_eps, arr_ones=arr_ones, dsm_lds_min_row_dsm_lds_t=dsm_lds_min_row_dsm_lds_t[k,:,:])
# Returning
return ret_v
@numba.jit(nopython=True)
def mssne_eval_sim_lds(N, n_perp, t_h, np_eps, arr_ones, dsm_lds_min_row_dsm_lds_t):
"""
Evaluates the LD similarities.
In:
- N: number of data points.
- n_perp: number of perplexities which are considered.
- t_h: 1-D numpy array containing n_perp elements and in which element h contains the LD bandwidth associated with the h^th considered perplexity, which is equal to 2.0/p_h[h]. It is assumed that t_h >= np_eps.
- np_eps, arr_ones, dsm_lds_min_row_dsm_lds_t: as in function mssne_lds_similarities_h.
Out:
A tuple with:
- A 2-D numpy array with shape (N, N) containing the pairwise multi-scale LD similarities. The diagonal elements are forced to 1.0.
- A 3-D numpy array s_hij with shape (n_perp, N, N). For each h in range(n_perp), s_hij[h,:,:] contains the pairwise LD similarities at scale h.
"""
# LD single-scale similarities.
s_hij = np.empty(shape=(n_perp,N,N), dtype=np.float64)
# LD multi-scale similarities.
s_ij = np.zeros(shape=(N,N), dtype=np.float64)
# For each scale
for h in range(n_perp):
# Computing the corresponding LD similarities and updating s_ij
s_hij[h,:,:] = mssne_lds_similarities_h(arr_den_s_i=t_h[h]*arr_ones, np_eps=np_eps, arr_ones=arr_ones, dsm_lds_min_row_dsm_lds_t=dsm_lds_min_row_dsm_lds_t)
s_ij += s_hij[h,:,:]
# Scaling s_ij
s_ij /= np.float64(n_perp)
# Since s_ij is only used as a denominator, we fill its diagonal with ones, to avoid dividing by zero. This does not change the results, as the diagonal of sigma_ij is equal to 0.
s_ij = fill_diago(s_ij, 1.0)
# Setting the remaining 0 elements of s_ij to the smallest non-zero value, to avoid dividing by zero.
s_ij = np.maximum(np_eps, s_ij)
# Returning
return s_ij, s_hij
@numba.jit(nopython=True)
def mssne_sev_eval_sim_lds(N, n_perp, t_h, n_samp, np_eps, arr_ones, dsm_lds_min_row_dsm_lds_t):
"""
Evaluate mssne_eval_sim_lds for several data sets.
In:
- N, n_perp, np_eps, arr_ones, dsm_lds_min_row_dsm_lds_t: same as for mssne_eval_sim_lds.
- t_h: 2-D numpy array with shape (n_samp, n_perp). For each k in range(n_samp), t_h[k,:] contains the LD bandwidths at each scale. It is assumed that t_h >= np_eps.
- n_samp: strictly positive integer indicating the number of data sets for which the LD similarities must be computed.
Out:
A 3-D numpy array s_ij with shape (n_samp, N, N). For each i in range(n_samp), s_ij[i,:,:] contains the result of mssne_eval_sim_lds(N=N, n_perp=n_perp, t_h=t_h[i,:], np_eps=np_eps, arr_ones=arr_ones, dsm_lds_min_row_dsm_lds_t=dsm_lds_min_row_dsm_lds_t).
"""
# Initializing the similarities to return
s_ij = np.empty(shape=(n_samp, N, N), dtype=np.float64)
# For each data set
for i in range(n_samp):
s_ij[i,:,:] = mssne_eval_sim_lds(N=N, n_perp=n_perp, t_h=t_h[i,:], np_eps=np_eps, arr_ones=arr_ones, dsm_lds_min_row_dsm_lds_t=dsm_lds_min_row_dsm_lds_t)[0]
# Returning
return s_ij
def mssne_eval_dsm_lds_min_row_dsm_lds_t_dsm(dsm_lds):
"""
Evaluates the dsm_lds_min_row_dsm_lds_t parameter of function mssne_lds_similarities_h. N denotes the number of samples.
In:
- dsm_lds: numpy 2-D array with shape (N, N), containing the pairwise LD squared distances.
Out:
dsm_lds_min_row_dsm_lds_t as described in mssne_lds_similarities_h.
"""
np.fill_diagonal(a=dsm_lds, val=np.inf)
# Returning
return dsm_lds.min(axis=1)-dsm_lds.T
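# Illustrative note: element [i,j] of the returned array is dsm_lds[j,:].min() - dsm_lds[j,i]; this
# precomputed shift plays the same underflow-avoiding role in mssne_lds_similarities_h as the
# dsij.min() subtraction in sne_sim.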
def mssne_sev_eval_dsm_lds_min_row_dsm_lds_t_dsm(dsm_lds):
"""
Similar to mssne_eval_dsm_lds_min_row_dsm_lds_t_dsm but for several distance matrices.
In:
- dsm_lds: 3-D numpy array with shape (n_samp, N, N). For each k in range(n_samp), dsm_lds[k,:,:] must be a valid argument for function mssne_eval_dsm_lds_min_row_dsm_lds_t_dsm.
Out:
A 3-D numpy array ret_v with shape (n_samp, N, N). For each k in range(n_samp), ret_v[k,:,:] is equal to mssne_eval_dsm_lds_min_row_dsm_lds_t_dsm(dsm_lds=dsm_lds[k,:,:]).
"""
# Number of data sets and number of examples per data set
n_samp, N = dsm_lds.shape[:2]
# Initializing the array to return
ret_v = np.empty(shape=(n_samp, N, N), dtype=np.float64)
# For each data set
for k in range(n_samp):
ret_v[k,:,:] = mssne_eval_dsm_lds_min_row_dsm_lds_t_dsm(dsm_lds=dsm_lds[k,:,:])
# Returning
return ret_v
def mssne_eval_dsm_lds_min_row_dsm_lds_t(X):
"""
Evaluate the dsm_lds_min_row_dsm_lds_t parameter of function mssne_lds_similarities_h.
In:
- X: numpy 2-D array with shape (N, n_components), containing the current values of the LD coordinates. It contains one example per row and one LD dimension per column.
Out:
dsm_lds_min_row_dsm_lds_t as described in mssne_lds_similarities_h.
"""
# Computing the pairwise squared Euclidean distances in the LDS
dsm_lds = scipy.spatial.distance.squareform(X=scipy.spatial.distance.pdist(X=X, metric='sqeuclidean'), force='tomatrix')
# Returning
return mssne_eval_dsm_lds_min_row_dsm_lds_t_dsm(dsm_lds=dsm_lds)
def mssne_obj_fct(x, sigma_ij, N, n_components, p_h, t_h, n_perp):
"""
Compute the value of the objective function of Multi-scale SNE.
In:
- x: numpy 1-D array with N*n_components elements, containing the current values of the LD coordinates. np.reshape(a=x, newshape=(N, n_components)) should yield a 2-D array with one example per row and one LD dimension per column.
- sigma_ij: numpy 2-D array with shape (N,N). Element (i,j) should contain sigma_{ij}, as defined in [2]. Diagonal elements must be equal to 0.
- N: number of data points.
- n_components: dimension of the LDS.
- n_perp: number of perplexities which are considered.
- p_h: 1-D numpy array containing n_perp elements and in which element h contains the LD precision associated with the h^th considered perplexity.
- t_h: 1-D numpy array containing n_perp elements and in which element h contains the LD bandwidth associated with the h^th considered perplexity, which is equal to 2.0/p_h[h]. It is assumed that t_h >= np_eps.
Out:
A scalar equal to the Multi-scale SNE objective function value.
Remark:
- To use scipy optimization functions, the functions mssne_obj_fct and mssne_grad should have the same arguments.
"""
global n_eps_np_float64
# Evaluating the LD similarities.
s_ij = mssne_eval_sim_lds(N=N, n_perp=n_perp, t_h=t_h, np_eps=n_eps_np_float64, arr_ones=np.ones(shape=N, dtype=np.float64), dsm_lds_min_row_dsm_lds_t=mssne_eval_dsm_lds_min_row_dsm_lds_t(X=np.reshape(a=x, newshape=(N, n_components))))[0]
# Computing the cost function value
return scipy.special.rel_entr(sigma_ij, s_ij).sum()
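# Illustrative note: scipy.special.rel_entr(p, q) computes p*log(p/q) elementwise (and 0 where
# p = 0), so the returned cost is the Kullback-Leibler divergence
# sum_{i,j} sigma_ij*log(sigma_ij/s_ij) of [2]; the diagonal does not contribute since sigma_ii = 0.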
def mssne_sev_obj_fct(x, sigma_ij, N, n_components, p_h, t_h, n_perp):
"""
Evaluate the objective function of Ms SNE for multiple HD similarities and return the mean of the evaluations.
In:
- sigma_ij: numpy 3-D array with shape (n_samp, N, N). For each k in range(n_samp), sigma_ij[k,i,j] should contain sigma_{ij}, as defined in [2], for the k^th data set. Elements for which i=j must be equal to 0.
- p_h: 2-D numpy array with shape (n_samp, n_perp). For each k in range(n_samp), p_h[k,:] contains the LD precisions at each scale.
- t_h: 2-D numpy array with shape (n_samp, n_perp). For each k in range(n_samp), t_h[k,:] contains the LD bandwidths at each scale. It is assumed that t_h >= np_eps.
- x, N, n_components, n_perp: same as in mssne_obj_fct.
Out:
A float being the mean of the calls mssne_obj_fct(x=x, sigma_ij=sigma_ij[k,:,:], N=N, n_components=n_components, p_h=p_h[k,:], t_h=t_h[k,:], n_perp=n_perp) for each k in range(n_samp).
"""
global n_eps_np_float64
n_samp = sigma_ij.shape[0]
# Evaluating the LD similarities for the different data sets. For each i in range(sigma_ij.shape[0]), s_ij[i,:,:] has its diagonal elements forced to zero.
s_ij = mssne_sev_eval_sim_lds(N=N, n_perp=n_perp, t_h=t_h, n_samp=n_samp, np_eps=n_eps_np_float64, arr_ones=np.ones(shape=N, dtype=np.float64), dsm_lds_min_row_dsm_lds_t=mssne_eval_dsm_lds_min_row_dsm_lds_t(X=np.reshape(a=x, newshape=(N, n_components))))
# Computing the mean cost function value
return np.sum(scipy.special.rel_entr(sigma_ij, s_ij))/np.float64(n_samp)
@numba.jit(nopython=True)
def mssne_eval_grad(N, n_perp, t_h, np_eps, arr_ones, dsm_lds_min_row_dsm_lds_t, n_components, p_h, X, sigma_ij):
"""
Evaluate the Ms SNE gradient.
In:
- N: number of data points.
- n_perp: number of perplexities which are considered.
- t_h: 1-D numpy array containing n_perp elements and in which element h contains the LD bandwidth associated with the h^th considered perplexity, which is equal to 2.0/p_h[h]. It is assumed that t_h >= np_eps.
- np_eps, arr_ones, dsm_lds_min_row_dsm_lds_t: same as in function mssne_lds_similarities_h.
- n_components: dimension of the LDS.
- p_h: 1-D numpy array containing n_perp elements and in which element h contains the LD precision associated with the h^th considered perplexity.
- X: numpy 2-D array with shape (N, n_components), containing the current values of the LD coordinates. It contains one example per row and one LD dimension per column.
- sigma_ij: numpy 3-D array with shape (n_samp, N, N). For each k in range(n_samp), sigma_ij[k,i,j] should contain sigma_{ij}, as defined in [2], for the k^th data set. Elements for which i=j must be equal to 0.
Out:
A 2-D numpy array with shape (N, n_components) containing the evaluation of the Ms SNE gradient.
"""
# Computing the LD similarities
s_ij, s_hij = mssne_eval_sim_lds(N=N, n_perp=n_perp, t_h=t_h, np_eps=np_eps, arr_ones=arr_ones, dsm_lds_min_row_dsm_lds_t=dsm_lds_min_row_dsm_lds_t)
# Computing the quotient of sigma_ij by s_ij
ss_ij = sigma_ij/s_ij
# Computing the gradient
grad = np.zeros(shape=(N, n_components), dtype=np.float64)
for h in range(n_perp):
# Computing the product between ss_ij and s_hij[h,:,:], summing over the columns, subtracting the result from each row of ss_ij and multiplying by s_hij[h,:,:].
Mh = s_hij[h,:,:]*((ss_ij.T - np.dot(ss_ij*s_hij[h,:,:], arr_ones)).T)
Mh += Mh.T
# Updating the gradient
grad += p_h[h]*((X.T*np.dot(Mh, arr_ones)).T - np.dot(Mh, X))
# Returning
return grad/np.float64(n_perp)
@numba.jit(nopython=True)
def mssne_eval_sev_grad(N, n_perp, t_h, np_eps, arr_ones, dsm_lds_min_row_dsm_lds_t, n_components, p_h, X, sigma_ij, n_samp):
"""
Evaluate the mean Ms SNE gradient over several data sets.
In:
- N, n_perp, np_eps, arr_ones, dsm_lds_min_row_dsm_lds_t, n_components, X: same as for mssne_eval_grad.
- t_h: 2-D numpy array with shape (n_samp, n_perp). For each k in range(n_samp), t_h[k,:] contains the LD bandwidths at each scale. It is assumed that t_h >= np_eps.
- n_samp: strictly positive integer indicating the number of data sets for which the similarities must be computed.
- sigma_ij: numpy 3-D array with shape (n_samp, N, N). For each k in range(n_samp), sigma_ij[k,i,j] should contain sigma_{ij}, as defined in [2], for the k^th data set. Elements for which i=j must be equal to 0.
- p_h: 2-D numpy array with shape (n_samp, n_perp). For each k in range(n_samp), p_h[k,:] contains the LD precisions at each scale.
Out:
A 2-D numpy array with shape (N, n_components) containing the mean of the Ms SNE gradients of the n_samp data sets provided through the arguments.
"""
# Initializing the gradient
grad = np.zeros(shape=(N, n_components), dtype=np.float64)
# For each data set
for i in range(n_samp):
grad += mssne_eval_grad(N=N, n_perp=n_perp, t_h=t_h[i,:], np_eps=np_eps, arr_ones=arr_ones, dsm_lds_min_row_dsm_lds_t=dsm_lds_min_row_dsm_lds_t, n_components=n_components, p_h=p_h[i,:], X=X, sigma_ij=sigma_ij[i,:,:])
# Returning the gradient mean
return grad/np.float64(n_samp)
def mssne_grad(x, sigma_ij, N, n_components, p_h, t_h, n_perp):
"""
Compute the value of the gradient of the objective function of Multi-scale SNE.
In:
- x: numpy 1-D array with N*n_components elements, containing the current values of the LD coordinates. np.reshape(a=x, newshape=(N, n_components)) should yield a 2-D array with one example per row and one LD dimension per column.
- sigma_ij: numpy 2-D array with shape (N,N). Element (i,j) should contain sigma_{ij}, as defined in [2]. Diagonal elements must be equal to 0.
- N: number of data points.
- n_components: dimension of the LDS.
- n_perp: number of perplexities which are considered.
- p_h: 1-D numpy array containing n_perp elements and in which element h contains the LD precision associated with the h^th considered perplexity.
- t_h: 1-D numpy array containing n_perp elements and in which element h contains the LD bandwidth associated with the h^th considered perplexity, which is equal to 2.0/p_h[h]. It is assumed that t_h >= n_eps_np_float64.
Out:
A 1-D numpy array with N*n_components elements, where element i is the coordinate of the gradient associated with x[i].
Remark:
- In order to use the scipy optimization functions, the functions mssne_obj_fct and mssne_grad should have the same arguments.
"""
global n_eps_np_float64
X = np.reshape(a=x, newshape=(N, n_components))
# Evaluating the gradient
grad = mssne_eval_grad(N=N, n_perp=n_perp, t_h=t_h, np_eps=n_eps_np_float64, arr_ones=np.ones(shape=N, dtype=np.float64), dsm_lds_min_row_dsm_lds_t=mssne_eval_dsm_lds_min_row_dsm_lds_t(X=X), n_components=n_components, p_h=p_h, X=X, sigma_ij=sigma_ij)
# Returning the reshaped gradient
return np.reshape(a=grad, newshape=N*n_components)
def mssne_sev_grad(x, sigma_ij, N, n_components, p_h, t_h, n_perp):
"""
Evaluate the gradient of Ms SNE for multiple HD similarities and return the mean of the evaluations.
In:
- sigma_ij: numpy 3-D array with shape (n_samp, N, N). For each k in range(n_samp), sigma_ij[k,i,j] should contain sigma_{ij}, as defined in [2], for the k^th data set. Elements for which i=j must be equal to 0.
- p_h: 2-D numpy array with shape (n_samp, n_perp). For each k in range(n_samp), p_h[k,:] contains the LD precisions at each scale.
- t_h: 2-D numpy array with shape (n_samp, n_perp). For each k in range(n_samp), t_h[k,:] contains the LD bandwidths at each scale. It is assumed that t_h >= n_eps_np_float64.
- x, N, n_components, n_perp: same as in mssne_grad.
Out:
A 1-D numpy array with N*n_components elements, where element i is the mean coordinate of the gradient associated with x[i].
"""
X = np.reshape(a=x, newshape=(N, n_components))
# Evaluating the gradient
grad = mssne_eval_sev_grad(N=N, n_perp=n_perp, t_h=t_h, np_eps=np.finfo(dtype=np.float64).eps, arr_ones=np.ones(shape=N, dtype=np.float64), dsm_lds_min_row_dsm_lds_t=mssne_eval_dsm_lds_min_row_dsm_lds_t(X=X), n_components=n_components, p_h=p_h, X=X, sigma_ij=sigma_ij, n_samp=sigma_ij.shape[0])
# Returning the reshaped gradient
return np.reshape(a=grad, newshape=N*n_components)
def mssne_sim_hds_bandwidth(X_hds, K_h, N, n_components, X_lds, fit_U=True, dsm_hds=None, dm_fct=None):
"""
Evaluate the multi-scale HD and LD bandwidths and precisions, or directly return the HD similarities at the different scales.
In:
- X_hds: 2-D numpy.ndarray with shape (N, M), containing the HD data set, with one example per row and one dimension per column. If None, dsm_hds must be specified and different from None, otherwise an error is raised. X_hds is only used if dsm_hds is set to None.
- K_h: 1-D numpy array with the perplexities at each scale in increasing order.
- N: number of data points.
- n_components: number of components in the LDS.
- X_lds: 2-D numpy array with shape (N, n_components) containing the current value of the LD embedding.
- fit_U: same as for ms_ld_bandwidths.
- dsm_hds: (optional) 2-D numpy.ndarray with shape (N,N) containing the pairwise SQUARED HD distances between the data points in X_hds. dsm_hds[i,j] stores the SQUARED HD distance between X_hds[i,:] and X_hds[j,:]. If dsm_hds is specified and not None, X_hds is not used and can be set to None. If dsm_hds is None, it is deduced from X_hds using squared Euclidean distances if dm_fct is None, or using dm_fct(X_hds)**2 if dm_fct is not None. Hence, if dsm_hds is None, X_hds cannot be None, otherwise an error is raised.
- dm_fct: (optional) a function taking X_hds as argument and returning a 2-D np.ndarray dm_hds with shape (N,N) containing the pairwise HD distances (NOT squared) between the data points in X_hds. dm_hds[i,j] stores the HD distance (NOT squared) between X_hds[i,:] and X_hds[j,:]. If dsm_hds is specified and not None, X_hds and dm_fct are not used and can be set to None. If dsm_hds is None, it is deduced from X_hds using squared Euclidean distance if dm_fct is None, or using dm_fct(X_hds)**2 if dm_fct is not None. Hence, dm_fct is only used if dsm_hds is None and in this case, X_hds cannot be None, otherwise an error is raised. An example of a valid function for the dm_fct parameter is the eucl_dist_matr one.
Out:
If fit_U is True, a tuple with:
- dsm_hds: 2-D numpy array with the squared pairwise HD distances between the data points.
- tau_hi: a 2-D numpy array in which element tau_hi[h,i] contains tau_{hi} = 2/pi_{hi}, as defined in [2].
- p_h: 1-D numpy array with the LD precisions at each scale defined by K_h.
- t_h: 1-D numpy array with the LD bandwidths at each scale defined by K_h.
If fit_U is False, return a 3-D numpy array sim_hij with shape (K_h.size, N, N) where sim_hij[h,:,:] contains the HD similarities at scale K_h[h].
"""
global module_name
# Computing the squared HD distances.
if dsm_hds is None:
if X_hds is None:
raise ValueError("Error in function mssne_sim_hds_bandwidth of module {module_name}: if dsm_hds is None, X_hds cannot be None.".format(module_name=module_name))
if dm_fct is None:
dsm_hds = scipy.spatial.distance.squareform(X=scipy.spatial.distance.pdist(X=X_hds, metric='sqeuclidean'), force='tomatrix')
else:
dsm_hds = (dm_fct(X_hds)**2).astype(np.float64)
if fit_U:
# Computing the multi-scale HD and LD bandwidths and precisions
tau_hi, D_h, U, p_h, t_h = ms_hld_bandwidths(dsm_hds=dsm_hds, K_h=K_h, N=N, n_components=n_components, X_lds=X_lds, fit_U=fit_U)
return dsm_hds, tau_hi, p_h, t_h
else:
# Returning the HD similarities at the scales indicated by K_h
return ms_hd_similarities(dsm_hds=dsm_hds, arr_perp=K_h)[2]
def mssne_sev_sim_hds_bandwidth(X_hds_sev, K_h, N, n_components, X_lds, fit_U=True, dm_fct=None):
"""
Call the function mssne_sim_hds_bandwidth on multiple HD data sets.
In:
- X_hds_sev: 3-D numpy array with shape (n_samp, N, M). For i in range(n_samp), X_hds_sev[i,:,:] contains a HD data set, with one example per row and one dimension per column.
- K_h: 1-D numpy array with the perplexities at each scale in increasing order. Its size is denoted by L.
- N: number of data points.
- n_components: number of components of the LDS.
- X_lds: 2-D numpy array with shape (N, n_components) containing the current value of the LD embedding.
- fit_U: same as in mssne_sev_implem.
- dm_fct: same as in mssne_sim_hds_bandwidth function. See mssne_sim_hds_bandwidth for a description.
Out:
If fit_U is True, a tuple with:
- dsm_hds: 3-D numpy array with shape (n_samp, N, N). For each i in range(n_samp), dsm_hds[i,:,:] contains the squared pairwise HD distances between the data points in X_hds_sev[i,:,:].
- tau_hi: a 3-D numpy array with shape (n_samp, L, N). For each k in range(n_samp), tau_hi[k,h,i] contains tau_{hi} = 2/pi_{hi}, as defined in [2], for the data set X_hds_sev[k,:,:].
- p_h: 2-D numpy array with shape (n_samp, L). For each k in range(n_samp), p_h[k,:] contains the LD precisions at each scale defined by K_h for the data set X_hds_sev[k,:,:].
- t_h: 2-D numpy array with shape (n_samp, L). For each k in range(n_samp), t_h[k,:] contains the LD bandwidths at each scale defined by K_h for the data set X_hds_sev[k,:,:].
If fit_U is False, a tuple with:
- sim_hij: a 3-D numpy array with shape (K_h.size, N, N). sim_hij[h,:,:] contains the expectation of the HD similarities at scale K_h[h] over the HD data sets.
- p_h: 1-D numpy array with K_h.size elements, containing the LD precisions at each scale.
- t_h: 1-D numpy array with K_h.size elements, containing the LD bandwidths at each scale.
"""
n_samp = X_hds_sev.shape[0]
# Number of scales
L = K_h.size
# Initializing the returned values
if fit_U:
dsm_hds, tau_hi, p_h, t_h = np.empty(shape=(n_samp, N, N), dtype=np.float64), np.empty(shape=(n_samp, L, N), dtype=np.float64), np.empty(shape=(n_samp, L), dtype=np.float64), np.empty(shape=(n_samp, L), dtype=np.float64)
else:
# Computing the LD bandwidths and precisions. The value of tau_hi does not matter as fit_U is False; it must still be set to a numpy array of np.float64 for numba compatibility.
p_h, t_h = ms_ld_bandwidths(tau_hi=np.empty(shape=(L, N), dtype=np.float64), K_h=K_h, N=N, n_components=n_components, X_lds=X_lds, fit_U=fit_U)[2:]
# Initializing the HD similarities at each scale that will be returned
sim_hij = np.zeros(shape=(L, N, N), dtype=np.float64)
# For each data set
for k in range(n_samp):
ret_k = mssne_sim_hds_bandwidth(X_hds=X_hds_sev[k,:,:], K_h=K_h, N=N, n_components=n_components, X_lds=X_lds, fit_U=fit_U, dm_fct=dm_fct)
if fit_U:
dsm_hds[k,:,:], tau_hi[k,:,:], p_h[k,:], t_h[k,:] = ret_k
else:
sim_hij += ret_k
# Returning
if fit_U:
return dsm_hds, tau_hi, p_h, t_h
else:
# Normalizing sim_hij before returning. This enables computing the expectation of the HD similarities over the HD data sets.
sim_hij /= np.float64(n_samp)
return sim_hij, p_h, t_h
def mssne_manage_seed(seed_mssne=None):
"""
Manage the random seed in mssne_implem and mssne_sev_implem.
In:
- seed_mssne: an integer or None. If it is None, it is set to seed_MsSNE_def. If it is not an integer, an error is raised.
Out:
If seed_mssne > 0, np.random.RandomState(seed_mssne) is returned. Otherwise, np.random is returned.
"""
global seed_MsSNE_def, module_name
if seed_mssne is None:
seed_mssne = seed_MsSNE_def
if seed_mssne != int(round(seed_mssne)):
raise ValueError("Error in function mssne_manage_seed of module {module_name}: seed_mssne must be an integer.".format(module_name=module_name))
if seed_mssne > 0:
return np.random.RandomState(seed_mssne)
else:
return np.random
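# Illustrative example: mssne_manage_seed(40) returns np.random.RandomState(40), making runs
# reproducible, while mssne_manage_seed(0) (or any seed <= 0) returns the global np.random module.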
def mssne_implem(X_hds, init='pca', n_components=2, ret_sim_hds=False, fit_U=True, dm_hds=None, seed_mssne=None):
"""
This function applies Multi-scale SNE to reduce the dimension of a data set [2].
In:
- X_hds: 2-D numpy.ndarray with shape (N, M), containing the HD data set, with one example per row and one dimension per column, or None. If X_hds is not None, it is assumed that it does not contain duplicated examples. X_hds can only be None if dm_hds is not None and init is not set to 'pca', otherwise an error is raised.
- init: determines the initialization of the LD embedding.
---> If isinstance(init, str) is True:
------> If init is equal to 'pca', the LD embedding is initialized with the first n_components principal components of X_hds. X_hds cannot be None in this case, even if dm_hds is specified, otherwise an error is raised.
------> If init is equal to 'random', the LD embedding is initialized randomly, using an isotropic Gaussian distribution with small variance. X_hds may be set to None in this case if dm_hds is specified.
------> Otherwise an error is raised.
---> If isinstance(init, np.ndarray) is True:
------> init must in this case be a 2-D numpy array, with N rows and n_components columns. It stores the LD positions to use for the initialization, with one example per row and one LD dimension per column. init[i,:] contains the initial LD coordinates for the HD sample X_hds[i,:]. X_hds may be set to None in this case if dm_hds is specified. If init.ndim != 2 or init.shape[0] != N or init.shape[1] != n_components, an error is raised.
---> Otherwise, an error is raised.
- n_components: dimension of the LDS.
- ret_sim_hds: boolean. If True, the multi-scale HD similarities are also returned.
- fit_U: boolean indicating whether to fit the U in the definition of the LD similarities in [2]. If True, the U is tuned as in [2]. Otherwise, it is forced to 1. Setting fit_U to True usually tends to slightly improve DR quality at the expense of slightly increasing computation time.
- dm_hds: (optional) 2-D numpy array with the pairwise HD distances (NOT squared) between the data points. If dm_hds is None, it is deduced from X_hds using Euclidean distances. If dm_hds is not None, then the pairwise HD distances are not recomputed and X_hds may either be None or defined; if X_hds is not None, it will only be used if init is set to 'pca', to initialize the LD embedding to the first n_components principal components of X_hds. Hence, if both dm_hds and X_hds are not None and if init is set to 'pca', dm_hds and X_hds are assumed to be compatible, i.e. dm_hds[i,j] is assumed to store the HD distance between X_hds[i,:] and X_hds[j,:].
- seed_mssne: seed to use for the random state. Check mssne_manage_seed for a description.
Out:
If ret_sim_hds is False, a 2-D numpy.ndarray X_lds with shape (N, n_components), containing the LD data set, with one example per row and one dimension per column. X_lds[i,:] contains the LD coordinates of the HD sample X_hds[i,:].
If ret_sim_hds is True, a tuple with:
- X_lds as described in the case of ret_sim_hds = False.
- a 2-D numpy array sigma_ij with shape (N, N), where N is the number of samples. It contains the multi-scale pairwise HD similarities between the samples. sigma_ij[i,j] stores the multi-scale HD similarity between X_hds[i,:] and X_hds[j,:].
Remarks:
- L-BFGS algorithm is used, as suggested in [2].
- Multi-scale optimization is performed, as presented in [2].
- Euclidean distances are employed to evaluate the pairwise HD similarities by default, as in [1]. Other distances can be used in the HD space by specifying the dm_hds parameter. Euclidean distances are employed in the LD embedding.
"""
global dr_nitmax, dr_gtol, dr_ftol, dr_maxls, dr_maxcor, n_eps_np_float64, module_name
# Checking the value of dm_hds
dm_hds_none = dm_hds is None
if dm_hds_none:
dsm_hds = None
if X_hds is None:
raise ValueError("Error in function mssne_implem of module {module_name}: X_hds and dm_hds cannot both be None.".format(module_name=module_name))
else:
dsm_hds = dm_hds**2
dsm_hds = dsm_hds.astype(np.float64)
# Defining the random state
rand_state = mssne_manage_seed(seed_mssne)
# Number of data points
if dm_hds_none:
N = X_hds.shape[0]
else:
N = dsm_hds.shape[0]
if fit_U:
# arr_ones must be of dtype np.float64, as documented in mssne_lds_similarities_h.
arr_ones = np.ones(shape=N, dtype=np.float64)
# Product of N and n_components
prod_N_nc = N*n_components
# Maximum number of L-BFGS steps at each stage of the multi-scale optimization.
nit_max = dr_nitmax
# Tolerance for the norm of the gradient in the L-BFGS algorithm
gtol = dr_gtol
# Tolerance for the relative update of the objective function value.
ftol = dr_ftol
# Smallest float
np_eps = n_eps_np_float64
# Function to compute the gradient of the objective.
fct_grad = mssne_grad
# Function to compute the objective value
fct_obj = mssne_obj_fct
# Maximum number of line search steps per L-BFGS iteration.
maxls = dr_maxls
# Maximum number of variable metric corrections used to define the limited memory matrix in L-BFGS.
maxcor = dr_maxcor
# Defining K_star for the multi-scale perplexities, following the notations of [2].
K_star = 2
# Computing the multi-scale perplexities, following the notations of [2].
L, K_h = ms_perplexities(N=N, K_star=K_star)
# Initializing the LD embedding.
X_lds = init_lds(X_hds=X_hds, N=N, init=init, n_components=n_components, rand_state=rand_state)
# Computing the multi-scale HD bandwidths if fit_U is True, and the HD similarities at the different scales otherwise. We also evaluate the LD bandwidths and precisions.
retv_mssne_sim = mssne_sim_hds_bandwidth(X_hds=X_hds, K_h=K_h, N=N, n_components=n_components, X_lds=X_lds, fit_U=fit_U, dsm_hds=dsm_hds)
if fit_U:
dsm_hds, tau_hi, p_h, t_h = retv_mssne_sim
dsm_hds_min_row_dsm_lds_t = mssne_eval_dsm_lds_min_row_dsm_lds_t_dsm(dsm_lds=dsm_hds)
else:
sim_hij_allh = retv_mssne_sim
p_h, t_h = ms_ld_bandwidths(tau_hi=np.empty(shape=(L, N), dtype=np.float64), K_h=K_h, N=N, n_components=n_components, X_lds=X_lds, fit_U=fit_U)[2:]
# Note that t_h >= np_eps as ensured in function ms_ld_bandwidths
# Reshaping X_lds as the optimization functions work with 1-D arrays.
X_lds = np.reshape(a=X_lds, newshape=prod_N_nc)
# Matrix storing the multi-scale HD similarities sigma_{ij}. Element [i,j] contains sigma_{ij}. sigma_{ii} is set to 0. We need to recompute them progressively by adding perplexities one at a time as we perform multi-scale optimization.
sigma_ij = np.zeros(shape=(N,N), dtype=np.float64)
# Multi-scale optimization. n_perp is the number of currently considered perplexities.
for n_perp in range(1, L+1, 1):
# Index of the currently added perplexity.
h = L-n_perp
# LD precisions
cur_p_h = p_h[h:]
# LD bandwidths
cur_t_h = t_h[h:]
# Computing the N**2 HD similarities sigma_{hij} if fit_U is True, using the bandwidths tau_hi which were already computed. Otherwise, we just need to gather them.
if fit_U:
sigma_hij = mssne_lds_similarities_h(arr_den_s_i=tau_hi[h,:], np_eps=np_eps, arr_ones=arr_ones, dsm_lds_min_row_dsm_lds_t=dsm_hds_min_row_dsm_lds_t)
else:
sigma_hij = sim_hij_allh[h,:,:]
# Updating the multi-scale HD similarities
sigma_ij = (sigma_ij*(np.float64(n_perp)-1.0) + sigma_hij)/np.float64(n_perp)
# Defining the arguments of the L-BFGS algorithm
args = (sigma_ij, N, n_components, cur_p_h, cur_t_h, n_perp)
# Running L-BFGS
res = scipy.optimize.minimize(fun=fct_obj, x0=X_lds, args=args, method='L-BFGS-B', jac=fct_grad, bounds=None, callback=None, options={'disp':False, 'maxls':maxls, 'gtol':gtol, 'maxiter':nit_max, 'maxcor':maxcor, 'maxfun':np.inf, 'ftol':ftol})
X_lds = res.x
# Reshaping the result
X_lds = np.reshape(a=X_lds, newshape=(N, n_components))
# Returning
if ret_sim_hds:
return X_lds, sigma_ij
else:
return X_lds
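# Illustrative usage sketch (with hypothetical data; the demo at the end of this file gives a
# complete example):
# X_hds = np.random.RandomState(0).randn(500, 10)
# X_lds = mssne_implem(X_hds=X_hds, init='pca', n_components=2) # X_lds has shape (500, 2)
# X_lds, sigma_ij = mssne_implem(X_hds=X_hds, ret_sim_hds=True) # also returns the HD similarities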
def mssne_sev_implem(X_hds_sev, init='random', n_components=2, fit_U=False, dm_fct=None, seed_mssne=None):
"""
Similar to mssne_implem, but minimizes the mean cost function computed over several data sets.
This function can be used to compute a LD embedding of an incomplete data set by minimizing the expectation of the multi-scale SNE cost function, which is estimated thanks to multiple imputations, as detailed in [1].
In: