#!/usr/bin/env python
# -*- coding: utf-8 -*-
import time
import math
import numpy as np
from collections import defaultdict, Counter
from numpy import random as np_random
from six import string_types
from gensim import utils, matutils
from GensimBasicKeyedVectors import Vocab, KeyedVectorsBase
try:
from autograd import grad # Only required for optionally verifying gradients while training
from autograd import numpy as grad_np
AUTOGRAD_PRESENT = True
except ImportError:
AUTOGRAD_PRESENT = False
class DAGEmbeddingBatch(object):
"""Compute gradients and loss for a training batch.
Class for computing gradients and loss for a training batch and storing intermediate state to
avoid recomputing multiple times.
"""
def __init__(self,
vectors_u,
vectors_v,
indices_u,
indices_v,
rels_reversed,
dag_embedding_model):
"""
Initialize instance with sets of vectors for which distances are to be computed.
Parameters
----------
vectors_u : numpy.array
Vectors of all nodes `u` in the batch.
Expected shape (1, dim, batch_size).
vectors_v : numpy.array
Vectors of all positively related nodes `v` and negatively sampled nodes `v'`,
for each node `u` in the batch.
Expected shape (1 + neg_size, dim, batch_size).
indices_u : list
List of node indices for each of the vectors in `vectors_u`.
indices_v : list
Nested list of lists, each of which is a list of node indices
for each of the vectors in `vectors_v` for a specific node `u`.
rels_reversed : bool
Whether the true edges go from u to v, or are reversed (from v to u).
"""
self.vectors_u = vectors_u # (1, dim, batch_size)
self.vectors_v = vectors_v # (1 + neg_size, dim, batch_size)
self.indices_u = indices_u
self.indices_v = indices_v
self.norms_u = np.linalg.norm(self.vectors_u, axis=1) # (1, batch_size)
self.norms_u_sq = self.norms_u ** 2
self.norms_v = np.linalg.norm(self.vectors_v, axis=1) # (1 + neg_size, batch_size)
self.norms_v_sq = self.norms_v ** 2
self.one_minus_norms_sq_u = 1 - self.norms_u_sq # (1, batch_size)
self.one_minus_norms_sq_v = 1 - self.norms_v_sq # (1 + neg_size, batch_size)
self.rels_reversed = rels_reversed
self.loss_gradients_u = None
self.loss_gradients_v = None
self.loss = None
self.pos_loss = None # positive part of the loss
self.neg_loss = None # negative part of the loss
self._loss_gradients_computed = False
self._loss_computed = False
def compute_all(self):
"""Convenience method to perform all computations."""
self._compute_loss()
self._compute_loss_gradients()
def _compute_loss(self):
"""Compute and store loss value for the given batch of examples."""
raise NotImplementedError
def _compute_loss_gradients(self):
"""Compute and store gradients of loss function for all input vectors."""
raise NotImplementedError
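# A concrete batch class is expected to fill in the two hooks above. A minimal, hypothetical
# sketch (not part of this module) of what such a subclass looks like:
#
#   class MyBatch(DAGEmbeddingBatch):
#       def _compute_loss(self):
#           # set self.loss, self.pos_loss and self.neg_loss from self.vectors_u / self.vectors_v
#           self._loss_computed = True
#       def _compute_loss_gradients(self):
#           # set self.loss_gradients_u with shape (dim, batch_size) and
#           # self.loss_gradients_v with shape (1 + neg_size, dim, batch_size)
#           self._loss_gradients_computed = True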
class DAGEmbeddingKeyedVectors(KeyedVectorsBase):
"""Class to contain vectors and vocab for the
:class:`~gensim.models.poincare.DAGEmbeddingKeyedVectorsModel` training class.
Used to perform operations on the vectors such as vector lookup, distance etc.
"""
def __init__(self):
super(DAGEmbeddingKeyedVectors, self).__init__()
def vector_distance(self, vector_1, vector_2):
"""
Return the distance between two input vectors. Convenience method over `vector_distance_batch`.
Parameters
----------
vector_1 : numpy.array
input vector
vector_2 : numpy.array
input vector
Returns
-------
numpy.float
Distance between `vector_1` and `vector_2`.
"""
return self.vector_distance_batch(vector_1, vector_2[np.newaxis, :])[0]
def distance(self, w1, w2):
"""
Return distance between vectors for nodes `w1` and `w2`.
Parameters
----------
w1 : str or int
Key for first node.
w2 : str or int
Key for second node.
Returns
-------
float
distance between the vectors for nodes `w1` and `w2`.
Examples
--------
>>> model.distance('mammal.n.01', 'carnivore.n.01')
2.13
Notes
-----
Raises KeyError if either of `w1` and `w2` is absent from vocab.
"""
vector_1 = self.word_vec(w1)
vector_2 = self.word_vec(w2)
return self.vector_distance(vector_1, vector_2)
def most_similar(self, node_or_vector, topn=10, restrict_vocab=None):
"""
Find the top-N most similar nodes to the given node or vector, sorted in increasing order of distance.
Parameters
----------
node_or_vector : str/int or numpy.array
node key or vector for which similar nodes are to be found.
topn : int or None, optional
number of similar nodes to return; if `None`, returns all.
restrict_vocab : int or None, optional
Optional integer which limits the range of vectors which are searched for most-similar values.
For example, restrict_vocab=10000 would only check the first 10000 node vectors in the vocabulary order.
This may be meaningful if vocabulary is sorted by descending frequency.
Returns
--------
list of tuples (str, float)
List of tuples containing (node, distance) pairs in increasing order of distance.
Examples
--------
>>> vectors.most_similar('lion.n.01')
[('lion_cub.n.01', 0.4484), ('lionet.n.01', 0.6552), ...]
"""
if not restrict_vocab:
all_distances = self.distances(node_or_vector)
else:
nodes_to_use = self.index2word[:restrict_vocab]
all_distances = self.distances(node_or_vector, nodes_to_use)
if isinstance(node_or_vector, string_types + (int,)):
node_index = self.vocab[node_or_vector].index
else:
node_index = None
if not topn:
closest_indices = matutils.argsort(all_distances)
else:
closest_indices = matutils.argsort(all_distances, topn=1 + topn)
result = [
(self.index2word[index], float(all_distances[index]))
for index in closest_indices if (node_index is None or index != node_index) # ignore the input node
]
if topn:
result = result[:topn]
return result
def vector_distance_batch(self, vector_1, vectors_all):
"""
Return distances between one vector and a set of other vectors.
Parameters
----------
vector_1 : numpy.array
vector from which distances are to be computed.
expected shape (dim,)
vectors_all : numpy.array
for each row in vectors_all, distance from vector_1 is computed.
expected shape (num_vectors, dim)
Returns
-------
numpy.array
Contains distance between vector_1 and each row in vectors_all.
shape (num_vectors,)
"""
raise NotImplementedError
def distances_from_indices(self, node_index, other_indices=()):
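"""Return distances from the node at `node_index` to the nodes at `other_indices`,
or to all nodes in the vocab if `other_indices` is empty. Delegates to `vector_distance_batch`."""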
assert node_index < len(self.syn0)
input_vector = self.syn0[node_index]
if not other_indices:
other_vectors = self.syn0
else:
other_vectors = self.syn0[other_indices]
return self.vector_distance_batch(input_vector, other_vectors)
def is_a_scores_vector_batch(self, alpha, parent_vectors, other_vectors, rel_reversed):
raise NotImplementedError
def is_a_scores_from_indices(self, alpha, parent_indices, other_indices, rel_reversed):
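"""Look up the vectors for `parent_indices` and `other_indices` and delegate to
`is_a_scores_vector_batch`, which concrete subclasses implement."""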
parent_vectors = self.syn0[parent_indices]
other_vectors = self.syn0[other_indices]
return self.is_a_scores_vector_batch(alpha, parent_vectors, other_vectors, rel_reversed)
def distances(self, node_or_vector, other_nodes=()):
"""
Compute distances from given node or vector to all nodes in `other_nodes`.
If `other_nodes` is empty, return distance between `node_or_vector` and all nodes in vocab.
Parameters
----------
node_or_vector : str/int or numpy.array
Node key or vector from which distances are to be computed.
other_nodes : iterable of str/int or None
For each node in `other_nodes` distance from `node_or_vector` is computed.
If None or empty, distance of `node_or_vector` from all nodes in vocab is computed (including itself).
Returns
-------
numpy.array
Array containing distances to all nodes in `other_nodes` from input `node_or_vector`,
in the same order as `other_nodes`.
Examples
--------
>>> model.distances('mammal.n.01', ['carnivore.n.01', 'dog.n.01'])
np.array([2.1199, 2.0710])
>>> model.distances('mammal.n.01')
np.array([0.43753847, 3.67973852, ..., 6.66172886])
Notes
-----
Raises KeyError if either `node_or_vector` or any node in `other_nodes` is absent from vocab.
"""
if isinstance(node_or_vector, string_types):
input_vector = self.word_vec(node_or_vector)
else:
input_vector = node_or_vector
if not other_nodes:
other_vectors = self.syn0
else:
other_indices = [self.vocab[node].index for node in other_nodes]
other_vectors = self.syn0[other_indices]
return self.vector_distance_batch(input_vector, other_vectors)
###################################################################################################
class DAGEmbeddingModel(utils.SaveLoad):
"""Class for training, using and evaluating DAG Embeddings.
The model can be stored/loaded via its :meth:`~DAGEmbeddingModel.save`
and :meth:`~DAGEmbeddingModel.load` methods, or stored/loaded in the word2vec format
via `model.kv.save_word2vec_format` and :meth:`~DAGEmbeddingKeyedVectors.load_word2vec_format`.
Note that training cannot be resumed from a model loaded via `load_word2vec_format`; if you wish to train further,
use the :meth:`~DAGEmbeddingModel.save` and :meth:`~DAGEmbeddingModel.load` methods instead.
"""
def __init__(self,
train_data,
dim=50, # Number of dimensions of the trained model.
init_range=(-0.0001, 0.0001), # Range within which the vectors are randomly initialized.
lr=0.1, # Learning rate for training.
opt='', # rsgd or exp_map or sgd
burn_in=50, # Number of epochs to use for burn-in initialization (0 means no burn-in).
seed=0,
logger=None,
BatchClass=DAGEmbeddingBatch,
KeyedVectorsClass=DAGEmbeddingKeyedVectors,
num_negative=10, # Number of negative samples to use.
### How to sample negatives for an edge (u,v)
neg_sampl_strategy='true_neg', # 'all' (all nodes used for negative sampling) or 'true_neg' (only nodes not connected to u)
# 'all_non_leaves' or 'true_neg_non_leaves'
where_not_to_sample='ancestors', # both or ancestors or children. Has no effect if neg_sampl_strategy = 'all'.
neg_edges_attach='child', # How to form negative edges: 'parent' (u,v') or 'child' (u', v) or 'both'
always_v_in_neg=True, # always include the true edge (u,v) as negative.
neg_sampling_power=0.0, # 0 for uniform, 1 for unigram, 0.75 for word2vec
):
"""Initialize and train a DAG embedding model from an iterable of relations.
Parameters
----------
train_data : iterable of (str, str)
Iterable of relations, e.g. a list of tuples, or a Relations instance streaming from a file.
Note that the relations are treated as ordered pairs, i.e. a relation (a, b) does not imply the
opposite relation (b, a). In case the relations are symmetric, the data should contain both relations
(a, b) and (b, a).
"""
self.logger = logger
self.train_data = train_data
self.kv = KeyedVectorsClass()
self.BatchClass = BatchClass
self.dim = dim
self.only_leaves_updated = False
self.train_lr = lr # Learning rate for training
self.lr = lr # Current learning rate
self.opt = opt
assert self.opt in ['rsgd', 'exp_map', 'sgd']
self.burn_in_lr = self.lr / 10 # Learning rate for burn-in
self.burn_in = burn_in
self._burn_in_done = False
self.num_check_gradients = 0 # Check the gradients of the first few batches using autograd
self._loss_grad = None
self.where_not_to_sample = where_not_to_sample
assert self.where_not_to_sample in ['both', 'ancestors', 'children']
self.neg_edges_attach = neg_edges_attach
assert self.neg_edges_attach in ['parent', 'child', 'both']
self.always_v_in_neg = always_v_in_neg
self.num_negative = num_negative
self._neg_sampling_power = neg_sampling_power
assert self._neg_sampling_power >= 0 and self._neg_sampling_power <= 2
self.neg_sampl_strategy = neg_sampl_strategy
assert self.neg_sampl_strategy in ['all', 'true_neg', 'all_non_leaves', 'true_neg_non_leaves']
self._np_rand = np_random.RandomState(seed)
self._load_relations_and_indexes()
self._init_embeddings(init_range)
def _load_relations_and_indexes(self):
"""Load relations from the train data and build vocab."""
self.kv.vocab = {} # word -> Vocab(count, index)
self.kv.index2word = [] # index -> word
self.all_relations = [] # List of all relation pairs
self.adjacent_nodes = defaultdict(set) # Mapping from node index to its neighboring node indices
if '_non_leaves' in self.neg_sampl_strategy:
self.non_leaves_indices_set = set()
self.non_leaves_adjacent_nodes = defaultdict(set)
self.logger.info("Loading relations from train data..")
for relation in self.train_data:
if len(relation) != 2:
raise ValueError('Relation pair "%s" should have exactly two items' % repr(relation))
# Ignore self-edges
assert relation[0] != relation[1]
for item in relation:
if item not in self.kv.vocab:
self.kv.vocab[item] = Vocab(count=1, index=len(self.kv.index2word))
self.kv.index2word.append(item)
# Like in https://github.com/facebookresearch/poincare-embeddings
self.kv.vocab[relation[0]].count += 1
node_1, node_2 = relation # Edge direction : node1 -> node2, swapped in the csv file, but correctly read in PoincareRelations.
node_1_index, node_2_index = self.kv.vocab[node_1].index, self.kv.vocab[node_2].index
if self.where_not_to_sample in ['both', 'children']:
self.adjacent_nodes[node_1_index].add(node_2_index)
if self.where_not_to_sample in ['both', 'ancestors']:
self.adjacent_nodes[node_2_index].add(node_1_index)
if '_non_leaves' in self.neg_sampl_strategy:
self.non_leaves_indices_set.add(node_1_index)
self.all_relations.append((node_1_index, node_2_index))
for node_idx in range(len(self.kv.index2word)):
self.adjacent_nodes[node_idx].add(node_idx) # Do not sample current node
if '_non_leaves' in self.neg_sampl_strategy:
for node_idx in range(len(self.kv.index2word)):
self.non_leaves_adjacent_nodes[node_idx].add(node_idx)
for adj_node_idx in self.adjacent_nodes[node_idx]:
if adj_node_idx in self.non_leaves_indices_set:
self.non_leaves_adjacent_nodes[node_idx].add(adj_node_idx)
self.logger.info("Loaded %d relations from train data, %d nodes",
len(self.all_relations), len(self.kv.vocab))
self.indices_set = set(range(len(self.kv.index2word))) # Set of all node indices
freq_array = np.array([self.kv.vocab[self.kv.index2word[i]].count
for i in range(len(self.kv.index2word))], dtype=np.float64)
unigrams_at_power_array = np.power(freq_array, self._neg_sampling_power)
self._node_probabilities = unigrams_at_power_array / unigrams_at_power_array.sum()
self._node_probabilities_cumsum = np.cumsum(self._node_probabilities)
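# The cumulative distribution built above enables inverse-CDF sampling in `_sample_negatives`:
# drawing u ~ Uniform(0, 1) and taking np.searchsorted(self._node_probabilities_cumsum, u)
# returns node i with probability self._node_probabilities[i], at O(log n) per draw.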
if '_non_leaves' in self.neg_sampl_strategy:
self.non_leaves_indices_array = np.array(list(self.non_leaves_indices_set))
unigrams_at_power_array_non_leaves = unigrams_at_power_array[self.non_leaves_indices_array]
self._node_probabilities_non_leaves = unigrams_at_power_array_non_leaves / \
unigrams_at_power_array_non_leaves.sum()
self._node_probabilities_cumsum_non_leaves = np.cumsum(self._node_probabilities_non_leaves)
def _init_embeddings(self, init_range):
"""Randomly initialize vectors for the items in the vocab."""
shape = (len(self.kv.index2word), self.dim)
self.kv.syn0 = self._np_rand.uniform(init_range[0], init_range[1], shape).astype(np.float64)
def _sample_negatives(self, node_index, connected_node):
"""Return a sample of negatives for the given node.
Parameters
----------
node_index : int
Index of the positive node for which negative samples are to be returned.
connected_node : int
Index of the node connected to `node_index` by the current edge; if `always_v_in_neg` is set,
it is forced into the first negative slot.
Returns
-------
numpy.array
Array of shape (self.num_negative,) containing indices of negative nodes for the given node index.
"""
k = self.num_negative # num negatives
if self.neg_sampl_strategy == 'all' or len(self.adjacent_nodes[node_index]) == len(self.indices_set): # root node
uniform_0_1_numbers = self._np_rand.random_sample(self.num_negative)
negs = list(np.searchsorted(self._node_probabilities_cumsum, uniform_0_1_numbers))
elif self.neg_sampl_strategy == 'all_non_leaves':
uniform_0_1_numbers = self._np_rand.random_sample(self.num_negative)
negs = list(np.searchsorted(self._node_probabilities_cumsum_non_leaves, uniform_0_1_numbers))
negs = self.non_leaves_indices_array[negs]
elif self.neg_sampl_strategy == 'true_neg':
n = len(self.indices_set)
a = len(self.adjacent_nodes[node_index])
gamma = float(n) / (n - a)
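# gamma = n / (n - a) is (roughly) the expected number of draws needed per true negative when
# sampling from the full distribution and rejecting neighbours. Rejection sampling (the else
# branch) therefore costs about O(gamma * k * log n), while materializing the restricted
# distribution (the if branch) costs O(n + k * log n); the test below picks the O(n) branch
# only when gamma * k * log n would exceed n.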
if gamma > n / (max(k,1) * math.log(n)):
# Very expensive branch: O(n + k*log(n)). Should be avoided when possible.
valid_negatives = np.array(list(self.indices_set - self.adjacent_nodes[node_index])) # O(n). node_index itself is excluded, since adjacent_nodes[node_index] contains it.
valid_node_probs = self._node_probabilities[valid_negatives]
valid_node_probs = valid_node_probs / valid_node_probs.sum()
valid_node_cumsum = np.cumsum(valid_node_probs) # O(n)
uniform_0_1_numbers = self._np_rand.random_sample(k)
valid_negative_indices = np.searchsorted(valid_node_cumsum, uniform_0_1_numbers) # O(k * log n)
negs = list(valid_negatives[valid_negative_indices])
else:
# Less expensive branch: O(n / (n-a) * k * log n)
# we sample gamma * k negatives and hope to find at least k true negatives
negatives = []
remain_to_sample = k
while remain_to_sample > 0:
num_to_sample = int(gamma * remain_to_sample)
uniform_0_1_numbers = self._np_rand.random_sample(num_to_sample)
new_potential_negatives =\
np.searchsorted(self._node_probabilities_cumsum, uniform_0_1_numbers) # O(gamma * k * log n)
# time complexity O(gamma * k),
# but len(new_good_negatives) is in expectation (1 - a/n) * len(new_potential_negatives)
new_good_negatives = [x for x in new_potential_negatives
if x not in self.adjacent_nodes[node_index]]
num_new_good_negatives = min(len(new_good_negatives), remain_to_sample)
negatives.extend(new_good_negatives[0 : num_new_good_negatives])
remain_to_sample -= num_new_good_negatives
negs = negatives
elif self.neg_sampl_strategy == 'true_neg_non_leaves':
n = len(self.non_leaves_indices_set)
a = len(self.non_leaves_adjacent_nodes[node_index])
gamma = float(n) / (n - a)
if gamma > n / (max(k,1) * math.log(n)):
# Very expensive branch: O(n + k*log(n)). Should be avoided when possible.
valid_negatives = np.array(list(self.non_leaves_indices_set - self.non_leaves_adjacent_nodes[node_index])) # O(n). node_index itself is excluded, since non_leaves_adjacent_nodes[node_index] contains it.
valid_node_probs = self._node_probabilities_non_leaves[valid_negatives]
valid_node_probs = valid_node_probs / valid_node_probs.sum()
valid_node_cumsum = np.cumsum(valid_node_probs) # O(n)
uniform_0_1_numbers = self._np_rand.random_sample(k)
valid_negative_indices = np.searchsorted(valid_node_cumsum, uniform_0_1_numbers) # O(k * log n)
negs = list(valid_negatives[valid_negative_indices])
else:
# Less expensive branch: O(n / (n-a) * k * log n)
# we sample gamma * k negatives and hope to find at least k true negatives
negatives = []
remain_to_sample = k
while remain_to_sample > 0:
num_to_sample = int(gamma * remain_to_sample)
uniform_0_1_numbers = self._np_rand.random_sample(num_to_sample)
new_potential_negatives =\
np.searchsorted(self._node_probabilities_cumsum_non_leaves, uniform_0_1_numbers) # O(gamma * k * log n)
new_potential_negatives = self.non_leaves_indices_array[new_potential_negatives]
# time complexity O(gamma * k),
# but len(new_good_negatives) is in expectation (1 - a/n) * len(new_potential_negatives)
new_good_negatives = [x for x in new_potential_negatives
if x not in self.non_leaves_adjacent_nodes[node_index]]
num_new_good_negatives = min(len(new_good_negatives), remain_to_sample)
negatives.extend(new_good_negatives[0 : num_new_good_negatives])
remain_to_sample -= num_new_good_negatives
negs = negatives
# Optionally always include the true node v as the first negative in every batch.
if self.always_v_in_neg:
negs[0] = connected_node
return negs
def _prepare_training_batch(self, relations, all_negatives, rels_reversed):
"""Creates training batch and computes gradients and loss for the batch.
Parameters
----------
relations : list of tuples
List of tuples of positive examples of the form (node_1_index, node_2_index).
all_negatives : list of lists
List of lists of negative samples for each node_1 in the positive examples.
rels_reversed : bool
Whether the relations go from u to v, or are reversed (from v to u).
Returns
-------
:class:`~DAGEmbeddingBatch`
Contains node indices, computed gradients and loss for the batch.
"""
batch_size = len(relations)
indices_u, indices_v = [], []
for relation, negatives in zip(relations, all_negatives):
u, v = relation
indices_u.append(u)
indices_v.append(v)
indices_v.extend(negatives)
vectors_u = self.kv.syn0[indices_u].T[np.newaxis, :, :] # (1, dim, batch_size)
vectors_v = self.kv.syn0[indices_v].reshape((batch_size, 1 + self.num_negative, self.dim))
vectors_v = vectors_v.swapaxes(0, 1).swapaxes(1, 2) # (1 + neg, dim, batch_size)
batch = self.BatchClass(vectors_u, vectors_v, indices_u, indices_v, rels_reversed, self)
batch.compute_all()
if self.num_check_gradients < 5:
self.num_check_gradients += 1
self._check_gradients(relations, all_negatives, batch, rels_reversed)
return batch
def _handle_duplicates(self, vector_updates, node_indices):
"""Handles occurrences of multiple updates to the same node in a batch of vector updates.
Parameters
----------
vector_updates : numpy.array
Array with each row containing updates to be performed on a certain node.
node_indices : list
Node indices on which the above updates are to be performed on.
Notes
-----
Mutates the `vector_updates` array.
Required because vectors[[2, 1, 2]] += np.array([-0.5, 1.0, 0.5]) performs only the last update
on the row at index 2.
"""
counts = Counter(node_indices)
for node_index, count in counts.items():
if count == 1 and not (self.only_leaves_updated and node_index in self.non_leaves_indices_set):
continue
positions = [i for i, index in enumerate(node_indices) if index == node_index]
# Move all updates to the same node to the last such update, zeroing all the others
vector_updates[positions[-1]] = vector_updates[positions].sum(axis=0)
vector_updates[positions[:-1]] = 0
if self.only_leaves_updated and node_index in self.non_leaves_indices_set:
vector_updates[positions[-1]] = 0
def _clip_vectors(self, vectors):
"""Clip vectors. E.g. to have a norm of less than one for the hyperbolic setting.
Parameters
----------
vectors : numpy.array
Can be 1-D or 2-D (in which case the norm of each row is checked).
Returns
-------
numpy.array
Array with vectors clipped.
"""
raise NotImplementedError
@staticmethod
def exponential_map(x, v):
"""
Computes the exponential map exp_x(v).
:param x, v: d-dimensional tensors, the last dimension is the embedding dimension
"""
d = x.ndim
lam_x = 2.0 / (1.0 - np.linalg.norm(x, axis=d-1) ** 2) # d-1 dims
norm_v = np.linalg.norm(v, axis=d-1) # d-1 dims
angle = lam_x * norm_v # d-1 dims
cosh = np.cosh(np.minimum(np.maximum(angle, -20), 20)) # d-1 dims
tanh = np.tanh(angle) # d-1 dims
normalized_v = v / np.maximum(norm_v[:, np.newaxis], 1e-6) # d dims
x_dot_norm_v = (x * normalized_v).sum(axis=d-1) # d-1 dims
numerator = (lam_x * (1 + x_dot_norm_v * tanh))[:, np.newaxis] * x + tanh[:, np.newaxis] * normalized_v # d dims
numitor = 1 / cosh + (lam_x - 1) + lam_x * x_dot_norm_v * tanh # d-1 dims
return numerator / (numitor[:, np.newaxis])
def _update_vectors_batch(self, batch):
"""Updates vectors for nodes in the given batch using RSGD.
Parameters
----------
batch : :class:`~hyperbolic.DAGEmbeddingBatch`
Batch containing computed loss gradients and node indices of the batch for which
updates are to be done.
"""
grad_u, grad_v = batch.loss_gradients_u, batch.loss_gradients_v # grad_u: (dim, batch_size) ; grad_v: (1 + neg_size, dim, batch_size)
indices_u, indices_v = batch.indices_u, batch.indices_v
batch_size = len(indices_u)
if self.opt == 'rsgd' or self.opt == 'exp_map':
# batch.one_minus_norms_sq_u: (1, batch_size), grad_u: (dim, batch_size)
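# The factor (1 - ||theta||^2)^2 / 4 rescales the Euclidean gradient into the Riemannian
# gradient of the Poincare ball: the metric there is g_theta = lam_theta^2 * I with
# lam_theta = 2 / (1 - ||theta||^2), so the Riemannian gradient is g_theta^{-1} times the
# Euclidean one.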
u_updates = (self.lr * (batch.one_minus_norms_sq_u ** 2) / 4 * grad_u).T # (batch_size, dim)
self._handle_duplicates(u_updates, indices_u)
# batch.one_minus_norms_sq_v: (1 + neg_size, batch_size) , grad_v: (1 + neg_size, dim, batch_size)
v_updates = self.lr * (batch.one_minus_norms_sq_v ** 2)[:, np.newaxis] / 4 * grad_v
v_updates = v_updates.swapaxes(1, 2).swapaxes(0, 1) # (batch_size, 1 + neg_size, dim)
v_updates = v_updates.reshape(((1 + self.num_negative) * batch_size, self.dim))
self._handle_duplicates(v_updates, indices_v)
if self.opt == 'rsgd':
self.kv.syn0[indices_u] -= u_updates
self.kv.syn0[indices_v] -= v_updates
elif self.opt == 'exp_map':
self.kv.syn0[indices_u] = self.exponential_map(self.kv.syn0[indices_u], -u_updates)
self.kv.syn0[indices_v] = self.exponential_map(self.kv.syn0[indices_v], -v_updates)
elif self.opt == 'sgd':
u_updates = (self.lr * grad_u).T # (batch_size, dim)
self._handle_duplicates(u_updates, indices_u)
v_updates = self.lr * grad_v
v_updates = v_updates.swapaxes(1, 2).swapaxes(0, 1) # (batch_size, 1 + neg_size, dim)
v_updates = v_updates.reshape(((1 + self.num_negative) * batch_size, self.dim))
self._handle_duplicates(v_updates, indices_v)
self.kv.syn0[indices_u] -= u_updates
self.kv.syn0[indices_v] -= v_updates
self.kv.syn0[indices_u] = self._clip_vectors(self.kv.syn0[indices_u])
self.kv.syn0[indices_v] = self._clip_vectors(self.kv.syn0[indices_v])
def _train_on_batch(self, relations):
"""Performs training for a single training batch.
Parameters
----------
relations : list of tuples
List of tuples of positive examples of the form (node_1_index, node_2_index).
Returns
-------
loss value
"""
# Sample negatives in a batch: list of lists of negative node indices
loss = 0.0
pos_loss = 0.0
neg_loss = 0.0
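# Note: with neg_edges_attach == 'both', both blocks below run, so every positive edge is
# trained on twice per call; the (u, v) -> (v, u) swap in the first block then also carries
# over to the second block.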
if self.neg_edges_attach in ['child', 'both']:
relations = [(v, u) for (u, v) in relations] # Swap so the sampled negatives replace u, forming corrupted edges (u', v)
all_negatives = [self._sample_negatives(relation[0], relation[1]) for relation in relations]
batch = self._prepare_training_batch(relations, all_negatives, rels_reversed=True)
self._update_vectors_batch(batch)
loss += batch.loss
pos_loss += batch.pos_loss
neg_loss += batch.neg_loss
if self.neg_edges_attach in ['parent', 'both']:
all_negatives = [self._sample_negatives(relation[0], relation[1]) for relation in relations]
batch = self._prepare_training_batch(relations, all_negatives, rels_reversed=False)
self._update_vectors_batch(batch)
loss += batch.loss
pos_loss += batch.pos_loss
neg_loss += batch.neg_loss
return loss, pos_loss, neg_loss
def _train_batchwise(self, epochs, batch_size, print_every=1):
"""Trains embeddings using specified parameters.
Parameters
----------
epochs : int
Number of iterations (epochs) over the corpus.
batch_size : int
Number of examples to train on in a single batch.
print_every : int, optional
Prints progress and average loss after every `print_every` epochs.
"""
avg_loss = 0.0
avg_pos_loss = 0.0
avg_neg_loss = 0.0
for epoch in range(1, epochs + 1):
last_time = time.time()
relation_indices = list(range(len(self.all_relations)))
self._np_rand.shuffle(relation_indices)
avg_loss = 0.0
avg_pos_loss = 0.0
avg_neg_loss = 0.0
for batch_num, i in enumerate(range(0, len(relation_indices), batch_size), start=1):
batch_indices = relation_indices[i:i + batch_size]
relations_batch = [self.all_relations[idx] for idx in batch_indices]
# Train on this mini-batch
l, pl, nl = self._train_on_batch(relations_batch)
avg_loss += l
avg_pos_loss += pl
avg_neg_loss += nl
avg_loss *= float(batch_size) / len(relation_indices)
avg_pos_loss *= float(batch_size) / len(relation_indices)
avg_neg_loss *= float(batch_size) / len(relation_indices)
if epoch % print_every == 0:
self.logger.info('Epoch %d: LOSS: %.2f, POS LOSS: %.2f, NEG LOSS: %.2f' %
(epoch, avg_loss, avg_pos_loss, avg_neg_loss))
time_taken = time.time() - last_time
speed = len(relation_indices) / time_taken
self.logger.info(
'Time taken for %d examples: %.2f s, %.2f examples / s'
% (len(relation_indices), time_taken, speed))
return avg_loss, avg_pos_loss, avg_neg_loss
def train(self, epochs, batch_size, print_every=1):
"""Trains Poincare embeddings using loaded data and model parameters.
Parameters
----------
epochs : int
Number of iterations (epochs) over the corpus.
batch_size : int
Number of examples to train on in a single batch.
print_every : int, optional
Prints progress and average loss after every `print_every` epochs.
"""
# Some divide-by-zero results are handled explicitly
old_settings = np.seterr(divide='ignore', invalid='ignore')
if self.burn_in > 0 and not self._burn_in_done:
self.logger.info("Starting burn-in (%d epochs)-------------------------------", self.burn_in)
self.lr = self.burn_in_lr
self._train_batchwise(
epochs=self.burn_in, batch_size=batch_size, print_every=print_every)
self._burn_in_done = True
freq_array = np.ones(len(self.kv.index2word), dtype=np.float64)
unigrams_at_power_array = freq_array
self._node_probabilities = unigrams_at_power_array / unigrams_at_power_array.sum()
self._node_probabilities_cumsum = np.cumsum(self._node_probabilities)
self._neg_sampling_power = 0.0 # Switch to uniform negative sampling after burn-in
self.logger.info("Burn-in finished")
self.lr = self.train_lr
self.logger.info("Starting training (%d epochs)----------------------------------------", epochs)
avg_loss, avg_pos_loss, avg_neg_loss = self._train_batchwise(
epochs=epochs, batch_size=batch_size, print_every=print_every)
self.logger.info("Training finished")
np.seterr(**old_settings)
return avg_loss, avg_pos_loss, avg_neg_loss
def save(self, *args, **kwargs):
"""Save complete model to disk, inherited from :class:`gensim.utils.SaveLoad`."""
self._loss_grad = None # Can't pickle autograd fn to disk
super(DAGEmbeddingModel, self).save(*args, **kwargs)
@classmethod
def load(cls, *args, **kwargs):
"""Load model from disk, inherited from :class:`~gensim.utils.SaveLoad`."""
model = super(DAGEmbeddingModel, cls).load(*args, **kwargs)
return model
def _check_gradients(self, relations, all_negatives, batch, rels_reversed, tol=1e-6):
"""Compare computed gradients for batch to autograd gradients.
Parameters
----------
batch : DAGEmbeddingBatch instance
Batch for which the computed gradients are to be checked.
relations : list of tuples
List of tuples of positive examples of the form (node_1_index, node_2_index).
all_negatives : list of lists
List of lists of negative samples for each node_1 in the positive examples.
"""
if not AUTOGRAD_PRESENT:
self.logger.warning('autograd could not be imported, cannot do gradient checking')
self.logger.warning('please install autograd to enable gradient checking')
return
if self._loss_grad is None:
self._loss_grad = grad(self._loss_fn)
max_diff = 0.0
for i, (relation, negatives) in enumerate(zip(relations, all_negatives)):
u, v = relation
auto_gradients = self._loss_grad(
np.vstack((self.kv.syn0[u], self.kv.syn0[[v] + negatives])),
rels_reversed
)
computed_gradients = np.vstack((batch.loss_gradients_u[:, i], batch.loss_gradients_v[:, :, i])) # (2 + neg_size, dim)
diff = np.abs(auto_gradients - computed_gradients).max()
if diff > max_diff:
max_diff = diff
self.logger.info('Max difference between computed gradients and autograd gradients: %.10f. Tol = %.10f' % (max_diff, tol))
assert max_diff < tol, (
'Max difference between computed gradients and autograd gradients %.10f, '
'greater than tolerance %.10f' % (max_diff, tol))
def _loss_fn(self, matrix, rels_reversed):
"""Given a numpy array with vectors for u, v and negative samples, computes loss value.
Parameters
----------
matrix : numpy.array
Array containing vectors for u, v and negative samples, of shape (2 + negative_size, dim).
rels_reversed : bool
Whether the relation direction is reversed (from v to u instead of u to v).
Returns
-------
float
Computed loss value.
Warnings
--------
Only used for autograd gradients, since autograd requires a specific function signature.
"""
raise NotImplementedError