def most_similar_to_given(self, w1, word_list):
    """Return the word from word_list most similar to w1.

    Each candidate is scored with ``self.similarity(w1, candidate)`` and the
    candidate with the highest score is returned.

    Args:
        w1 (str): a word
        word_list (iterable): candidate words to compare against w1. Any
            iterable is accepted (list, tuple, set, generator, ...); it is
            consumed exactly once.

    Returns:
        the word in word_list with the highest similarity to w1. If several
        candidates share the highest score, the first one in iteration order
        is returned.

    Raises:
        KeyError: If w1 or any word in word_list is not in the vocabulary
        ValueError: If word_list is empty

    Example::

        >>> trained_model.most_similar_to_given('music', ['water', 'sound', 'backpack', 'mouse'])
        'sound'

        >>> trained_model.most_similar_to_given('snake', ['food', 'pencil', 'animal', 'phone'])
        'animal'

    """
    # Materialize once: the candidates are iterated for scoring and then
    # indexed by position, which a generator or set cannot support directly.
    candidates = list(word_list)
    if not candidates:
        # argmax on an empty sequence raises an opaque numpy error; fail with
        # the same exception type but a message that names the real problem.
        raise ValueError("word_list must contain at least one word")
    scores = [self.similarity(w1, word) for word in candidates]
    return candidates[argmax(scores)]