diff --git a/docs/notebooks/keras_wrapper.ipynb b/docs/notebooks/keras_wrapper.ipynb
index 88d24af681..74ce50f001 100644
--- a/docs/notebooks/keras_wrapper.ipynb
+++ b/docs/notebooks/keras_wrapper.ipynb
@@ -19,7 +19,7 @@
    "metadata": {},
    "source": [
     "The wrappers available (as of now) are :\n",
-    "* Word2Vec (uses the function ```get_embedding_layer``` defined in ```gensim.models.keyedvectors```)"
+    "* Word2Vec (uses the function ```get_keras_embedding``` defined in ```gensim.models.keyedvectors```)"
    ]
   },
   {
@@ -38,20 +38,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 1,
    "metadata": {
-    "collapsed": false,
     "scrolled": true
    },
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Using TensorFlow backend.\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "from gensim.models import word2vec"
    ]
   },
@@ -65,10 +56,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {
-    "collapsed": true
-   },
+   "execution_count": 2,
+   "metadata": {},
    "outputs": [],
    "source": [
     "sentences = [\n",
@@ -93,19 +82,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "WARNING:gensim.models.word2vec:under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay\n"
-     ]
-    }
-   ],
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
    "source": [
     "model = word2vec.Word2Vec(sentences, size=100, min_count=1, hs=1)"
    ]
   },
@@ -128,11 +107,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [],
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Using TensorFlow backend.\n"
+     ]
+    }
+   ],
    "source": [
     "import numpy as np\n",
     "from keras.engine import Input\n",
@@ -144,19 +129,17 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "We would use the layer returned by the function `get_embedding_layer` in the Keras model."
+    "We use the layer returned by the function `get_keras_embedding` in the Keras model."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
-   "metadata": {
-    "collapsed": false
-   },
+   "execution_count": 5,
+   "metadata": {},
    "outputs": [],
    "source": [
     "wv = model.wv\n",
-    "embedding_layer = wv.get_embedding_layer()"
+    "embedding_layer = wv.get_keras_embedding()"
    ]
   },
   {
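The hunks above rename `get_embedding_layer` to `get_keras_embedding` throughout the tutorial's setup cells. For reference, a minimal sketch of the workflow those cells walk through; the toy `sentences` below are placeholder data, not the notebook's corpus:

```python
# Minimal sketch of the renamed API; the sentences are assumed toy data.
from gensim.models import word2vec

sentences = [
    ['human', 'interface', 'computer'],
    ['survey', 'user', 'computer', 'system', 'response', 'time'],
    ['graph', 'minors', 'survey'],
]

# Same hyperparameters as the notebook cell above.
model = word2vec.Word2Vec(sentences, size=100, min_count=1, hs=1)

# get_keras_embedding() (formerly get_embedding_layer) returns a
# keras.layers.Embedding initialised with the learned word vectors;
# train_embeddings defaults to False, so the weights stay frozen.
embedding_layer = model.wv.get_keras_embedding()
```

The `train_embeddings` flag comes up again with the classifier results and the `keyedvectors.py` hunk further down.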
] }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "collapsed": false - }, + "execution_count": 5, + "metadata": {}, "outputs": [], "source": [ "wv = model.wv\n", - "embedding_layer = wv.get_embedding_layer()" + "embedding_layer = wv.get_keras_embedding()" ] }, { @@ -168,16 +151,15 @@ }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "collapsed": false - }, + "execution_count": 6, + "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/home/chinmaya13/anaconda/lib/python2.7/site-packages/IPython/kernel/__main__.py:7: UserWarning: Update your `Model` call to the Keras 2 API: `Model(outputs=Tensor(\"do..., inputs=[" + "" ] }, - "execution_count": 14, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -451,10 +413,8 @@ }, { "cell_type": "code", - "execution_count": 20, - "metadata": { - "collapsed": false - }, + "execution_count": 13, + "metadata": {}, "outputs": [], "source": [ "from keras.models import Sequential\n", @@ -479,10 +439,8 @@ }, { "cell_type": "code", - "execution_count": 21, - "metadata": { - "collapsed": true - }, + "execution_count": 14, + "metadata": {}, "outputs": [], "source": [ "# global variables\n", @@ -562,19 +520,9 @@ }, { "cell_type": "code", - "execution_count": 22, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING:gensim.models.word2vec:under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay\n" - ] - } - ], + "execution_count": 15, + "metadata": {}, + "outputs": [], "source": [ "# we are training our Word2Vec model here\n", "w2v_training_data_path = os.path.join(os.getcwd(), 'datasets/word_vectors_training_data.txt')\n", @@ -596,10 +544,8 @@ }, { "cell_type": "code", - "execution_count": 23, - "metadata": { - "collapsed": false - }, + "execution_count": 16, + "metadata": {}, "outputs": [], "source": [ "trainclassdict = subjectkeywords()\n", @@ -616,14 +562,12 @@ }, { "cell_type": "code", - "execution_count": 24, - "metadata": { - "collapsed": false - }, + "execution_count": 17, + "metadata": {}, "outputs": [], "source": [ "# get embedding layer corresponding to our trained Word2Vec model\n", - "embedding_layer = w2v_model_wv.get_embedding_layer()\n", + "embedding_layer = w2v_model_wv.get_keras_embedding()\n", "\n", "# create a convnet to solve our classification task\n", "sequence_input = Input(shape=(maxlen,), dtype='int32')\n", @@ -643,35 +587,33 @@ }, { "cell_type": "code", - "execution_count": 25, - "metadata": { - "collapsed": false - }, + "execution_count": 18, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Epoch 1/10\n", - "45/45 [==============================] - 0s - loss: 1.1154 - acc: 0.2222 \n", + "45/45 [==============================] - 0s - loss: 1.1035 - acc: 0.2000 \n", "Epoch 2/10\n", - "45/45 [==============================] - 0s - loss: 1.0949 - acc: 0.3333 \n", + "45/45 [==============================] - 0s - loss: 1.0988 - acc: 0.3333 \n", "Epoch 3/10\n", - "45/45 [==============================] - 0s - loss: 1.0426 - acc: 0.8667 \n", + "45/45 [==============================] - 0s - loss: 1.0972 - acc: 0.3333 \n", "Epoch 4/10\n", - "45/45 [==============================] - 0s - loss: 0.8931 - acc: 0.9556 \n", + "45/45 [==============================] - 0s - loss: 1.0948 - acc: 0.6444 \n", "Epoch 5/10\n", - "45/45 [==============================] - 0s - loss: 0.6967 - acc: 0.9778 \n", + "45/45 
@@ -643,35 +587,33 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
-   "metadata": {
-    "collapsed": false
-   },
+   "execution_count": 18,
+   "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "Epoch 1/10\n",
-      "45/45 [==============================] - 0s - loss: 1.1154 - acc: 0.2222 \n",
+      "45/45 [==============================] - 0s - loss: 1.1035 - acc: 0.2000 \n",
       "Epoch 2/10\n",
-      "45/45 [==============================] - 0s - loss: 1.0949 - acc: 0.3333 \n",
+      "45/45 [==============================] - 0s - loss: 1.0988 - acc: 0.3333 \n",
       "Epoch 3/10\n",
-      "45/45 [==============================] - 0s - loss: 1.0426 - acc: 0.8667 \n",
+      "45/45 [==============================] - 0s - loss: 1.0972 - acc: 0.3333 \n",
       "Epoch 4/10\n",
-      "45/45 [==============================] - 0s - loss: 0.8931 - acc: 0.9556 \n",
+      "45/45 [==============================] - 0s - loss: 1.0948 - acc: 0.6444 \n",
       "Epoch 5/10\n",
-      "45/45 [==============================] - 0s - loss: 0.6967 - acc: 0.9778 \n",
+      "45/45 [==============================] - 0s - loss: 1.0938 - acc: 0.5778 \n",
       "Epoch 6/10\n",
-      "45/45 [==============================] - 0s - loss: 0.4727 - acc: 0.9556 \n",
+      "45/45 [==============================] - 0s - loss: 1.0936 - acc: 0.5778 \n",
       "Epoch 7/10\n",
-      "45/45 [==============================] - 0s - loss: 0.2991 - acc: 0.9778 \n",
+      "45/45 [==============================] - 0s - loss: 1.0900 - acc: 0.5111 \n",
       "Epoch 8/10\n",
-      "45/45 [==============================] - 0s - loss: 0.1795 - acc: 0.9778 \n",
+      "45/45 [==============================] - 0s - loss: 1.0879 - acc: 0.5111 \n",
       "Epoch 9/10\n",
-      "45/45 [==============================] - 0s - loss: 0.1218 - acc: 0.9778 \n",
+      "45/45 [==============================] - 0s - loss: 1.0856 - acc: 0.5778 \n",
       "Epoch 10/10\n",
-      "45/45 [==============================] - 0s - loss: 0.0889 - acc: 0.9778 \n"
+      "45/45 [==============================] - 0s - loss: 1.0834 - acc: 0.5556 \n"
      ]
     }
    ],
@@ -698,16 +640,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 26,
-   "metadata": {
-    "collapsed": false
-   },
+   "execution_count": 19,
+   "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "{'mathematics': 0.96289372, 'physics': 0.025273025, 'theology': 0.011833278}\n"
+      "{'mathematics': 0.33123398, 'physics': 0.34042257, 'theology': 0.32834342}\n"
      ]
     }
    ],
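Worth flagging in the re-run outputs above: training accuracy now plateaus around 0.55 and the predicted probabilities are nearly uniform, where the earlier run reached ~0.98 accuracy and a confident prediction. A plausible contributor is that `get_keras_embedding()` defaults to `train_embeddings=False`, so the word vectors stay frozen while the classifier trains. The trainable variant is a one-argument change:

```python
# train_embeddings is part of the method signature (see the keyedvectors.py
# hunk below); True makes the returned Embedding layer trainable, letting
# backprop fine-tune the word vectors for the classification task.
embedding_layer = w2v_model_wv.get_keras_embedding(train_embeddings=True)
```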
@@ -733,15 +673,6 @@
-    "The result above clearly suggests (~ 98% probability!) that the input `artificial intelligence` should belong to the category `mathematics`, which conforms very well with the expected output in this case.\n",
+    "In this re-run the predicted probabilities are close to uniform (~33% per class), so the classifier no longer clearly favours `mathematics` for the input `artificial intelligence`; note that `get_keras_embedding()` keeps the word vectors frozen by default (`train_embeddings=False`), which can limit how well this small model fits.\n",
     "In general, the output could depend on several factors including the number of filters for the conv-net, the training data for the word-vectors, the training data for the classifier etc."
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
@@ -764,5 +695,5 @@
   }
  },
  "nbformat": 4,
- "nbformat_minor": 0
+ "nbformat_minor": 1
 }
diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py
index 75cc11ee69..8711322124 100644
--- a/gensim/models/keyedvectors.py
+++ b/gensim/models/keyedvectors.py
@@ -78,11 +78,6 @@
 from six import string_types, iteritems
 from six.moves import xrange
 from scipy import stats
 
-try:
-    from keras.layers import Embedding
-    KERAS_INSTALLED = True
-except ImportError:
-    KERAS_INSTALLED = False
 
 logger = logging.getLogger(__name__)
@@ -833,11 +828,13 @@ def init_sims(self, replace=False):
         else:
             self.syn0norm = (self.syn0 / sqrt((self.syn0 ** 2).sum(-1))[..., newaxis]).astype(REAL)
 
-    def get_embedding_layer(self, train_embeddings=False):
+    def get_keras_embedding(self, train_embeddings=False):
         """
         Return a Keras 'Embedding' layer with weights set as the Word2Vec model's learned word embeddings
         """
-        if not KERAS_INSTALLED:
+        try:
+            from keras.layers import Embedding
+        except ImportError:
             raise ImportError("Please install Keras to use this function")
 
         weights = self.syn0
diff --git a/gensim/test/test_keras_integration.py b/gensim/test/test_keras_integration.py
index 996f1b7f7b..41bc74a967 100644
--- a/gensim/test/test_keras_integration.py
+++ b/gensim/test/test_keras_integration.py
@@ -66,7 +66,7 @@ def testEmbeddingLayerCosineSim(self):
         keras_w2v_model = self.model_cos_sim
         keras_w2v_model_wv = keras_w2v_model.wv
 
-        embedding_layer = keras_w2v_model_wv.get_embedding_layer()
+        embedding_layer = keras_w2v_model_wv.get_keras_embedding()
 
         input_a = Input(shape=(1,), dtype='int32', name='input_a')
         input_b = Input(shape=(1,), dtype='int32', name='input_b')
@@ -135,7 +135,7 @@ def testEmbeddingLayer20NewsGroup(self):
         keras_w2v.build_vocab(texts_w2v)
         keras_w2v.train(texts, total_examples=keras_w2v.corpus_count, epochs=keras_w2v.iter)
         keras_w2v_wv = keras_w2v.wv
-        embedding_layer = keras_w2v_wv.get_embedding_layer()
+        embedding_layer = keras_w2v_wv.get_keras_embedding()
 
         # create a 1D convnet to solve our classification task
         sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
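Beyond the rename, the `keyedvectors.py` hunks move the Keras import from module level into the method body, so importing gensim no longer pulls in Keras (and its backend) as a side effect. A minimal sketch of the resulting behaviour on a machine without Keras installed:

```python
# With the deferred import, gensim itself loads fine without Keras;
# the dependency is only checked when get_keras_embedding() is called.
from gensim.models import word2vec  # no Keras import happens here

model = word2vec.Word2Vec([['hello', 'world', 'hello', 'keras']], min_count=1)
try:
    layer = model.wv.get_keras_embedding()
except ImportError as err:
    print(err)  # "Please install Keras to use this function"
```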