From 3215ea4ebe58224b044cf4c1d62cdc35f8070169 Mon Sep 17 00:00:00 2001 From: Loreto Parisi Date: Tue, 20 Dec 2016 20:02:57 +0100 Subject: [PATCH 01/11] added Word2vec to Tensorflow 2D tensor file --- gensim/scripts/word2vec2tensor.py | 70 +++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 gensim/scripts/word2vec2tensor.py diff --git a/gensim/scripts/word2vec2tensor.py b/gensim/scripts/word2vec2tensor.py new file mode 100644 index 0000000000..fdc69fe55a --- /dev/null +++ b/gensim/scripts/word2vec2tensor.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (C) 2016 Loreto Parisi +# Copyright (C) 2016 Silvio Ogliastri +# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html + +""" +USAGE: $ python -m gensim.scripts.word2vec2tensor --input --output +Where: + : Input Word2Vec model + : 2D tensor TSV output file name prefix +Output: + The script will create two TSV files. A 2d tensor format file, and a Word Embedding metadata file. 
Both files will + us the --output file name as prefix +This script is used to convert the word2vec format to Tensorflow 2D tensor and metadata formats for Embedding Visualization +For more information about TensorBoard format see: https://www.tensorflow.org/versions/master/how_tos/embedding_viz/ +""" + +import os +import sys +import random +import logging +import argparse + +import gensim + +logger = logging.getLogger(__name__) + +''' + Convert Word2Vec mode to 2D tensor TSV file and metadata file + @word2vec_model_path word2vec model + @tensor_filename tensor filename prefix +''' +def word2vec2tensor(word2vec_model_path,tensor_filename): + + model = gensim.models.Word2Vec.load_word2vec_format(word2vec_model_path, binary=True) + outfiletsv = tensor_filename + '_tensor.tsv' + outfiletsvmeta = tensor_filename + '_metadata.tsv' + + with open(outfiletsv, 'w+') as file_vector: + with open(outfiletsvmeta, 'w+') as file_metadata: + for word in model.index2word: + file_metadata.write(word.encode('utf-8') + '\n') + vector_row = '\t'.join(map(str, model[word])) + file_vector.write(vector_row + '\n') + +if __name__ == "__main__": + logging.basicConfig(format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', level=logging.INFO) + logging.root.setLevel(level=logging.INFO) + logger.info("running %s", ' '.join(sys.argv)) + + # check and process cmdline input + program = os.path.basename(sys.argv[0]) + if len(sys.argv) < 2: + print(globals()['__doc__'] % locals()) + sys.exit(1) + + parser = argparse.ArgumentParser() + parser.add_argument( + "-i", "--input", required=True, + help="Input word2vec model") + parser.add_argument( + "-o", "--output", required=True, + help="Output tensor file name prefix") + args = parser.parse_args() + + word2vec2tensor(args.input, args.output) + + logger.info("finished running %s", program) \ No newline at end of file From 23fac4062391090cb05b04699bb7e299dd8cce1b Mon Sep 17 00:00:00 2001 From: Loreto Parisi Date: Wed, 21 Dec 2016 09:43:24 
+0100 Subject: [PATCH 02/11] Update word2vec2tensor.py --- gensim/scripts/word2vec2tensor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gensim/scripts/word2vec2tensor.py b/gensim/scripts/word2vec2tensor.py index fdc69fe55a..b242d51e58 100644 --- a/gensim/scripts/word2vec2tensor.py +++ b/gensim/scripts/word2vec2tensor.py @@ -3,7 +3,7 @@ # # Copyright (C) 2016 Loreto Parisi # Copyright (C) 2016 Silvio Ogliastri -# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html +# Copyright (C) 2016 Radim Rehurek """ USAGE: $ python -m gensim.scripts.word2vec2tensor --input --output @@ -67,4 +67,4 @@ def word2vec2tensor(word2vec_model_path,tensor_filename): word2vec2tensor(args.input, args.output) - logger.info("finished running %s", program) \ No newline at end of file + logger.info("finished running %s", program) From d24406aa078c3357686d54e51d6b15715f2ed53c Mon Sep 17 00:00:00 2001 From: Loreto Parisi Date: Wed, 21 Dec 2016 09:44:43 +0100 Subject: [PATCH 03/11] Update word2vec2tensor.py --- gensim/scripts/word2vec2tensor.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/gensim/scripts/word2vec2tensor.py b/gensim/scripts/word2vec2tensor.py index b242d51e58..34d3056bd5 100644 --- a/gensim/scripts/word2vec2tensor.py +++ b/gensim/scripts/word2vec2tensor.py @@ -27,13 +27,12 @@ logger = logging.getLogger(__name__) -''' +def word2vec2tensor(word2vec_model_path,tensor_filename): + ''' Convert Word2Vec mode to 2D tensor TSV file and metadata file @word2vec_model_path word2vec model @tensor_filename tensor filename prefix -''' -def word2vec2tensor(word2vec_model_path,tensor_filename): - + ''' model = gensim.models.Word2Vec.load_word2vec_format(word2vec_model_path, binary=True) outfiletsv = tensor_filename + '_tensor.tsv' outfiletsvmeta = tensor_filename + '_metadata.tsv' From 77a96c0b0ddc2da38cd262d9ee67ec6f4e177b1f Mon Sep 17 00:00:00 2001 From: Loreto Parisi Date: Wed, 21 Dec 2016 09:50:13 +0100 Subject: [PATCH 
04/11] instructions how to load TSV files in projector --- gensim/scripts/word2vec2tensor.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/gensim/scripts/word2vec2tensor.py b/gensim/scripts/word2vec2tensor.py index 34d3056bd5..a4f76d7188 100644 --- a/gensim/scripts/word2vec2tensor.py +++ b/gensim/scripts/word2vec2tensor.py @@ -14,7 +14,15 @@ The script will create two TSV files. A 2d tensor format file, and a Word Embedding metadata file. Both files will us the --output file name as prefix This script is used to convert the word2vec format to Tensorflow 2D tensor and metadata formats for Embedding Visualization -For more information about TensorBoard format see: https://www.tensorflow.org/versions/master/how_tos/embedding_viz/ +To use the generated TSV 2D tensor and metadata file in the Projector Visualizer, please +1) Open http://projector.tensorflow.org/. +2) Choose "Load Data" from the left menu. +3) Select "Choose file" in "Load a TSV file of vectors." and choose your local "_tensor.tsv" file +4) Select "Choose file" in "Load a TSV file of metadata." 
and choose your local "_metadata.tsv" file + +For more information about TensorBoard TSV format please visit: +https://www.tensorflow.org/versions/master/how_tos/embedding_viz/ + """ import os From 103d0d63808c0e2027e9e45d8994a2cd8bce08ad Mon Sep 17 00:00:00 2001 From: Loreto Parisi Date: Wed, 21 Dec 2016 09:51:59 +0100 Subject: [PATCH 05/11] Update word2vec2tensor.py --- gensim/scripts/word2vec2tensor.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/gensim/scripts/word2vec2tensor.py b/gensim/scripts/word2vec2tensor.py index a4f76d7188..73219c4827 100644 --- a/gensim/scripts/word2vec2tensor.py +++ b/gensim/scripts/word2vec2tensor.py @@ -51,6 +51,9 @@ def word2vec2tensor(word2vec_model_path,tensor_filename): file_metadata.write(word.encode('utf-8') + '\n') vector_row = '\t'.join(map(str, model[word])) file_vector.write(vector_row + '\n') + + logger.info("2D tensor file saved to %s" % outfiletsv) + logger.info("Tensor metadata file saved to %s" % outfiletsvmeta) if __name__ == "__main__": logging.basicConfig(format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', level=logging.INFO) From fa7dbad3508cc8aec83882d5e03102a8af226bff Mon Sep 17 00:00:00 2001 From: Loreto Parisi Date: Wed, 21 Dec 2016 09:56:09 +0100 Subject: [PATCH 06/11] Update word2vec2tensor.py --- gensim/scripts/word2vec2tensor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/scripts/word2vec2tensor.py b/gensim/scripts/word2vec2tensor.py index 73219c4827..574908c26a 100644 --- a/gensim/scripts/word2vec2tensor.py +++ b/gensim/scripts/word2vec2tensor.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # # Copyright (C) 2016 Loreto Parisi -# Copyright (C) 2016 Silvio Ogliastri +# Copyright (C) 2016 Silvio Olivastri # Copyright (C) 2016 Radim Rehurek """ From 0bf3b85633134e8e6bf6b8a9a9ee13691b21886c Mon Sep 17 00:00:00 2001 From: Loreto Parisi Date: Wed, 21 Dec 2016 15:40:10 +0100 Subject: [PATCH 07/11] model.wv.index2word to generate tensor --- 
gensim/scripts/word2vec2tensor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/scripts/word2vec2tensor.py b/gensim/scripts/word2vec2tensor.py index 574908c26a..160dc20a68 100644 --- a/gensim/scripts/word2vec2tensor.py +++ b/gensim/scripts/word2vec2tensor.py @@ -47,7 +47,7 @@ def word2vec2tensor(word2vec_model_path,tensor_filename): with open(outfiletsv, 'w+') as file_vector: with open(outfiletsvmeta, 'w+') as file_metadata: - for word in model.index2word: + for word in model.wv.index2word: file_metadata.write(word.encode('utf-8') + '\n') vector_row = '\t'.join(map(str, model[word])) file_vector.write(vector_row + '\n') From 96d3a584e2d1f104974d76b5ec3116ec7898bcf2 Mon Sep 17 00:00:00 2001 From: Loreto Parisi Date: Wed, 21 Dec 2016 15:42:02 +0100 Subject: [PATCH 08/11] Update word2vec2tensor.py --- gensim/scripts/word2vec2tensor.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/gensim/scripts/word2vec2tensor.py b/gensim/scripts/word2vec2tensor.py index 160dc20a68..030c1836a8 100644 --- a/gensim/scripts/word2vec2tensor.py +++ b/gensim/scripts/word2vec2tensor.py @@ -37,9 +37,10 @@ def word2vec2tensor(word2vec_model_path,tensor_filename): ''' - Convert Word2Vec mode to 2D tensor TSV file and metadata file - @word2vec_model_path word2vec model - @tensor_filename tensor filename prefix + Convert Word2Vec mode to 2D tensor TSV file and metadata file + Args: + param1 (str): word2vec model file path + param2 (str): filename prefix ''' model = gensim.models.Word2Vec.load_word2vec_format(word2vec_model_path, binary=True) outfiletsv = tensor_filename + '_tensor.tsv' From 15ab0e56a84ef58292e818fc3145abc894c13284 Mon Sep 17 00:00:00 2001 From: Loreto Parisi Date: Wed, 21 Dec 2016 21:09:18 +0100 Subject: [PATCH 09/11] Update word2vec2tensor.py --- gensim/scripts/word2vec2tensor.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/gensim/scripts/word2vec2tensor.py b/gensim/scripts/word2vec2tensor.py 
index 030c1836a8..13af0d5ebc 100644 --- a/gensim/scripts/word2vec2tensor.py +++ b/gensim/scripts/word2vec2tensor.py @@ -35,14 +35,15 @@ logger = logging.getLogger(__name__) -def word2vec2tensor(word2vec_model_path,tensor_filename): +def word2vec2tensor(word2vec_model_path,tensor_filename, binary=False): ''' Convert Word2Vec mode to 2D tensor TSV file and metadata file Args: param1 (str): word2vec model file path param2 (str): filename prefix + param3 (bool): set True to use a binary Word2Vec model, defaults to False ''' - model = gensim.models.Word2Vec.load_word2vec_format(word2vec_model_path, binary=True) + model = gensim.models.Word2Vec.load_word2vec_format(word2vec_model_path, binary=binary) outfiletsv = tensor_filename + '_tensor.tsv' outfiletsvmeta = tensor_filename + '_metadata.tsv' @@ -74,8 +75,11 @@ def word2vec2tensor(word2vec_model_path,tensor_filename): parser.add_argument( "-o", "--output", required=True, help="Output tensor file name prefix") + parser.add_argument( "-b", "--binary", + required=False, + help="If word2vec model in binary format, set True, else False") args = parser.parse_args() - word2vec2tensor(args.input, args.output) + word2vec2tensor(args.input, args.output, args.binary) logger.info("finished running %s", program) From 6007c29896125a24678b5d141eb498d86737258f Mon Sep 17 00:00:00 2001 From: Loreto Parisi Date: Wed, 21 Dec 2016 21:13:58 +0100 Subject: [PATCH 10/11] updated docs for new command line arguments --- gensim/scripts/word2vec2tensor.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gensim/scripts/word2vec2tensor.py b/gensim/scripts/word2vec2tensor.py index 13af0d5ebc..883de12298 100644 --- a/gensim/scripts/word2vec2tensor.py +++ b/gensim/scripts/word2vec2tensor.py @@ -6,10 +6,11 @@ # Copyright (C) 2016 Radim Rehurek """ -USAGE: $ python -m gensim.scripts.word2vec2tensor --input --output +USAGE: $ python -m gensim.scripts.word2vec2tensor --input --output [--binary] Where: : Input Word2Vec model : 2D tensor 
TSV output file name prefix + : Set True if Word2Vec model is binary. Defaults to False. Output: The script will create two TSV files. A 2d tensor format file, and a Word Embedding metadata file. Both files will us the --output file name as prefix From cc9fb70576864b87971beb3a1ba036b3ab1256ec Mon Sep 17 00:00:00 2001 From: Loreto Parisi Date: Wed, 21 Dec 2016 21:17:10 +0100 Subject: [PATCH 11/11] Update word2vec2tensor.py --- gensim/scripts/word2vec2tensor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/scripts/word2vec2tensor.py b/gensim/scripts/word2vec2tensor.py index 883de12298..d2184b15c0 100644 --- a/gensim/scripts/word2vec2tensor.py +++ b/gensim/scripts/word2vec2tensor.py @@ -50,7 +50,7 @@ def word2vec2tensor(word2vec_model_path,tensor_filename, binary=False): with open(outfiletsv, 'w+') as file_vector: with open(outfiletsvmeta, 'w+') as file_metadata: - for word in model.wv.index2word: + for word in model.index2word: file_metadata.write(word.encode('utf-8') + '\n') vector_row = '\t'.join(map(str, model[word])) file_vector.write(vector_row + '\n')