From f5f92d657c05c0b856362a7ad67062d5d348ac24 Mon Sep 17 00:00:00 2001
From: Saltsmart <40828952+Saltsmart@users.noreply.github.com>
Date: Thu, 4 Aug 2022 15:01:17 +0800
Subject: [PATCH] [Model] update APIs for gensim 4.x (#361)

* rename parameters for gensim.models.Word2Vec
* update requirements for gensim
* update other APIs for gensim 4.2.0
* update test_deepwalk
* Delete .vscode directory
* Update setup.py
* 4.2.0
---
 cogdl/models/emb/deepwalk.py      |  4 ++--
 cogdl/models/emb/dgk.py           |  8 ++++----
 cogdl/models/emb/gatne.py         | 21 +++++++++++----------
 cogdl/models/emb/metapath2vec.py  |  4 ++--
 cogdl/models/emb/node2vec.py      |  4 ++--
 docs/requirements.txt             |  2 +-
 setup.py                          |  2 +-
 tests/models/emb/test_deepwalk.py |  6 +++---
 8 files changed, 26 insertions(+), 25 deletions(-)

diff --git a/cogdl/models/emb/deepwalk.py b/cogdl/models/emb/deepwalk.py
index f1b88627..4a6b27f2 100644
--- a/cogdl/models/emb/deepwalk.py
+++ b/cogdl/models/emb/deepwalk.py
@@ -59,12 +59,12 @@ def forward(self, graph, embedding_model_creator=Word2Vec, return_dict=False):
         print("training word2vec...")
         model = embedding_model_creator(
             walks,
-            size=self.dimension,
+            vector_size=self.dimension,
             window=self.window_size,
             min_count=0,
             sg=1,
             workers=self.worker,
-            iter=self.iteration,
+            epochs=self.iteration,
         )
         id2node = dict([(vid, node) for vid, node in enumerate(nx_g.nodes())])
         embeddings = np.asarray([model.wv[str(id2node[i])] for i in range(len(id2node))])
diff --git a/cogdl/models/emb/dgk.py b/cogdl/models/emb/dgk.py
index 7d4e6d2d..c00c78be 100644
--- a/cogdl/models/emb/dgk.py
+++ b/cogdl/models/emb/dgk.py
@@ -88,17 +88,17 @@ def forward(self, graphs, **kwargs):
 
         model = Word2Vec(
             self.gl_collections,
-            size=self.hidden_dim,
+            vector_size=self.hidden_dim,
             window=self.window,
             min_count=self.min_count,
             sample=self.sampling_rate,
             workers=self.n_workers,
-            iter=self.epochs,
+            epochs=self.epochs,
             alpha=self.alpha,
         )
-        vectors = np.asarray([model.wv[str(node)] for node in model.wv.index2word])
+        vectors = np.asarray([model.wv[str(node)] for node in model.wv.index_to_key])
         S = vectors.dot(vectors.T)
-        node2id = dict(zip(model.wv.index2word, range(len(model.wv.index2word))))
+        node2id = dict(zip(model.wv.index_to_key, range(len(model.wv.index_to_key))))
 
         num_graph, size_vocab = len(graphs), len(node2id)
         norm_prob = np.zeros((num_graph, size_vocab))
diff --git a/cogdl/models/emb/gatne.py b/cogdl/models/emb/gatne.py
index fbe0d46d..50ed92ac 100644
--- a/cogdl/models/emb/gatne.py
+++ b/cogdl/models/emb/gatne.py
@@ -1,7 +1,8 @@
 import numpy as np
 import networkx as nx
 from collections import defaultdict
-from gensim.models.keyedvectors import Vocab
+from gensim.models.keyedvectors import Vocab  # In gensim 4.x, Vocab is an alias of CompatVocab, retained to ease the loading of older models.
+# See: https://radimrehurek.com/gensim/models/keyedvectors.html?highlight=vocab#gensim.models.keyedvectors.CompatVocab
 import random
 import math
 import tqdm
@@ -110,12 +111,12 @@ def __init__(
 
     def forward(self, network_data):
         device = "cpu" if not torch.cuda.is_available() else "cuda"
         all_walks = generate_walks(network_data, self.walk_num, self.walk_length, schema=self.schema)
-        vocab, index2word = generate_vocab(all_walks)
+        vocab, index_to_key = generate_vocab(all_walks)
         train_pairs = generate_pairs(all_walks, vocab)
 
         edge_types = list(network_data.keys())
 
-        num_nodes = len(index2word)
+        num_nodes = len(index_to_key)
         edge_type_count = len(edge_types)
         epochs = self.epochs
@@ -189,7 +190,7 @@ def forward(self, network_data):
             node_neigh = torch.tensor([neighbors[i] for _ in range(edge_type_count)]).to(device)
             node_emb = model(train_inputs, train_types, node_neigh)
             for j in range(edge_type_count):
-                final_model[edge_types[j]][index2word[i]] = node_emb[j].cpu().detach().numpy()
+                final_model[edge_types[j]][index_to_key[i]] = node_emb[j].cpu().detach().numpy()
 
         return final_model
 
@@ -349,7 +350,7 @@ def generate_pairs(all_walks, vocab, window_size=5):
 
 
 def generate_vocab(all_walks):
-    index2word = []
+    index_to_key = []
     raw_vocab = defaultdict(int)
 
     for walks in all_walks:
@@ -359,14 +360,14 @@ def generate_vocab(all_walks):
     vocab = {}
 
     for word, v in raw_vocab.items():
-        vocab[word] = Vocab(count=v, index=len(index2word))
-        index2word.append(word)
+        vocab[word] = Vocab(count=v, index=len(index_to_key))
+        index_to_key.append(word)
 
-    index2word.sort(key=lambda word: vocab[word].count, reverse=True)
-    for i, word in enumerate(index2word):
+    index_to_key.sort(key=lambda word: vocab[word].count, reverse=True)
+    for i, word in enumerate(index_to_key):
         vocab[word].index = i
 
-    return vocab, index2word
+    return vocab, index_to_key
 
 
 def get_batches(pairs, neighbors, batch_size):
diff --git a/cogdl/models/emb/metapath2vec.py b/cogdl/models/emb/metapath2vec.py
index 285d8f2c..8ab2ec25 100644
--- a/cogdl/models/emb/metapath2vec.py
+++ b/cogdl/models/emb/metapath2vec.py
@@ -73,12 +73,12 @@ def forward(self, data):
         walks = [[str(node) for node in walk] for walk in walks]
         model = Word2Vec(
             walks,
-            size=self.dimension,
+            vector_size=self.dimension,
             window=self.window_size,
             min_count=0,
             sg=1,
             workers=self.worker,
-            iter=self.iteration,
+            epochs=self.iteration,
         )
         id2node = dict([(vid, node) for vid, node in enumerate(G.nodes())])
         embeddings = np.asarray([model.wv[str(id2node[i])] for i in range(len(id2node))])
diff --git a/cogdl/models/emb/node2vec.py b/cogdl/models/emb/node2vec.py
index 1217bfe9..1b0f4be8 100644
--- a/cogdl/models/emb/node2vec.py
+++ b/cogdl/models/emb/node2vec.py
@@ -82,12 +82,12 @@ def forward(self, graph, return_dict=False):
         walks = [[str(node) for node in walk] for walk in walks]
         model = Word2Vec(
             walks,
-            size=self.dimension,
+            vector_size=self.dimension,
             window=self.window_size,
             min_count=0,
             sg=1,
             workers=self.worker,
-            iter=self.iteration,
+            epochs=self.iteration,
         )
         id2node = dict([(vid, node) for vid, node in enumerate(G.nodes())])
         embeddings = np.asarray([model.wv[str(id2node[i])] for i in range(len(id2node))])
diff --git a/docs/requirements.txt b/docs/requirements.txt
index 47fddc71..e2c126ce 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -8,7 +8,7 @@ matplotlib
 tqdm
 numpy>=1.21
 scipy
-gensim<4.0
+gensim>=4.0
 grave
 scikit_learn==0.24.2
 tabulate
diff --git a/setup.py b/setup.py
index a2288fff..e83a367f 100644
--- a/setup.py
+++ b/setup.py
@@ -78,7 +78,7 @@ def find_version(filename):
         "tqdm",
         "numpy>=1.21",
         "scipy",
-        "gensim<4.0",
+        "gensim>=4.0",
         "grave",
         "scikit_learn",
         "tabulate",
diff --git a/tests/models/emb/test_deepwalk.py b/tests/models/emb/test_deepwalk.py
index 9c674644..f1a0e900 100644
--- a/tests/models/emb/test_deepwalk.py
+++ b/tests/models/emb/test_deepwalk.py
@@ -20,7 +20,7 @@ def __init__(self, data: Dict[str, List[float]]) -> None:
 embed_3 = [0.3, 0.2, 0.1, -0.1]
 
 
-def creator(walks, size, window, min_count, sg, workers, iter):
+def creator(walks, vector_size, window, min_count, sg, workers, epochs):
     return Word2VecFake({"0": embed_1, "1": embed_2, "2": embed_3})
 
 
@@ -93,9 +93,9 @@ def test_will_pass_correct_number_of_walks():
     graph = Graph(edge_index=(torch.LongTensor([0, 1]), torch.LongTensor([1, 2])))
     captured_walks_no = []
 
-    def creator_mocked(walks, size, window, min_count, sg, workers, iter):
+    def creator_mocked(walks, vector_size, window, min_count, sg, workers, epochs):
         captured_walks_no.append(len(walks))
-        return creator(walks, size, window, min_count, sg, workers, iter)
+        return creator(walks, vector_size, window, min_count, sg, workers, epochs)
 
     model(graph, creator_mocked)
     assert captured_walks_no[0] == args.walk_num * graph.num_nodes
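
For anyone porting similar embedding code, the sketch below illustrates the gensim 3.x -> 4.x renames this patch applies: size becomes vector_size, iter becomes epochs, and wv.index2word becomes wv.index_to_key. It is a standalone toy example, not code from this repository; the walks and hyperparameter values are made up.

# A minimal sketch of the gensim 4.x Word2Vec API adopted by this patch.
# Not CogDL code: the toy walks and hyperparameter values are illustrative.
import numpy as np
from gensim.models import Word2Vec

# Random-walk "sentences" over string node ids, as the embedding models produce.
walks = [["0", "1", "2"], ["2", "1", "0"], ["1", "0", "2"]]

model = Word2Vec(
    walks,
    vector_size=8,  # called `size` in gensim < 4.0
    window=5,
    min_count=0,
    sg=1,
    workers=1,
    epochs=5,  # called `iter` in gensim < 4.0
)

# wv.index_to_key replaces wv.index2word; wv.key_to_index replaces the old
# wv.vocab dict for key -> position lookups.
node2id = {node: i for i, node in enumerate(model.wv.index_to_key)}
vectors = np.asarray([model.wv[node] for node in model.wv.index_to_key])
assert vectors.shape == (len(node2id), 8)

Note that gensim 4.x keeps Vocab only as an alias of CompatVocab (hence the comment added in gatne.py, which builds its own vocabulary from Vocab objects); new code can use plain dicts or wv.key_to_index for that bookkeeping.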