diff --git a/cogdl/datasets/kg_data.py b/cogdl/datasets/kg_data.py
index 48959eea..71793fdd 100644
--- a/cogdl/datasets/kg_data.py
+++ b/cogdl/datasets/kg_data.py
@@ -1,10 +1,186 @@
 import os.path as osp
 
+import numpy as np
+
 import torch
 
 from cogdl.data import Graph, Dataset
 from cogdl.utils import download_url
 
 
+class BidirectionalOneShotIterator(object):
+    def __init__(self, dataloader_head, dataloader_tail):
+        self.iterator_head = self.one_shot_iterator(dataloader_head)
+        self.iterator_tail = self.one_shot_iterator(dataloader_tail)
+        self.step = 0
+
+    def __next__(self):
+        # alternate between head-batch and tail-batch corruption
+        self.step += 1
+        if self.step % 2 == 0:
+            data = next(self.iterator_head)
+        else:
+            data = next(self.iterator_tail)
+        return data
+
+    @staticmethod
+    def one_shot_iterator(dataloader):
+        """
+        Transform a PyTorch DataLoader into an infinite Python iterator
+        """
+        while True:
+            for data in dataloader:
+                yield data
+
+
+class TestDataset(torch.utils.data.Dataset):
+    def __init__(self, triples, all_true_triples, nentity, nrelation, mode):
+        self.len = len(triples)
+        self.triple_set = set(all_true_triples)
+        self.triples = triples
+        self.nentity = nentity
+        self.nrelation = nrelation
+        self.mode = mode
+
+    def __len__(self):
+        return self.len
+
+    def __getitem__(self, idx):
+        head, relation, tail = self.triples[idx]
+
+        # rank the true entity against all entities; known true triples get bias -1
+        if self.mode == "head-batch":
+            tmp = [
+                (0, rand_head) if (rand_head, relation, tail) not in self.triple_set else (-1, head)
+                for rand_head in range(self.nentity)
+            ]
+            tmp[head] = (0, head)
+        elif self.mode == "tail-batch":
+            tmp = [
+                (0, rand_tail) if (head, relation, rand_tail) not in self.triple_set else (-1, tail)
+                for rand_tail in range(self.nentity)
+            ]
+            tmp[tail] = (0, tail)
+        else:
+            raise ValueError("negative batch mode %s not supported" % self.mode)
+
+        tmp = torch.LongTensor(tmp)
+        filter_bias = tmp[:, 0].float()
+        negative_sample = tmp[:, 1]
+
+        positive_sample = torch.LongTensor((head, relation, tail))
+
+        return positive_sample, negative_sample, filter_bias, self.mode
+
+    @staticmethod
+    def collate_fn(data):
+        positive_sample = torch.stack([_[0] for _ in data], dim=0)
+        negative_sample = torch.stack([_[1] for _ in data], dim=0)
+        filter_bias = torch.stack([_[2] for _ in data], dim=0)
+        mode = data[0][3]
+        return positive_sample, negative_sample, filter_bias, mode
+
+
+class TrainDataset(torch.utils.data.Dataset):
+    def __init__(self, triples, nentity, nrelation, negative_sample_size, mode):
+        self.len = len(triples)
+        self.triples = triples
+        self.triple_set = set(triples)
+        self.nentity = nentity
+        self.nrelation = nrelation
+        self.negative_sample_size = negative_sample_size
+        self.mode = mode
+        self.count = self.count_frequency(triples)
+        self.true_head, self.true_tail = self.get_true_head_and_tail(self.triples)
+
+    def __len__(self):
+        return self.len
+
+    def __getitem__(self, idx):
+        positive_sample = self.triples[idx]
+
+        head, relation, tail = positive_sample
+
+        subsampling_weight = self.count[(head, relation)] + self.count[(tail, -relation - 1)]
+        subsampling_weight = torch.sqrt(1 / torch.Tensor([subsampling_weight]))
+
+        negative_sample_list = []
+        negative_sample_size = 0
+
+        # rejection-sample candidates until enough filtered negatives are collected
+        while negative_sample_size < self.negative_sample_size:
+            negative_sample = np.random.randint(self.nentity, size=self.negative_sample_size * 2)
+            if self.mode == "head-batch":
+                mask = np.in1d(negative_sample, self.true_head[(relation, tail)], assume_unique=True, invert=True)
+            elif self.mode == "tail-batch":
+                mask = np.in1d(negative_sample, self.true_tail[(head, relation)], assume_unique=True, invert=True)
+            else:
+                raise ValueError("Training batch mode %s not supported" % self.mode)
+            negative_sample = negative_sample[mask]
+            negative_sample_list.append(negative_sample)
+            negative_sample_size += negative_sample.size
+
+        negative_sample = np.concatenate(negative_sample_list)[: self.negative_sample_size]
+
+        negative_sample = torch.LongTensor(negative_sample)
+
+        positive_sample = torch.LongTensor(positive_sample)
+
+        return positive_sample, negative_sample, subsampling_weight, self.mode
+
+    @staticmethod
+    def collate_fn(data):
+        positive_sample = torch.stack([_[0] for _ in data], dim=0)
+        negative_sample = torch.stack([_[1] for _ in data], dim=0)
+        subsample_weight = torch.cat([_[2] for _ in data], dim=0)
+        mode = data[0][3]
+        return positive_sample, negative_sample, subsample_weight, mode
+
+    @staticmethod
+    def count_frequency(triples, start=4):
+        """
+        Get the frequency of a partial triple like (head, relation) or (relation, tail).
+        The frequency will be used for subsampling, as in word2vec.
+        """
+        count = {}
+        for head, relation, tail in triples:
+            if (head, relation) not in count:
+                count[(head, relation)] = start
+            else:
+                count[(head, relation)] += 1
+
+            if (tail, -relation - 1) not in count:
+                count[(tail, -relation - 1)] = start
+            else:
+                count[(tail, -relation - 1)] += 1
+        return count
+
+    @staticmethod
+    def get_true_head_and_tail(triples):
+        """
+        Build dictionaries of the true heads and true tails, used to filter
+        out true triples during negative sampling
+        """
+        true_head = {}
+        true_tail = {}
+
+        for head, relation, tail in triples:
+            if (head, relation) not in true_tail:
+                true_tail[(head, relation)] = []
+            true_tail[(head, relation)].append(tail)
+            if (relation, tail) not in true_head:
+                true_head[(relation, tail)] = []
+            true_head[(relation, tail)].append(head)
+
+        for relation, tail in true_head:
+            true_head[(relation, tail)] = np.array(list(set(true_head[(relation, tail)])))
+        for head, relation in true_tail:
+            true_tail[(head, relation)] = np.array(list(set(true_tail[(head, relation)])))
+
+        return true_head, true_tail
+
+
 def read_triplet_data(folder):
     filenames = ["train2id.txt", "valid2id.txt", "test2id.txt"]
     count = 0
@@ -18,7 +194,6 @@ def read_triplet_data(folder):
     relation_dic = {}
     for filename in filenames:
         with open(osp.join(folder, filename), "r") as f:
-            _ = int(f.readline().strip())
             if "train" in filename:
                 train_start_idx = len(triples)
             elif "valid" in filename:
@@ -27,6 +202,8 @@ def read_triplet_data(folder):
                 test_start_idx = len(triples)
             for line in f:
                 items = line.strip().split()
+                if len(items) != 3:
+                    continue
                 edge_index.append([int(items[0]), int(items[1])])
                 edge_attr.append(int(items[2]))
                 triples.append((int(items[0]), int(items[2]), int(items[1])))
@@ -110,9 +287,8 @@ def get(self, idx):
 
     def download(self):
        for name in self.raw_file_names:
-            # download_url("{}/{}/{}".format(self.url, self.name, name), self.raw_dir)
             download_url(self.url.format(self.name, name), self.raw_dir, name=name)
-
+
     def process(self):
         (
             data,
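Reviewer note: a minimal smoke-test sketch, not part of the patch (the toy entity/relation ids are made up), showing how `TrainDataset` and `BidirectionalOneShotIterator` compose. Each `next()` call alternates between head-batch and tail-batch corruption and yields `(positive, negatives, weight, mode)`:

```python
# Hypothetical smoke test for the new sampling utilities (illustration only).
from torch.utils.data import DataLoader

from cogdl.datasets.kg_data import BidirectionalOneShotIterator, TrainDataset

triples = [(0, 0, 1), (1, 0, 2), (2, 1, 0)]  # toy (head, relation, tail) ids
nentity, nrelation, neg_size = 3, 2, 4

head_loader = DataLoader(
    TrainDataset(triples, nentity, nrelation, neg_size, "head-batch"),
    batch_size=2, shuffle=True, collate_fn=TrainDataset.collate_fn,
)
tail_loader = DataLoader(
    TrainDataset(triples, nentity, nrelation, neg_size, "tail-batch"),
    batch_size=2, shuffle=True, collate_fn=TrainDataset.collate_fn,
)
it = BidirectionalOneShotIterator(head_loader, tail_loader)

positive, negative, weight, mode = next(it)
print(positive.shape, negative.shape, mode)  # torch.Size([2, 3]) torch.Size([2, 4]) 'tail-batch' (alternates)
```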
diff --git a/cogdl/experiments.py b/cogdl/experiments.py
index 585852ea..dbf52729 100644
--- a/cogdl/experiments.py
+++ b/cogdl/experiments.py
@@ -206,10 +206,12 @@ def train(args):  # noqa: C901
         logger=args.logger,
         log_path=args.log_path,
         project=args.project,
-        no_test=args.no_test,
+        return_model=args.return_model,
         nstage=args.nstage,
         actnn=args.actnn,
         fp16=args.fp16,
+        do_test=args.do_test,
+        do_valid=args.do_valid,
     )
 
     # Go!!!
diff --git a/cogdl/models/__init__.py b/cogdl/models/__init__.py
index d33177b5..50c8039f 100644
--- a/cogdl/models/__init__.py
+++ b/cogdl/models/__init__.py
@@ -44,6 +44,10 @@ def build_model(args):
 
 SUPPORTED_MODELS = {
+    "transe": "cogdl.models.emb.transe.TransE",
+    "complex": "cogdl.models.emb.complex.ComplEx",
+    "distmult": "cogdl.models.emb.distmult.DistMult",
+    "rotate": "cogdl.models.emb.rotate.RotatE",
     "hope": "cogdl.models.emb.hope.HOPE",
     "spectral": "cogdl.models.emb.spectral.Spectral",
     "hin2vec": "cogdl.models.emb.hin2vec.Hin2vec",
diff --git a/cogdl/models/emb/complex.py b/cogdl/models/emb/complex.py
new file mode 100644
index 00000000..0f6f57b8
--- /dev/null
+++ b/cogdl/models/emb/complex.py
@@ -0,0 +1,37 @@
+import torch
+
+from .knowledge_base import KGEModel
+
+
+class ComplEx(KGEModel):
+    r"""
+    Implementation of the ComplEx model from the paper `"Complex Embeddings for Simple Link Prediction"
+    <https://arxiv.org/abs/1606.06357>`_, borrowed from `KnowledgeGraphEmbedding`.
+    """
+
+    @staticmethod
+    def add_args(parser):
+        """Add model-specific arguments to the parser."""
+        parser.add_argument("--embedding_size", type=int, default=500, help="Dimensionality of embedded vectors")
+        parser.add_argument("--gamma", type=float, default=12.0, help="Hyperparameter for embedding")
+        # ComplEx stores real and imaginary parts, so both embeddings are doubled by default
+        parser.add_argument("--double_entity_embedding", default=True)
+        parser.add_argument("--double_relation_embedding", default=True)
+
+    def score(self, head, relation, tail, mode):
+        re_head, im_head = torch.chunk(head, 2, dim=2)
+        re_relation, im_relation = torch.chunk(relation, 2, dim=2)
+        re_tail, im_tail = torch.chunk(tail, 2, dim=2)
+
+        if mode == "head-batch":
+            re_score = re_relation * re_tail + im_relation * im_tail
+            im_score = re_relation * im_tail - im_relation * re_tail
+            score = re_head * re_score + im_head * im_score
+        else:
+            re_score = re_head * re_relation - im_head * im_relation
+            im_score = re_head * im_relation + im_head * re_relation
+            score = re_score * re_tail + im_score * im_tail
+
+        score = score.sum(dim=2)
+        return score
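Aside for reviewers (not part of the patch): the two branches of `ComplEx.score` compute the same trilinear form Re(&lt;h, r, conj(t)&gt;); they are only factored differently so the product broadcasts cheaply against whichever side holds the negative candidates. A quick numeric check:

```python
# Sanity check: both factorings of the ComplEx score agree (illustration only).
import torch

d = 4  # complex dimension
re_h, im_h = torch.randn(1, 1, d), torch.randn(1, 1, d)
re_r, im_r = torch.randn(1, 1, d), torch.randn(1, 1, d)
re_t, im_t = torch.randn(1, 1, d), torch.randn(1, 1, d)

# tail-batch factoring: Re((h * r) * conj(t))
re_hr = re_h * re_r - im_h * im_r
im_hr = re_h * im_r + im_h * re_r
tail_batch = (re_hr * re_t + im_hr * im_t).sum(dim=2)

# head-batch factoring: Re(h * (r * conj(t))) -- same real part
re_rt = re_r * re_t + im_r * im_t
im_rt = re_r * im_t - im_r * re_t
head_batch = (re_h * re_rt + im_h * im_rt).sum(dim=2)

assert torch.allclose(tail_batch, head_batch, atol=1e-5)
```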
diff --git a/cogdl/models/emb/distmult.py b/cogdl/models/emb/distmult.py
new file mode 100644
index 00000000..d9f6dc0f
--- /dev/null
+++ b/cogdl/models/emb/distmult.py
@@ -0,0 +1,25 @@
+from .knowledge_base import KGEModel
+
+
+class DistMult(KGEModel):
+    r"""The DistMult model from the ICLR 2015 paper `"Embedding Entities and Relations for Learning
+    and Inference in Knowledge Bases" <https://arxiv.org/abs/1412.6575>`_,
+    borrowed from `KnowledgeGraphEmbedding`.
+    """
+
+    def __init__(
+        self, nentity, nrelation, hidden_dim, gamma, double_entity_embedding=False, double_relation_embedding=False
+    ):
+        super(DistMult, self).__init__(
+            nentity, nrelation, hidden_dim, gamma, double_entity_embedding, double_relation_embedding
+        )
+
+    def score(self, head, relation, tail, mode):
+        # elementwise multiplication is associative, so both branches compute the
+        # same trilinear score sum(h * r * t); the grouping only affects broadcasting cost
+        if mode == "head-batch":
+            score = head * (relation * tail)
+        else:
+            score = (head * relation) * tail
+
+        score = score.sum(dim=2)
+        return score
diff --git a/cogdl/models/emb/knowledge_base.py b/cogdl/models/emb/knowledge_base.py
new file mode 100644
index 00000000..52214535
--- /dev/null
+++ b/cogdl/models/emb/knowledge_base.py
@@ -0,0 +1,104 @@
+import torch
+import torch.nn as nn
+
+from .. import BaseModel
+
+
+class KGEModel(BaseModel):
+    @staticmethod
+    def add_args(parser):
+        """Add model-specific arguments to the parser."""
+        parser.add_argument("--embedding_size", type=int, default=500, help="Dimensionality of embedded vectors")
+        parser.add_argument("--gamma", type=float, default=12.0, help="Hyperparameter for embedding")
+        parser.add_argument("--double_entity_embedding", action="store_true")
+        parser.add_argument("--double_relation_embedding", action="store_true")
+
+    @classmethod
+    def build_model_from_args(cls, args):
+        return cls(
+            args.num_entities,
+            args.num_rels,
+            args.embedding_size,
+            args.gamma,
+            args.double_entity_embedding,
+            args.double_relation_embedding,
+        )
+
+    def __init__(
+        self, nentity, nrelation, hidden_dim, gamma, double_entity_embedding=False, double_relation_embedding=False
+    ):
+        super(KGEModel, self).__init__()
+        self.nentity = nentity
+        self.nrelation = nrelation
+        self.hidden_dim = hidden_dim
+        self.epsilon = 2.0
+
+        self.gamma = nn.Parameter(torch.Tensor([gamma]), requires_grad=False)
+
+        self.embedding_range = nn.Parameter(
+            torch.Tensor([(self.gamma.item() + self.epsilon) / hidden_dim]), requires_grad=False
+        )
+
+        self.entity_dim = hidden_dim * 2 if double_entity_embedding else hidden_dim
+        self.relation_dim = hidden_dim * 2 if double_relation_embedding else hidden_dim
+
+        self.entity_embedding = nn.Parameter(torch.zeros(nentity, self.entity_dim))
+        nn.init.uniform_(tensor=self.entity_embedding, a=-self.embedding_range.item(), b=self.embedding_range.item())
+
+        self.relation_embedding = nn.Parameter(torch.zeros(nrelation, self.relation_dim))
+        nn.init.uniform_(tensor=self.relation_embedding, a=-self.embedding_range.item(), b=self.embedding_range.item())
+
+    def forward(self, sample, mode="single"):
+        """
+        Forward function that calculates the score of a batch of triples.
+        In the 'single' mode, sample is a batch of triples.
+        In the 'head-batch' or 'tail-batch' mode, sample consists of two parts:
+        the first is usually the positive sample, and the second holds the
+        entities of the negative samples. This works because negative and
+        positive samples usually share two elements of their triple
+        ((head, relation) or (relation, tail)).
+        """
+        if mode == "single":
+            batch_size, negative_sample_size = sample.size(0), 1
+
+            head = torch.index_select(self.entity_embedding, dim=0, index=sample[:, 0]).unsqueeze(1)
+
+            relation = torch.index_select(self.relation_embedding, dim=0, index=sample[:, 1]).unsqueeze(1)
+
+            tail = torch.index_select(self.entity_embedding, dim=0, index=sample[:, 2]).unsqueeze(1)
+
+        elif mode == "head-batch":
+            tail_part, head_part = sample
+            batch_size, negative_sample_size = head_part.size(0), head_part.size(1)
+
+            head = torch.index_select(self.entity_embedding, dim=0, index=head_part.view(-1)).view(
+                batch_size, negative_sample_size, -1
+            )
+
+            relation = torch.index_select(self.relation_embedding, dim=0, index=tail_part[:, 1]).unsqueeze(1)
+
+            tail = torch.index_select(self.entity_embedding, dim=0, index=tail_part[:, 2]).unsqueeze(1)
+
+        elif mode == "tail-batch":
+            head_part, tail_part = sample
+            batch_size, negative_sample_size = tail_part.size(0), tail_part.size(1)
+
+            head = torch.index_select(self.entity_embedding, dim=0, index=head_part[:, 0]).unsqueeze(1)
+
+            relation = torch.index_select(self.relation_embedding, dim=0, index=head_part[:, 1]).unsqueeze(1)
+
+            tail = torch.index_select(self.entity_embedding, dim=0, index=tail_part.view(-1)).view(
+                batch_size, negative_sample_size, -1
+            )
+
+        else:
+            raise ValueError("mode %s not supported" % mode)
+
+        score = self.score(head, relation, tail, mode)
+
+        return score
+
+    def score(self, head, relation, tail, mode):
+        raise NotImplementedError
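For reviewers: a small usage sketch of the three `forward` modes (made-up sizes and ids, not from the patch), using the `DistMult` subclass added above. In both negative-sampling modes the tuple is `(positive_triples, negative_candidates)`:

```python
# Hypothetical usage of the three scoring modes (illustration only).
import torch

from cogdl.models.emb.distmult import DistMult

model = DistMult(nentity=10, nrelation=3, hidden_dim=8, gamma=12.0)

# 'single': one score per (head, relation, tail) row
triples = torch.tensor([[0, 1, 2], [3, 0, 4]])               # (batch, 3)
print(model(triples).shape)                                  # torch.Size([2, 1])

# 'tail-batch': candidates replace the tail of each positive triple
neg_tails = torch.tensor([[5, 6, 7], [1, 8, 9]])             # (batch, num_neg)
print(model((triples, neg_tails), mode="tail-batch").shape)  # torch.Size([2, 3])

# 'head-batch': same tuple layout, candidates replace the head instead
neg_heads = torch.tensor([[5, 6, 7], [1, 8, 9]])
print(model((triples, neg_heads), mode="head-batch").shape)  # torch.Size([2, 3])
```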
+ """ + + if mode == "single": + batch_size, negative_sample_size = sample.size(0), 1 + + head = torch.index_select(self.entity_embedding, dim=0, index=sample[:, 0]).unsqueeze(1) + + relation = torch.index_select(self.relation_embedding, dim=0, index=sample[:, 1]).unsqueeze(1) + + tail = torch.index_select(self.entity_embedding, dim=0, index=sample[:, 2]).unsqueeze(1) + + elif mode == "head-batch": + tail_part, head_part = sample + batch_size, negative_sample_size = head_part.size(0), head_part.size(1) + + head = torch.index_select(self.entity_embedding, dim=0, index=head_part.view(-1)).view( + batch_size, negative_sample_size, -1 + ) + + relation = torch.index_select(self.relation_embedding, dim=0, index=tail_part[:, 1]).unsqueeze(1) + + tail = torch.index_select(self.entity_embedding, dim=0, index=tail_part[:, 2]).unsqueeze(1) + + elif mode == "tail-batch": + head_part, tail_part = sample + batch_size, negative_sample_size = tail_part.size(0), tail_part.size(1) + + head = torch.index_select(self.entity_embedding, dim=0, index=head_part[:, 0]).unsqueeze(1) + + relation = torch.index_select(self.relation_embedding, dim=0, index=head_part[:, 1]).unsqueeze(1) + + tail = torch.index_select(self.entity_embedding, dim=0, index=tail_part.view(-1)).view( + batch_size, negative_sample_size, -1 + ) + + else: + raise ValueError("mode %s not supported" % mode) + + score = self.score(head, relation, tail, mode) + + return score + + def score(self, head, relation, tail, mode): + raise NotImplementedError diff --git a/cogdl/models/emb/rotate.py b/cogdl/models/emb/rotate.py new file mode 100644 index 00000000..bd8f66e3 --- /dev/null +++ b/cogdl/models/emb/rotate.py @@ -0,0 +1,57 @@ +from email.policy import default +import torch +from torch import Tensor +import torch.nn as nn +import torch.nn.functional as F + +from .. import BaseModel +from .knowledge_base import KGEModel + + +class RotatE(KGEModel): + r""" + Implementation of RotatE model from the paper `"RotatE: Knowledge Graph Embedding by Relational Rotation in Complex Space" + `. 
diff --git a/cogdl/models/emb/transe.py b/cogdl/models/emb/transe.py
new file mode 100644
index 00000000..c2ad91d6
--- /dev/null
+++ b/cogdl/models/emb/transe.py
@@ -0,0 +1,29 @@
+import torch
+
+from .knowledge_base import KGEModel
+
+
+class TransE(KGEModel):
+    r"""The TransE model from the NIPS 2013 paper `"Translating Embeddings for Modeling
+    Multi-relational Data"`, borrowed from `KnowledgeGraphEmbedding`.
+    """
+
+    def __init__(
+        self, nentity, nrelation, hidden_dim, gamma, double_entity_embedding=False, double_relation_embedding=False
+    ):
+        # entity and relation embeddings are kept at the same (doubled) width so
+        # that head + relation - tail is well defined
+        super(TransE, self).__init__(nentity, nrelation, hidden_dim, gamma, True, True)
+
+    def score(self, head, relation, tail, mode):
+        # addition is associative; the grouping only affects broadcasting cost
+        if mode == "head-batch":
+            score = head + (relation - tail)
+        else:
+            score = (head + relation) - tail
+
+        score = self.gamma.item() - torch.norm(score, p=1, dim=2)
+        return score
diff --git a/cogdl/options.py b/cogdl/options.py
index 0bef1cc7..ceab714b 100644
--- a/cogdl/options.py
+++ b/cogdl/options.py
@@ -46,11 +46,13 @@ def get_parser():
     parser.add_argument("--master-port", type=int, default=13425)
     parser.add_argument("--master-addr", type=str, default="localhost")
 
-    parser.add_argument("--no-test", action="store_true")
+    parser.add_argument("--return_model", action="store_true")
     parser.add_argument("--actnn", action="store_true")
     parser.add_argument("--fp16", action="store_true")
     parser.add_argument("--rp-ratio", type=int, default=1)
+    parser.add_argument("--do_test", default=True)
+    parser.add_argument("--do_valid", default=True)
     # fmt: on
     return parser
diff --git a/cogdl/trainer/trainer.py b/cogdl/trainer/trainer.py
index bc6d0f25..e912411d 100644
--- a/cogdl/trainer/trainer.py
+++ b/cogdl/trainer/trainer.py
@@ -14,7 +14,14 @@
 from cogdl.wrappers.data_wrapper.base_data_wrapper import DataWrapper
 from cogdl.wrappers.model_wrapper.base_model_wrapper import ModelWrapper, EmbeddingModelWrapper
 
-from cogdl.trainer.trainer_utils import evaluation_comp, load_model, save_model, ddp_end, ddp_after_epoch, Printer
+from cogdl.trainer.trainer_utils import (
+    evaluation_comp,
+    load_model,
+    save_model,
+    ddp_end,
+    ddp_after_epoch,
+    Printer,
+)
 from cogdl.trainer.embed_trainer import EmbeddingTrainer
 from cogdl.trainer.controller import DataController
 from cogdl.loggers import build_logger
@@ -71,12 +78,14 @@ def __init__(
         logger: str = None,
         log_path: str = "./runs",
         project: str = "cogdl-exp",
-        no_test: bool = False,
+        return_model: bool = False,
         actnn: bool = False,
         fp16: bool = False,
         rp_ratio: int = 1,
         attack=None,
         attack_mode="injection",
+        do_test: bool = True,
+        do_valid: bool = True,
     ):
         self.epochs = epochs
         self.nstage = nstage
@@ -104,7 +113,7 @@ def __init__(
 
         self.cpu_inference = cpu_inference
 
-        self.no_test = no_test
+        self.return_model = return_model
 
         self.on_train_batch_transform = None
         self.on_eval_batch_transform = None
@@ -130,6 +139,8 @@ def __init__(
         self.fp16 = fp16
         self.attack = attack
         self.attack_mode = attack_mode
+        self.do_test = do_test
+        self.do_valid = do_valid
 
         if actnn:
             try:
@@ -193,7 +204,7 @@ def run(self, model_w: ModelWrapper, dataset_w: DataWrapper):
             self.train(self.devices[0], model_w, dataset_w)
             best_model_w = load_model(model_w, self.checkpoint_path).to(self.devices[0])
 
-        if self.no_test:
+        if self.return_model:
             return best_model_w.model
 
         final_test = self.evaluate(best_model_w, dataset_w)
@@ -212,8 +223,14 @@ def evaluate(self, model_w: ModelWrapper, dataset_w: DataWrapper, cpu=False):
         # disable `distributed` to inference once only
         self.distributed_training = False
         dataset_w.prepare_test_data()
-        final_val = self.validate(model_w, dataset_w, self.devices[0])
-        final_test = self.test(model_w, dataset_w, self.devices[0])
+        if self.do_valid:
+            final_val = self.validate(model_w, dataset_w, self.devices[0])
+        else:
+            final_val = {}
+        if self.do_test:
+            final_test = self.test(model_w, dataset_w, self.devices[0])
+        else:
+            final_test = {}
 
         if final_val is not None and "val_metric" in final_val:
             final_val[f"val_{self.evaluation_metric}"] = final_val["val_metric"]
@@ -364,23 +381,23 @@ def train(self, rank, model_w, dataset_w):  # noqa: C901
                     print_str_dict["train_loss"] = training_loss
 
             val_loader = dataset_w.on_val_wrapper()
-            if val_loader is not None and epoch % self.eval_step == 0:
+            if self.do_valid and val_loader is not None and epoch % self.eval_step == 0:
                 # inductive setting ..
                 dataset_w.eval()
                 # do validation in inference device
                 val_result = self.validate(model_w, dataset_w, rank)
                 if val_result is not None:
                     monitoring = val_result[self.monitor]
                     if compare_fn(monitoring, best_index):
                         best_index = monitoring
                         best_epoch = epoch
                         patience = 0
                         best_model_w = copy.deepcopy(model_w)
                     else:
                         patience += 1
                         if self.early_stopping and patience >= self.patience:
                             break
                 print_str_dict[f"val_{self.evaluation_metric}"] = monitoring
 
             if self.distributed_training:
                 if rank == 0:
diff --git a/cogdl/trainer/trainer_utils.py b/cogdl/trainer/trainer_utils.py
index 320982b5..2de9827d 100644
--- a/cogdl/trainer/trainer_utils.py
+++ b/cogdl/trainer/trainer_utils.py
@@ -1,7 +1,7 @@
 from typing import Dict
 
+import os
 import numpy as np
-
 import torch
 import torch.distributed as dist
 
@@ -54,6 +54,16 @@ def save_model(model, path, epoch):
     print(f"Saving {epoch}-th model to {path} ...")
     torch.save(model.state_dict(), path)
 
+    # KGE models additionally export their embedding tables as .npy files
+    if hasattr(model, "entity_embedding"):
+        entity_embedding = model.entity_embedding.detach().cpu().numpy()
+        print("Saving entity_embedding to", path)
+        np.save(os.path.join(path, "entity_embedding"), entity_embedding)
+
+    if hasattr(model, "relation_embedding"):
+        relation_embedding = model.relation_embedding.detach().cpu().numpy()
+        print("Saving relation_embedding to", path)
+        np.save(os.path.join(path, "relation_embedding"), relation_embedding)
+
 
 def load_model(model, path):
     print(f"Loading model from {path} ...")
diff --git a/cogdl/wrappers/data_wrapper/__init__.py b/cogdl/wrappers/data_wrapper/__init__.py
index e764dab6..8a63cbfc 100644
--- a/cogdl/wrappers/data_wrapper/__init__.py
+++ b/cogdl/wrappers/data_wrapper/__init__.py
@@ -31,6 +31,7 @@ def fetch_data_wrapper(name):
 
 SUPPORTED_DW = {
+    "triple_link_prediction_dw": "cogdl.wrappers.data_wrapper.link_prediction.TripleDataWrapper",
     "cluster_dw": "cogdl.wrappers.data_wrapper.node_classification.ClusterWrapper",
     "graphsage_dw": "cogdl.wrappers.data_wrapper.node_classification.GraphSAGEDataWrapper",
     "m3s_dw": "cogdl.wrappers.data_wrapper.node_classification.M3SDataWrapper",
diff --git a/cogdl/wrappers/data_wrapper/link_prediction/__init__.py b/cogdl/wrappers/data_wrapper/link_prediction/__init__.py
index df1c7b1f..d36a96c0 100644
--- a/cogdl/wrappers/data_wrapper/link_prediction/__init__.py
+++ b/cogdl/wrappers/data_wrapper/link_prediction/__init__.py
@@ -1,3 +1,4 @@
 from .embedding_link_prediction_dw import EmbeddingLinkPredictionDataWrapper
 from .gnn_kg_link_prediction_dw import GNNKGLinkPredictionDataWrapper
 from .gnn_link_prediction_dw import GNNLinkPredictionDataWrapper
+from .triple_link_prediction_dw import TripleDataWrapper
diff --git a/cogdl/wrappers/data_wrapper/link_prediction/gnn_link_prediction_dw.py b/cogdl/wrappers/data_wrapper/link_prediction/gnn_link_prediction_dw.py
index f55d7023..15ff1b9d 100644
--- a/cogdl/wrappers/data_wrapper/link_prediction/gnn_link_prediction_dw.py
+++ b/cogdl/wrappers/data_wrapper/link_prediction/gnn_link_prediction_dw.py
@@ -1,6 +1,5 @@
 import numpy as np
 import torch
-
 from .. import DataWrapper
diff --git a/cogdl/wrappers/data_wrapper/link_prediction/triple_link_prediction_dw.py b/cogdl/wrappers/data_wrapper/link_prediction/triple_link_prediction_dw.py
new file mode 100644
index 00000000..f62d2f7f
--- /dev/null
+++ b/cogdl/wrappers/data_wrapper/link_prediction/triple_link_prediction_dw.py
@@ -0,0 +1,91 @@
+from torch.utils.data import DataLoader
+
+from .. import DataWrapper
+from cogdl.datasets.kg_data import BidirectionalOneShotIterator, TestDataset, TrainDataset
+
+
+class TripleDataWrapper(DataWrapper):
+    @classmethod
+    def add_args(cls, parser):
+        # fmt: off
+        parser.add_argument("--batch_size", type=int, default=1024)
+        parser.add_argument("--test_batch_size", type=int, default=4)
+        cls.parser = parser
+        return cls.parser
+        # fmt: on
+
+    def __init__(self, dataset):
+        super(TripleDataWrapper, self).__init__(dataset)
+        self.args = self.parser.parse_args()
+        self.dataset = dataset
+        self.negative_sample_size = self.args.negative_sample_size
+        self.batch_size = self.args.batch_size
+
+    def train_wrapper(self):
+        dataset = self.dataset
+        train_iter = self.output_iter(dataset, self.negative_sample_size, self.batch_size)
+        return train_iter
+
+    def val_wrapper(self):
+        dataset = self.dataset
+        train_triples = dataset.triples[dataset.train_start_idx : dataset.valid_start_idx]
+        valid_triples = dataset.triples[dataset.valid_start_idx : dataset.test_start_idx]
+        test_triples = dataset.triples[dataset.test_start_idx :]
+        # the filter set must contain every known true triple, not only the training ones
+        all_true_triples = train_triples + valid_triples + test_triples
+        test_dataloader_head = DataLoader(
+            TestDataset(valid_triples, all_true_triples, dataset.num_entities, dataset.num_relations, "head-batch"),
+            batch_size=self.args.test_batch_size,
+            collate_fn=TestDataset.collate_fn,
+        )
+        test_dataloader_tail = DataLoader(
+            TestDataset(valid_triples, all_true_triples, dataset.num_entities, dataset.num_relations, "tail-batch"),
+            batch_size=self.args.test_batch_size,
+            collate_fn=TestDataset.collate_fn,
+        )
+        return (test_dataloader_head, test_dataloader_tail)
+
+    def test_wrapper(self):
+        dataset = self.dataset
+        train_triples = dataset.triples[dataset.train_start_idx : dataset.valid_start_idx]
+        valid_triples = dataset.triples[dataset.valid_start_idx : dataset.test_start_idx]
+        test_triples = dataset.triples[dataset.test_start_idx :]
+        all_true_triples = train_triples + valid_triples + test_triples
+        test_dataloader_head = DataLoader(
+            TestDataset(test_triples, all_true_triples, dataset.num_entities, dataset.num_relations, "head-batch"),
+            batch_size=self.args.test_batch_size,
+            collate_fn=TestDataset.collate_fn,
+        )
+        test_dataloader_tail = DataLoader(
+            TestDataset(test_triples, all_true_triples, dataset.num_entities, dataset.num_relations, "tail-batch"),
+            batch_size=self.args.test_batch_size,
+            collate_fn=TestDataset.collate_fn,
+        )
+        return (test_dataloader_head, test_dataloader_tail)
+
+    @staticmethod
+    def output_iter(dataset, negative_sample_size, batch_size):
+        train_triples = dataset.triples[dataset.train_start_idx : dataset.valid_start_idx]
+        nentity, nrelation = dataset.num_entities, dataset.num_relations
+
+        # set up the alternating head-batch / tail-batch training iterator
+        train_dataloader_head = DataLoader(
+            TrainDataset(train_triples, nentity, nrelation, negative_sample_size, "head-batch"),
+            batch_size=batch_size,
+            shuffle=True,
+            collate_fn=TrainDataset.collate_fn,
+        )
+        train_dataloader_tail = DataLoader(
+            TrainDataset(train_triples, nentity, nrelation, negative_sample_size, "tail-batch"),
+            batch_size=batch_size,
+            shuffle=True,
+            collate_fn=TrainDataset.collate_fn,
+        )
+        train_iterator = BidirectionalOneShotIterator(train_dataloader_head, train_dataloader_tail)
+        return train_iterator
diff --git a/cogdl/wrappers/default_match.py b/cogdl/wrappers/default_match.py
index 3bbb187f..f75880ea 100644
--- a/cogdl/wrappers/default_match.py
+++ b/cogdl/wrappers/default_match.py
@@ -79,6 +79,11 @@ def set_default_wrapper_config():
         "pte",
         "hin2vec",
     ]
+    triple_link_prediction_models = ["transe", "distmult", "rotate", "complex"]
+    triple_link_prediction_wrappers = dict()
+    for item in triple_link_prediction_models:
+        triple_link_prediction_wrappers[item] = {"mw": "triple_link_prediction_mw", "dw": "triple_link_prediction_dw"}
+
     node_classification_wrappers = dict()
 
     for item in node_classification_models:
@@ -147,6 +152,7 @@ def set_default_wrapper_config():
     merged.update(heterogeneous_gnn_wrappers)
     merged.update(heterogeneous_emb_wrappers)
     merged.update(other_wrappers)
+    merged.update(triple_link_prediction_wrappers)
 
     return merged
diff --git a/cogdl/wrappers/model_wrapper/__init__.py b/cogdl/wrappers/model_wrapper/__init__.py
index ff3d7446..edcf778a 100644
--- a/cogdl/wrappers/model_wrapper/__init__.py
+++ b/cogdl/wrappers/model_wrapper/__init__.py
@@ -32,6 +32,7 @@ def fetch_model_wrapper(name):
 
 SUPPORTED_MW = {
+    "triple_link_prediction_mw": "cogdl.wrappers.model_wrapper.link_prediction.TripleModelWrapper",
     "dgi_mw": "cogdl.wrappers.model_wrapper.node_classification.DGIModelWrapper",
     "gcnmix_mw": "cogdl.wrappers.model_wrapper.node_classification.GCNMixModelWrapper",
     "grace_mw": "cogdl.wrappers.model_wrapper.node_classification.GRACEModelWrapper",
diff --git a/cogdl/wrappers/model_wrapper/link_prediction/__init__.py b/cogdl/wrappers/model_wrapper/link_prediction/__init__.py
index 59eca4e7..9e958b20 100644
--- a/cogdl/wrappers/model_wrapper/link_prediction/__init__.py
+++ b/cogdl/wrappers/model_wrapper/link_prediction/__init__.py
@@ -1,3 +1,4 @@
 from .embedding_link_prediction_mw import EmbeddingLinkPredictionModelWrapper
 from .gnn_kg_link_prediction_mw import GNNKGLinkPredictionModelWrapper
 from .gnn_link_prediction_mw import GNNLinkPredictionModelWrapper
+from .triple_link_prediction_mw import TripleModelWrapper
diff --git a/cogdl/wrappers/model_wrapper/link_prediction/triple_link_prediction_mw.py b/cogdl/wrappers/model_wrapper/link_prediction/triple_link_prediction_mw.py
new file mode 100644
index 00000000..6a20fdd2
--- /dev/null
+++ b/cogdl/wrappers/model_wrapper/link_prediction/triple_link_prediction_mw.py
@@ -0,0 +1,150 @@
+import torch
+import torch.nn.functional as F
+
+from .. import ModelWrapper
+
+
+class TripleModelWrapper(ModelWrapper):
+    @classmethod
+    def add_args(cls, parser):
+        # fmt: off
+        parser.add_argument("--negative_adversarial_sampling", default=False)
+        parser.add_argument("--negative_sample_size", type=int, default=128)
+        parser.add_argument("--uni_weight", action="store_true", help="Otherwise use subsampling weighting like in word2vec")
+        parser.add_argument("--regularization", default=1e-9, type=float)
+        parser.add_argument("--lr", default=0.001, type=float)
+        parser.add_argument("--adversarial_temperature", default=1.0, type=float)
+        parser.add_argument("--save-emb-path", default="./checkpoints")
+        parser.add_argument("--eval-step", type=int, default=501)
+        parser.add_argument("--do_test", default=True)
+        parser.add_argument("--do_valid", default=True)
+        cls.parser = parser
+        return cls.parser
+        # fmt: on
+
+    def __init__(self, model, optimizer_cfg):
+        super(TripleModelWrapper, self).__init__()
+        self.model = model
+        self.optimizer_cfg = optimizer_cfg
+        self.args = self.parser.parse_args()
+
+    def train_step(self, subgraph):
+        """
+        A single train step: draw one batch from the training iterator,
+        compute the negative-sampling loss and return it.
+        """
+        train_iterator = subgraph
+        positive_sample, negative_sample, subsampling_weight, mode = next(train_iterator)
+
+        positive_sample = positive_sample.to(self.device)
+        negative_sample = negative_sample.to(self.device)
+        subsampling_weight = subsampling_weight.to(self.device)
+
+        negative_score = self.model((positive_sample, negative_sample), mode=mode)
+
+        if self.args.negative_adversarial_sampling:
+            # In self-adversarial sampling, we do not apply back-propagation on the sampling weight
+            negative_score = (
+                F.softmax(negative_score * self.args.adversarial_temperature, dim=1).detach()
+                * F.logsigmoid(-negative_score)
+            ).sum(dim=1)
+        else:
+            negative_score = F.logsigmoid(-negative_score).mean(dim=1)
+
+        positive_score = self.model(positive_sample)
+        positive_score = F.logsigmoid(positive_score).squeeze(dim=1)
+
+        if self.args.uni_weight:
+            positive_sample_loss = -positive_score.mean()
+            negative_sample_loss = -negative_score.mean()
+        else:
+            positive_sample_loss = -(subsampling_weight * positive_score).sum() / subsampling_weight.sum()
+            negative_sample_loss = -(subsampling_weight * negative_score).sum() / subsampling_weight.sum()
+
+        loss = (positive_sample_loss + negative_sample_loss) / 2
+        if self.args.regularization != 0.0:
+            # Use L3 regularization for ComplEx and DistMult
+            regularization = self.args.regularization * (
+                self.model.entity_embedding.norm(p=3) ** 3 + self.model.relation_embedding.norm(p=3) ** 3
+            )
+            loss = loss + regularization
+
+        return loss
+
+    def test_step(self, subgraph):
+        print("Test Dataset:")
+        metrics = self.eval_step(subgraph)
+        return dict(
+            mrr=metrics["MRR"], mr=metrics["MR"], hits1=metrics["HITS@1"], hits3=metrics["HITS@3"], hits10=metrics["HITS@10"]
+        )
+
+    def val_step(self, subgraph):
+        print("Val Dataset:")
+        metrics = self.eval_step(subgraph)
+        return dict(
+            mrr=metrics["MRR"], mr=metrics["MR"], hits1=metrics["HITS@1"], hits3=metrics["HITS@3"], hits10=metrics["HITS@10"]
+        )
+
+    def eval_step(self, subgraph):
+        test_dataloader_head, test_dataloader_tail = subgraph
+        logs = []
+        step = 0
+        test_dataset_list = [test_dataloader_head, test_dataloader_tail]
+        total_steps = sum([len(dataset) for dataset in test_dataset_list])
+
+        for test_dataset in test_dataset_list:
+            for positive_sample, negative_sample, filter_bias, mode in test_dataset:
+                positive_sample = positive_sample.to(self.device)
+                negative_sample = negative_sample.to(self.device)
+                filter_bias = filter_bias.to(self.device)
+
+                batch_size = positive_sample.size(0)
+
+                score = self.model((positive_sample, negative_sample), mode)
+                score += filter_bias
+
+                # Explicitly sort all the entities to ensure that there is no test exposure bias
+                argsort = torch.argsort(score, dim=1, descending=True)
+
+                if mode == "head-batch":
+                    positive_arg = positive_sample[:, 0]
+                elif mode == "tail-batch":
+                    positive_arg = positive_sample[:, 2]
+                else:
+                    raise ValueError("mode %s not supported" % mode)
+
+                for i in range(batch_size):
+                    # Notice that argsort is not ranking
+                    ranking = (argsort[i, :] == positive_arg[i]).nonzero()
+                    assert ranking.size(0) == 1
+
+                    # ranking + 1 is the true ranking used in evaluation metrics
+                    ranking = 1 + ranking.item()
+                    logs.append(
+                        {
+                            "MRR": 1.0 / ranking,
+                            "MR": float(ranking),
+                            "HITS@1": 1.0 if ranking <= 1 else 0.0,
+                            "HITS@3": 1.0 if ranking <= 3 else 0.0,
+                            "HITS@10": 1.0 if ranking <= 10 else 0.0,
+                        }
+                    )
+
+                if step % 1000 == 0:
+                    print("Evaluating the model... (%d/%d)" % (step, total_steps))
+
+                step += 1
+
+        metrics = {}
+        for metric in logs[0].keys():
+            metrics[metric] = sum([log[metric] for log in logs]) / len(logs)
+        print("The Dataset metrics:", metrics)
+        return metrics
+
+    def setup_optimizer(self):
+        lr, weight_decay = self.optimizer_cfg["lr"], self.optimizer_cfg["weight_decay"]
+        return torch.optim.AdamW(self.parameters(), lr=lr, weight_decay=weight_decay)
+
+    def set_early_stopping(self):
+        return "mrr", ">"
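Reviewer aside on `train_step` (not part of the patch): with `negative_adversarial_sampling` enabled, the softmax weights from the RotatE paper are detached, so only the log-sigmoid terms receive gradient and hard negatives dominate the loss. A standalone illustration with made-up scores:

```python
# Standalone sketch of the self-adversarial negative-sampling weighting
# (made-up scores; temperature alpha assumed to be 1.0).
import torch
import torch.nn.functional as F

negative_score = torch.tensor([[5.0, -2.0, 0.5]])  # (batch, num_neg) raw scores
alpha = 1.0

# p(neg_j) ~ softmax(alpha * score_j); detached so the weights get no gradient
weights = F.softmax(negative_score * alpha, dim=1).detach()
weighted = (weights * F.logsigmoid(-negative_score)).sum(dim=1)
uniform = F.logsigmoid(-negative_score).mean(dim=1)

# the hard negative (score 5.0) dominates the weighted version,
# so `weighted` is much more negative than the uniform average
print(weighted, uniform)  # ~ -4.96 vs ~ -2.04
```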
diff --git a/examples/custom_triple_dataset.py b/examples/custom_triple_dataset.py
new file mode 100644
index 00000000..aedbd91d
--- /dev/null
+++ b/examples/custom_triple_dataset.py
@@ -0,0 +1,20 @@
+import os.path as osp
+
+from cogdl import experiment
+from cogdl.datasets.kg_data import KnowledgeGraphDataset
+
+
+# ./data/custom_dataset/raw needs "train2id.txt", "valid2id.txt", "test2id.txt"
+class CustomKGDataset(KnowledgeGraphDataset):
+    def __init__(self, data_path="./data"):
+        dataset = "custom_dataset"
+        path = osp.join(data_path, dataset)
+        super(CustomKGDataset, self).__init__(path, dataset)
+
+    def download(self):
+        pass
+
+
+if __name__ == "__main__":
+    dataset = CustomKGDataset()
+    experiment(dataset=dataset, model="transe", do_valid=False, do_test=True, epochs=500, eval_step=501)
diff --git a/examples/generate_emb.py b/examples/generate_emb.py
index 0c552636..991cbc24 100644
--- a/examples/generate_emb.py
+++ b/examples/generate_emb.py
@@ -17,6 +17,6 @@
 # build a pipeline for generating embeddings using unsupervised GNNs
 # pass model name and num_features with its hyper-parameters to this API
-generator = pipeline("generate-emb", model="mvgrl", no_test=True, num_features=8, hidden_size=4)
+generator = pipeline("generate-emb", model="mvgrl", return_model=True, num_features=8, hidden_size=4)
 outputs = generator(edge_index, x=np.random.randn(8, 8))
 print(outputs)
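With the registrations above, the new models also run end to end on the built-in KG datasets through the same `experiment` entry point. A hypothetical quick run (hyper-parameters illustrative, mirroring the example file):

```python
# Hypothetical quick run on a built-in dataset (not part of the patch).
from cogdl import experiment

experiment(dataset="fb15k", model="transe", epochs=500, eval_step=100, do_valid=False, do_test=True)
```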
"checkpoint": False, + "save_dir": ".", + "device_id": [0], + "actnn": False, + "do_test":False, + "do_valid":False , + "eval_step":3, +} + + +def get_default_args_kg(dataset, model, dw="triple_link_prediction_dw", mw="triple_link_prediction_mw"): + args = get_default_args(dataset=dataset, model=model, dw=dw, mw=mw) + for key, value in default_dict_kg.items(): + args.__setattr__(key, value) + return args + + +def test_transe_fb15k(): + args = get_default_args_kg(dataset="fb15k", model="transe") + ret = train(args) + #assert 0 <= ret["mrr"] <= 1 + + +def test_complex_fb15k(): + args = get_default_args_kg(dataset="fb15k", model="complex") + args.double_entity_embedding = True + args.double_relation_embedding=True + ret = train(args) + #assert 0 <= ret["mrr"] <= 1 + + +def test_distmult_wn18(): + args = get_default_args_kg(dataset="wn18", model="distmult") + ret = train(args) + #assert 0 <= ret["mrr"] <= 1 + +def test_rotate_wn18(): + args = get_default_args_kg(dataset="wn18", model="rotate") + args.double_entity_embedding = True + ret = train(args) + #assert 0 <= ret["mrr"] <= 1 + + +if __name__ == "__main__": + test_transe_fb15k() + test_complex_fb15k() + test_distmult_wn18() + test_rotate_wn18() + \ No newline at end of file diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py index 61433eec..ae6d434c 100644 --- a/tests/test_pipelines.py +++ b/tests/test_pipelines.py @@ -37,7 +37,7 @@ def test_gen_emb(): generator = pipeline( "generate-emb", model="mvgrl", - no_test=True, + return_model=True, num_features=8, hidden_size=10, sample_size=2,