Commit v0.2.6
- Add IFM and DIFM models
- Support running on multiple GPUs
shenweichen authored Apr 4, 2021
2 parents d18ea26 + ea6bc38 commit 8265c75
Showing 40 changed files with 451 additions and 115 deletions.
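The two headline changes are the new IFM/DIFM models and the `gpus` argument for data-parallel training. A minimal usage sketch follows; the feature columns and data are synthetic placeholders, and only the `IFM`/`DIFM` imports and the `device`/`gpus` arguments come from this release.

```python
import numpy as np
import torch
from deepctr_torch.inputs import SparseFeat, get_feature_names
from deepctr_torch.models import IFM, DIFM  # both new in v0.2.6

# Synthetic feature columns and data, for illustration only.
feature_columns = [SparseFeat('user_id', vocabulary_size=100, embedding_dim=8),
                   SparseFeat('item_id', vocabulary_size=200, embedding_dim=8)]
x = {name: np.random.randint(0, 100, size=1000)
     for name in get_feature_names(feature_columns)}
y = np.random.randint(0, 2, size=1000)

# Multi-GPU support: `gpus[0]` must match `device`, and the batch_size passed
# to fit() is the per-GPU batch size (fit multiplies it by len(gpus)).
multi_gpu = torch.cuda.is_available() and torch.cuda.device_count() > 1
device = 'cuda:0' if multi_gpu else 'cpu'
gpus = [0, 1] if multi_gpu else None

model = DIFM(feature_columns, feature_columns, task='binary',
             device=device, gpus=gpus)  # IFM takes the same arguments
model.compile('adam', 'binary_crossentropy', metrics=['auc'])
model.fit(x, y, batch_size=256, epochs=1, verbose=1)
```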
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
@@ -18,7 +18,7 @@ jobs:
strategy:
matrix:
python-version: [3.6,3.7]
torch-version: [1.1.0,1.2.0,1.3.0,1.4.0,1.5.0,1.6.0,1.7.0]
torch-version: [1.1.0,1.2.0,1.3.0,1.4.0,1.5.0,1.6.0,1.7.0,1.8.1]

# exclude:
# - python-version: 3.5
12 changes: 7 additions & 5 deletions README.md
@@ -38,7 +38,9 @@ Let's [**Get Started!**](https://deepctr-torch.readthedocs.io/en/latest/Quick-St
| AutoInt | [CIKM 2019][AutoInt: Automatic Feature Interaction Learning via Self-Attentive Neural Networks](https://arxiv.org/abs/1810.11921) |
| ONN | [arxiv 2019][Operation-aware Neural Networks for User Response Prediction](https://arxiv.org/pdf/1904.12579.pdf) |
| FiBiNET | [RecSys 2019][FiBiNET: Combining Feature Importance and Bilinear feature Interaction for Click-Through Rate Prediction](https://arxiv.org/pdf/1905.09433.pdf) |
| IFM | [IJCAI 2019][An Input-aware Factorization Machine for Sparse Prediction](https://www.ijcai.org/Proceedings/2019/0203.pdf) |
| DCN V2 | [arxiv 2020][DCN V2: Improved Deep & Cross Network and Practical Lessons for Web-scale Learning to Rank Systems](https://arxiv.org/abs/2008.13535) |
| DIFM | [IJCAI 2020][A Dual Input-aware Factorization Machine for CTR Prediction](https://www.ijcai.org/Proceedings/2020/0434.pdf) |


## Discussion Group & Related Projects
@@ -82,6 +84,11 @@ Let's [**Get Started!**](https://deepctr-torch.readthedocs.io/en/latest/Quick-St
​ <a href="https://github.com/shenweichen">Shen Weichen</a> ​
<p>Core Dev<br> Zhejiang University <br> <br> </p>​
</td>
<td>
​ <a href="https://github.com/zanshuxun"><img width="70" height="70" src="https://github.com/zanshuxun.png?s=40" alt="pic"></a><br>
​ <a href="https://github.com/zanshuxun">Zan Shuxun</a>
<p>Core Dev<br> Beijing University <br> of Posts and <br> Telecommunications</p>​
</td>
<td>
<a href="https://github.com/weberrr"><img width="70" height="70" src="https://github.com/weberrr.png?s=40" alt="pic"></a><br>
<a href="https://github.com/weberrr">Wang Ze</a> ​
@@ -92,11 +99,6 @@ Let's [**Get Started!**](https://deepctr-torch.readthedocs.io/en/latest/Quick-St
<a href="https://github.com/wutongzhang">Zhang Wutong</a>
<p>Core Dev<br> Beijing University <br> of Posts and <br> Telecommunications</p>​
</td>
<td>
​ <a href="https://github.com/zanshuxun"><img width="70" height="70" src="https://github.com/zanshuxun.png?s=40" alt="pic"></a><br>
​ <a href="https://github.com/zanshuxun">Zan Shuxun</a>
<p>Core Dev<br> Beijing University <br> of Posts and <br> Telecommunications</p>​
</td>
<td>
​ <a href="https://github.com/ZhangYuef"><img width="70" height="70" src="https://github.com/ZhangYuef.png?s=40" alt="pic"></a><br>
​ <a href="https://github.com/ZhangYuef">Zhang Yuefeng</a>
2 changes: 1 addition & 1 deletion deepctr_torch/__init__.py
@@ -2,5 +2,5 @@
from . import models
from .utils import check_version

__version__ = '0.2.5'
__version__ = '0.2.6'
check_version(__version__)
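A trivial runtime check of the bump above (assuming the package is installed from this commit):

```python
import deepctr_torch

print(deepctr_torch.__version__)  # expected: '0.2.6'
```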
2 changes: 1 addition & 1 deletion deepctr_torch/layers/activation.py
@@ -12,7 +12,7 @@ class Dice(nn.Module):
Output shape:
- Same shape as input.
References
- [Zhou G, Zhu X, Song C, et al. Deep interest network for click-through rate prediction[C]//Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining. ACM, 2018: 1059-1068.](https://arxiv.org/pdf/1706.06978.pdf)
- https://github.com/zhougr1993/DeepInterestNetwork, https://github.com/fanoping/DIN-pytorch
49 changes: 30 additions & 19 deletions deepctr_torch/layers/interaction.py
@@ -106,10 +106,11 @@ class BilinearInteraction(nn.Module):
Input shape
- A list of 3D tensor with shape: ``(batch_size,filed_size, embedding_size)``.
Output shape
- 3D tensor with shape: ``(batch_size,filed_size, embedding_size)``.
- 3D tensor with shape: ``(batch_size,filed_size*(filed_size-1)/2, embedding_size)``.
Arguments
- **filed_size** : Positive integer, number of feature groups.
- **str** : String, types of bilinear functions used in this layer.
- **embedding_size** : Positive integer, embedding size of sparse features.
- **bilinear_type** : String, types of bilinear functions used in this layer.
- **seed** : A Python integer to use as random seed.
References
- [FiBiNET: Combining Feature Importance and Bilinear feature Interaction for Click-Through Rate Prediction
@@ -125,7 +126,7 @@ def __init__(self, filed_size, embedding_size, bilinear_type="interaction", seed
self.bilinear = nn.Linear(
embedding_size, embedding_size, bias=False)
elif self.bilinear_type == "each":
for i in range(filed_size):
for _ in range(filed_size):
self.bilinear.append(
nn.Linear(embedding_size, embedding_size, bias=False))
elif self.bilinear_type == "interaction":
@@ -340,13 +341,14 @@ class InteractingLayer(nn.Module):
- [Song W, Shi C, Xiao Z, et al. AutoInt: Automatic Feature Interaction Learning via Self-Attentive Neural Networks[J]. arXiv preprint arXiv:1810.11921, 2018.](https://arxiv.org/abs/1810.11921)
"""

def __init__(self, in_features, att_embedding_size=8, head_num=2, use_res=True, seed=1024, device='cpu'):
def __init__(self, in_features, att_embedding_size=8, head_num=2, use_res=True, scaling=False, seed=1024, device='cpu'):
super(InteractingLayer, self).__init__()
if head_num <= 0:
raise ValueError('head_num must be a int > 0')
self.att_embedding_size = att_embedding_size
self.head_num = head_num
self.use_res = use_res
self.scaling = scaling
self.seed = seed

embedding_size = in_features
@@ -388,7 +390,8 @@ def forward(self, inputs):
values, self.att_embedding_size, dim=2))
inner_product = torch.einsum(
'bnik,bnjk->bnij', querys, keys) # head_num None F F

if self.scaling:
inner_product /= self.att_embedding_size ** 0.5
self.normalized_att_scores = F.softmax(
inner_product, dim=-1) # head_num None F F
result = torch.matmul(self.normalized_att_scores,
@@ -428,17 +431,20 @@ def __init__(self, in_features, layer_num=2, parameterization='vector', seed=102
self.parameterization = parameterization
if self.parameterization == 'vector':
# weight in DCN. (in_features, 1)
self.kernels = torch.nn.ParameterList(
[nn.Parameter(nn.init.xavier_normal_(torch.empty(in_features, 1))) for i in range(self.layer_num)])
self.kernels = nn.Parameter(torch.Tensor(self.layer_num, in_features, 1))
elif self.parameterization == 'matrix':
# weight matrix in DCN-M. (in_features, in_features)
self.kernels = torch.nn.ParameterList([nn.Parameter(nn.init.xavier_normal_(
torch.empty(in_features, in_features))) for i in range(self.layer_num)])
self.kernels = nn.Parameter(torch.Tensor(self.layer_num, in_features, in_features))
else: # error
raise ValueError("parameterization should be 'vector' or 'matrix'")

self.bias = torch.nn.ParameterList(
[nn.Parameter(nn.init.zeros_(torch.empty(in_features, 1))) for i in range(self.layer_num)])
self.bias = nn.Parameter(torch.Tensor(self.layer_num, in_features, 1))

for i in range(self.kernels.shape[0]):
nn.init.xavier_normal_(self.kernels[i])
for i in range(self.bias.shape[0]):
nn.init.zeros_(self.bias[i])

self.to(device)

def forward(self, inputs):
@@ -483,18 +489,23 @@ def __init__(self, in_features, low_rank=32, num_experts=4, layer_num=2, device=
self.num_experts = num_experts

# U: (in_features, low_rank)
self.U_list = torch.nn.ParameterList([nn.Parameter(nn.init.xavier_normal_(
torch.empty(num_experts, in_features, low_rank))) for i in range(self.layer_num)])
self.U_list = nn.Parameter(torch.Tensor(self.layer_num, num_experts, in_features, low_rank))
# V: (in_features, low_rank)
self.V_list = torch.nn.ParameterList([nn.Parameter(nn.init.xavier_normal_(
torch.empty(num_experts, in_features, low_rank))) for i in range(self.layer_num)])
self.V_list = nn.Parameter(torch.Tensor(self.layer_num, num_experts, in_features, low_rank))
# C: (low_rank, low_rank)
self.C_list = torch.nn.ParameterList([nn.Parameter(nn.init.xavier_normal_(
torch.empty(num_experts, low_rank, low_rank))) for i in range(self.layer_num)])
self.C_list = nn.Parameter(torch.Tensor(self.layer_num, num_experts, low_rank, low_rank))
self.gating = nn.ModuleList([nn.Linear(in_features, 1, bias=False) for i in range(self.num_experts)])

self.bias = torch.nn.ParameterList([nn.Parameter(nn.init.zeros_(
torch.empty(in_features, 1))) for i in range(self.layer_num)])
self.bias = nn.Parameter(torch.Tensor(self.layer_num, in_features, 1))

init_para_list = [self.U_list, self.V_list, self.C_list]
for i in range(len(init_para_list)):
for j in range(self.layer_num):
nn.init.xavier_normal_(init_para_list[i][j])

for i in range(len(self.bias)):
nn.init.zeros_(self.bias[i])

self.to(device)

def forward(self, inputs):
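The CrossNet and CrossNetMix changes above replace per-layer `nn.ParameterList`s with single stacked `nn.Parameter` tensors that are initialised slice by slice (plausibly to make the weights easier to replicate under the new nn.DataParallel path, though the diff does not say so). The following is a self-contained sketch of the same pattern; the shapes are illustrative and the loop is the standard DCN "vector" recurrence rather than a copy of the library's forward().

```python
import torch
import torch.nn as nn

layer_num, in_features, batch = 2, 16, 4

# One stacked parameter per weight group instead of a ParameterList,
# initialised layer by layer as in the diff above.
kernels = nn.Parameter(torch.Tensor(layer_num, in_features, 1))
bias = nn.Parameter(torch.Tensor(layer_num, in_features, 1))
for i in range(kernels.shape[0]):
    nn.init.xavier_normal_(kernels[i])
for i in range(bias.shape[0]):
    nn.init.zeros_(bias[i])

# kernels[l] is indexed the same way the old ParameterList entries were.
x_0 = torch.randn(batch, in_features, 1)
x_l = x_0
for l in range(layer_num):
    # DCN "vector" parameterization: x_{l+1} = x_0 (x_l^T w_l) + b_l + x_l
    xl_w = torch.tensordot(x_l, kernels[l], dims=([1], [0]))  # (batch, 1, 1)
    x_l = torch.matmul(x_0, xl_w) + bias[l] + x_l
print(x_l.shape)  # torch.Size([4, 16, 1])
```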
3 changes: 2 additions & 1 deletion deepctr_torch/layers/sequence.py
@@ -39,7 +39,7 @@ def _sequence_mask(self, lengths, maxlen=None, dtype=torch.bool):
# Returns a mask tensor representing the first N positions of each cell.
if maxlen is None:
maxlen = lengths.max()
row_vector = torch.arange(0, maxlen, 1).to(self.device)
row_vector = torch.arange(0, maxlen, 1).to(lengths.device)
matrix = torch.unsqueeze(lengths, dim=-1)
mask = row_vector < matrix

@@ -70,6 +70,7 @@ def forward(self, seq_value_len_list):
hist = torch.sum(hist, dim=1, keepdim=False)

if self.mode == 'mean':
self.eps = self.eps.to(user_behavior_length.device)
hist = torch.div(hist, user_behavior_length.type(torch.float32) + self.eps)

hist = torch.unsqueeze(hist, dim=1)
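Both sequence.py fixes serve the same goal: tensors created inside forward() must follow the device of the incoming batch rather than the module's configured device, which matters once nn.DataParallel replicas run on other GPUs. Below is a simplified, hypothetical stand-in for SequencePoolingLayer._sequence_mask that illustrates the idea on CPU.

```python
import torch

def sequence_mask(lengths, maxlen=None):
    # The row vector follows lengths.device, so each DataParallel replica
    # builds its mask on its own GPU.
    if maxlen is None:
        maxlen = lengths.max()
    row_vector = torch.arange(0, maxlen, 1).to(lengths.device)
    return row_vector < lengths.unsqueeze(-1)

print(sequence_mask(torch.tensor([1, 3, 2])))
# tensor([[ True, False, False],
#         [ True,  True,  True],
#         [ True,  True, False]])
```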
2 changes: 2 additions & 0 deletions deepctr_torch/models/__init__.py
@@ -2,6 +2,8 @@
from .deepfm import DeepFM
from .xdeepfm import xDeepFM
from .afm import AFM
from .difm import DIFM
from .ifm import IFM
from .autoint import AutoInt
from .dcn import DCN
from .dcnmix import DCNMix
5 changes: 3 additions & 2 deletions deepctr_torch/models/afm.py
@@ -27,16 +27,17 @@ class AFM(BaseModel):
:param seed: integer ,to use as random seed.
:param task: str, ``"binary"`` for binary logloss or ``"regression"`` for regression loss
:param device: str, ``"cpu"`` or ``"cuda:0"``
:param gpus: list of int or torch.device for multiple GPUs. If None, run on `device`. `gpus[0]` should be the same gpu as `device`.
:return: A PyTorch model instance.
"""

def __init__(self, linear_feature_columns, dnn_feature_columns, use_attention=True, attention_factor=8,
l2_reg_linear=1e-5, l2_reg_embedding=1e-5, l2_reg_att=1e-5, afm_dropout=0, init_std=0.0001, seed=1024,
task='binary', device='cpu'):
task='binary', device='cpu', gpus=None):
super(AFM, self).__init__(linear_feature_columns, dnn_feature_columns, l2_reg_linear=l2_reg_linear,
l2_reg_embedding=l2_reg_embedding, init_std=init_std, seed=seed, task=task,
device=device)
device=device, gpus=gpus)

self.use_attention = use_attention

7 changes: 4 additions & 3 deletions deepctr_torch/models/autoint.py
@@ -32,19 +32,20 @@ class AutoInt(BaseModel):
:param seed: integer ,to use as random seed.
:param task: str, ``"binary"`` for binary logloss or ``"regression"`` for regression loss
:param device: str, ``"cpu"`` or ``"cuda:0"``
:param gpus: list of int or torch.device for multiple GPUs. If None, run on `device`. `gpus[0]` should be the same gpu as `device`.
:return: A PyTorch model instance.
"""

def __init__(self, linear_feature_columns, dnn_feature_columns, att_layer_num=3, att_embedding_size=8, att_head_num=2,
att_res=True,
dnn_hidden_units=(256, 128), dnn_activation='relu',
l2_reg_dnn=0, l2_reg_embedding=1e-5, dnn_use_bn=False, dnn_dropout=0, init_std=0.0001, seed=1024,
task='binary', device='cpu'):
task='binary', device='cpu', gpus=None):

super(AutoInt, self).__init__(linear_feature_columns, dnn_feature_columns, l2_reg_linear=0,
l2_reg_embedding=l2_reg_embedding, init_std=init_std, seed=seed, task=task,
device=device)
device=device, gpus=gpus)

if len(dnn_hidden_units) <= 0 and att_layer_num <= 0:
raise ValueError("Either hidden_layer or att_layer_num must > 0")
52 changes: 31 additions & 21 deletions deepctr_torch/models/basemodel.py
@@ -59,7 +59,7 @@ def __init__(self, feature_columns, feature_index, init_std=0.0001, device='cpu'
device))
torch.nn.init.normal_(self.weight, mean=0, std=init_std)

def forward(self, X):
def forward(self, X, sparse_feat_refine_weight=None):

sparse_embedding_list = [self.embedding_dict[feat.embedding_name](
X[:, self.feature_index[feat.name][0]:self.feature_index[feat.name][1]].long()) for
@@ -73,34 +73,37 @@ def forward(self, X):

sparse_embedding_list += varlen_embedding_list

if len(sparse_embedding_list) > 0 and len(dense_value_list) > 0:
linear_sparse_logit = torch.sum(
torch.cat(sparse_embedding_list, dim=-1), dim=-1, keepdim=False)
linear_dense_logit = torch.cat(
linear_logit = torch.zeros([X.shape[0], 1]).to(sparse_embedding_list[0].device)
if len(sparse_embedding_list) > 0:
sparse_embedding_cat = torch.cat(sparse_embedding_list, dim=-1)
if sparse_feat_refine_weight is not None:
# w_{x,i}=m_{x,i} * w_i (in IFM and DIFM)
sparse_embedding_cat = sparse_embedding_cat * sparse_feat_refine_weight.unsqueeze(1)
sparse_feat_logit = torch.sum(sparse_embedding_cat, dim=-1, keepdim=False)
linear_logit += sparse_feat_logit
if len(dense_value_list) > 0:
dense_value_logit = torch.cat(
dense_value_list, dim=-1).matmul(self.weight)
linear_logit = linear_sparse_logit + linear_dense_logit
elif len(sparse_embedding_list) > 0:
linear_logit = torch.sum(
torch.cat(sparse_embedding_list, dim=-1), dim=-1, keepdim=False)
elif len(dense_value_list) > 0:
linear_logit = torch.cat(
dense_value_list, dim=-1).matmul(self.weight)
else:
linear_logit = torch.zeros([X.shape[0], 1])
linear_logit += dense_value_logit

return linear_logit


class BaseModel(nn.Module):
def __init__(self, linear_feature_columns, dnn_feature_columns, l2_reg_linear=1e-5, l2_reg_embedding=1e-5,
init_std=0.0001, seed=1024, task='binary', device='cpu'):
init_std=0.0001, seed=1024, task='binary', device='cpu', gpus=None):

super(BaseModel, self).__init__()
torch.manual_seed(seed)
self.dnn_feature_columns = dnn_feature_columns

self.reg_loss = torch.zeros((1,), device=device)
self.aux_loss = torch.zeros((1,), device=device)
self.device = device # device
self.device = device
self.gpus = gpus
if gpus and str(self.gpus[0]) not in self.device:
raise ValueError(
"`gpus[0]` should be the same gpu with `device`")

self.feature_index = build_input_features(
linear_feature_columns + dnn_feature_columns)
@@ -192,14 +195,21 @@ def fit(self, x=None, y=None, batch_size=None, epochs=1, verbose=1, initial_epoc
torch.from_numpy(y))
if batch_size is None:
batch_size = 256
train_loader = DataLoader(
dataset=train_tensor_data, shuffle=shuffle, batch_size=batch_size)

print(self.device, end="\n")
model = self.train()
loss_func = self.loss_func
optim = self.optim

if self.gpus:
print('parallel running on these gpus:', self.gpus)
model = torch.nn.DataParallel(model, device_ids=self.gpus)
batch_size *= len(self.gpus) # input `batch_size` is batch_size per gpu
else:
print(self.device)

train_loader = DataLoader(
dataset=train_tensor_data, shuffle=shuffle, batch_size=batch_size)

sample_num = len(train_tensor_data)
steps_per_epoch = (sample_num - 1) // batch_size + 1

@@ -224,7 +234,7 @@ def fit(self, x=None, y=None, batch_size=None, epochs=1, verbose=1, initial_epoc
train_result = {}
try:
with tqdm(enumerate(train_loader), disable=verbose != 1) as t:
for index, (x_train, y_train) in t:
for _, (x_train, y_train) in t:
x = x_train.to(self.device).float()
y = y_train.to(self.device).float()

@@ -323,7 +333,7 @@ def predict(self, x, batch_size=256):

pred_ans = []
with torch.no_grad():
for index, x_test in enumerate(test_loader):
for _, x_test in enumerate(test_loader):
x = x_test[0].to(self.device).float()

y_pred = model(x).cpu().data.numpy() # .squeeze()
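The `sparse_feat_refine_weight` hook added to `LinearModel.forward` is what IFM and DIFM use to reweight the linear part per sample: every input x gets a factor m_{x,i} per field, so the effective linear weight becomes w_{x,i} = m_{x,i} * w_i. A shape-only sketch follows; the tensor names and sizes are illustrative and not taken from the library.

```python
import torch

batch, field_num = 4, 3
# Linear "embeddings" are 1-dimensional weights per sparse field, concatenated
# along the last axis inside LinearModel.forward: shape (batch, 1, field_num).
sparse_embedding_cat = torch.randn(batch, 1, field_num)
# Per-sample, per-field refine factors m_{x,i} produced by the IFM/DIFM
# factor-estimating network: shape (batch, field_num).
m_x = torch.rand(batch, field_num)

refined = sparse_embedding_cat * m_x.unsqueeze(1)  # w_{x,i} = m_{x,i} * w_i
linear_logit = torch.sum(refined, dim=-1)          # (batch, 1), as before
print(linear_logit.shape)  # torch.Size([4, 1])
```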
5 changes: 3 additions & 2 deletions deepctr_torch/models/ccpm.py
@@ -34,18 +34,19 @@ class CCPM(BaseModel):
:param seed: integer ,to use as random seed.
:param task: str, ``"binary"`` for binary logloss or ``"regression"`` for regression loss
:param device: str, ``"cpu"`` or ``"cuda:0"``
:param gpus: list of int or torch.device for multiple GPUs. If None, run on `device`. `gpus[0]` should be the same gpu as `device`.
:return: A PyTorch model instance.
"""

def __init__(self, linear_feature_columns, dnn_feature_columns, conv_kernel_width=(6, 5),
conv_filters=(4, 4),
dnn_hidden_units=(256,), l2_reg_linear=1e-5, l2_reg_embedding=1e-5, l2_reg_dnn=0, dnn_dropout=0,
init_std=0.0001, seed=1024, task='binary', device='cpu', dnn_use_bn=False, dnn_activation='relu'):
init_std=0.0001, seed=1024, task='binary', device='cpu', dnn_use_bn=False, dnn_activation='relu', gpus=None):

super(CCPM, self).__init__(linear_feature_columns, dnn_feature_columns, l2_reg_linear=l2_reg_linear,
l2_reg_embedding=l2_reg_embedding, init_std=init_std, seed=seed, task=task,
device=device)
device=device, gpus=gpus)

if len(conv_kernel_width) != len(conv_filters):
raise ValueError(