
Commit

Merge pull request #22 from deel-ai/develop
Develop
thib-s authored May 26, 2021
2 parents 843a30c + c09ed37 commit 56e0006
Showing 12 changed files with 239 additions and 197 deletions.
4 changes: 2 additions & 2 deletions .readthedocs.yml
@@ -11,8 +11,8 @@ sphinx:
builder: html

# Optionally build your docs in additional formats such as PDF
formats:
- pdf
#formats:
# - pdf

# Optionally set the version of Python and requirements required to build your docs
python:
27 changes: 17 additions & 10 deletions README.md
@@ -29,7 +29,12 @@ In order to make things simple the following rules have been followed during dev

Here is an example showing how to build and train a 1-Lipschitz network:
```python
from deel.lip.layers import SpectralDense, SpectralConv2D, ScaledL2NormPooling2D
from deel.lip.layers import (
SpectralDense,
SpectralConv2D,
ScaledL2NormPooling2D,
FrobeniusDense,
)
from deel.lip.model import Sequential
from deel.lip.activations import GroupSort
from deel.lip.losses import HKR_multiclass_loss
@@ -40,7 +45,7 @@ from tensorflow.keras.utils import to_categorical
import numpy as np

# Sequential (resp Model) from deel.model has the same properties as any lipschitz model.
# It act only as a container, with features specific to lipschitz
# It act only as a container, with features specific to lipschitz
# functions (condensation, vanilla_exportation...)
model = Sequential(
[
@@ -52,28 +57,28 @@ model = Sequential(
filters=16,
kernel_size=(3, 3),
activation=GroupSort(2),
use_bias=False,
use_bias=True,
kernel_initializer="orthogonal",
),
# usual pooling layers are implemented (avg, max, ...), but new layers are also available
ScaledL2NormPooling2D(pool_size=(2, 2), data_format="channels_last"),
SpectralConv2D(
filters=32,
filters=16,
kernel_size=(3, 3),
activation=GroupSort(2),
use_bias=False,
use_bias=True,
kernel_initializer="orthogonal",
),
ScaledL2NormPooling2D(pool_size=(2, 2), data_format="channels_last"),
# our layers are fully interoperable with existing keras layers
Flatten(),
SpectralDense(
100,
32,
activation=GroupSort(2),
use_bias=False,
use_bias=True,
kernel_initializer="orthogonal",
),
SpectralDense(
FrobeniusDense(
10, activation=None, use_bias=False, kernel_initializer="orthogonal"
),
],
@@ -85,8 +90,10 @@ model = Sequential(

# HKR (hinge-Kantorovich-Rubinstein) optimizes robustness along with accuracy
model.compile(
loss=HKR_multiclass_loss(alpha=5.0, min_margin=0.5),
optimizer=Adam(lr=0.01),
# decreasing alpha and increasing min_margin improve robustness (at the cost of accuracy)
# note also that in the case of Lipschitz networks, more robustness requires more parameters.
loss=HKR_multiclass_loss(alpha=25, min_margin=0.25),
optimizer=Adam(lr=0.005),
metrics=["accuracy"],
)

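# A minimal training sketch (an assumption, not shown in this diff): the standard
# Keras MNIST loader and illustrative batch size / epoch count are used here;
# HKR_multiclass_loss expects one-hot encoded labels.
from tensorflow.keras.datasets import mnist

(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train = x_train.reshape((-1, 28, 28, 1)).astype("float32") / 255.0
x_test = x_test.reshape((-1, 28, 28, 1)).astype("float32") / 255.0
y_train = to_categorical(y_train, 10)  # labels must be one-hot encoded
y_test = to_categorical(y_test, 10)

model.fit(
    x_train,
    y_train,
    batch_size=256,
    epochs=5,
    validation_data=(x_test, y_test),
)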
9 changes: 6 additions & 3 deletions deel/lip/layers.py
@@ -588,7 +588,10 @@ def vanilla_export(self):
@_deel_export
class FrobeniusDense(Dense, LipschitzLayer, Condensable):
"""
Same a SpectralDense, but in the case of a single output.
Same as SpectralDense, but in the case of a single output. In the multiclass
setting, the behaviour of this layer is similar to a stack of 1-Lipschitz
layers (each output is 1-Lipschitz, but no orthogonality is enforced between
the outputs).
"""

def __init__(
@@ -632,7 +635,7 @@ def _compute_lip_coef(self, input_shape=None):
return 1.0

def call(self, x):
W_bar = self.kernel / tf.norm(self.kernel) * self._get_coef()
W_bar = self.kernel / tf.norm(self.kernel, axis=0) * self._get_coef()
kernel = self.kernel
self.kernel = W_bar
outputs = Dense.call(self, x)
@@ -647,7 +650,7 @@ def get_config(self):
return dict(list(base_config.items()) + list(config.items()))

def condense(self):
W_bar = self.kernel / tf.norm(self.kernel)
W_bar = self.kernel / tf.norm(self.kernel, axis=0)
self.kernel.assign(W_bar)

def vanilla_export(self):
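As a quick illustration of the behaviour described in the new docstring (a sketch with an arbitrary kernel shape, not taken from the library): dividing the kernel by its column-wise norm gives every column unit L2 norm, so each output neuron taken on its own is 1-Lipschitz, while nothing forces the columns to be orthogonal to each other.

import tensorflow as tf

kernel = tf.random.normal((64, 10))        # (input_dim, n_outputs), arbitrary example shape
w_bar = kernel / tf.norm(kernel, axis=0)   # column-wise normalization, as in call()/condense()
print(tf.norm(w_bar, axis=0).numpy())      # every column has norm ~1.0 => each output is 1-Lipschitz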
55 changes: 27 additions & 28 deletions deel/lip/losses.py
@@ -39,7 +39,9 @@ def KR_loss_fct(y_true, y_pred):
S0 = tf.equal(y_true, 1)
S1 = tf.not_equal(y_true, 1)
# compute the KR dual representation
return K.mean(tf.boolean_mask(y_pred, S0)) - K.mean(tf.boolean_mask(y_pred, S1))
return tf.reduce_mean(tf.boolean_mask(y_pred, S0)) - tf.reduce_mean(
tf.boolean_mask(y_pred, S1)
)

return KR_loss_fct
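The returned quantity is the Kantorovich-Rubinstein dual estimate, i.e. the mean output over positive samples minus the mean output over negative samples; a small numeric sketch with illustrative values:

import tensorflow as tf

y_true = tf.constant([1.0, 1.0, -1.0, -1.0])
y_pred = tf.constant([0.8, 0.6, -0.5, -0.9])
S0 = tf.equal(y_true, 1)
S1 = tf.not_equal(y_true, 1)
kr = tf.reduce_mean(tf.boolean_mask(y_pred, S0)) - tf.reduce_mean(tf.boolean_mask(y_pred, S1))
# kr = mean(0.8, 0.6) - mean(-0.5, -0.9) = 0.7 - (-0.7) = 1.4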

@@ -85,17 +87,17 @@ def HKR_loss(alpha, min_margin=1):
a function that computes the regularized Wasserstein loss
"""
KR = KR_loss()
hinge = hinge_margin_loss(min_margin)

@tf.function
def HKR_loss_fct(y_true, y_pred):
if alpha == np.inf: # if alpha infinite, use hinge only
return hinge_margin_loss(min_margin)(y_true, y_pred)
return hinge(y_true, y_pred)
else:
# true value: positive value should be the first to be coherent with the
# hinge loss (positive y_pred)
return alpha * hinge_margin_loss(min_margin)(y_true, y_pred) - KR_loss()(
y_true, y_pred
)
return alpha * hinge(y_true, y_pred) - KR(y_true, y_pred)

return HKR_loss_fct
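A short self-contained sketch of how the refactored closures compose (the numeric values are illustrative, not from the repository): the returned function evaluates alpha * hinge - KR on binary labels, which may be given in (1, -1) or (1, 0) form.

import tensorflow as tf
from deel.lip.losses import HKR_loss

loss_fn = HKR_loss(alpha=10.0, min_margin=1.0)
y_true = tf.constant([1.0, 1.0, -1.0, -1.0])
y_pred = tf.constant([0.8, 0.6, -0.5, -0.9])
print(loss_fn(y_true, y_pred).numpy())      # alpha * hinge(y_true, y_pred) - KR(y_true, y_pred)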

@@ -121,7 +123,7 @@ def hinge_margin_loss(min_margin=1):
@tf.function
def hinge_margin_fct(y_true, y_pred):
sign = K.sign(
y_true - eps
tf.cast(y_true, y_pred.dtype) - eps
) # subtracting a small eps makes the loss work for (1,0) and (1,-1) labels
hinge = K.maximum(0.0, min_margin - sign * y_pred)
return K.mean(hinge)
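A small sketch of why the cast and the eps are there (values are illustrative): casting lets integer-encoded labels be subtracted from the float eps, and subtracting eps sends a 0 label to sign -1, so the same expression handles both (1, 0) and (1, -1) label conventions.

import tensorflow as tf
from tensorflow.keras import backend as K

eps = 1e-7
y_pred = tf.constant([0.3, -0.2, 0.8, -0.9])
for y_true in (tf.constant([1, 0, 1, 0]),             # (1, 0) labels, integer dtype
               tf.constant([1.0, -1.0, 1.0, -1.0])):  # (1, -1) labels, float dtype
    sign = K.sign(tf.cast(y_true, y_pred.dtype) - eps)  # both conventions give [1, -1, 1, -1]
    hinge = K.maximum(0.0, 1.0 - sign * y_pred)
    print(K.mean(hinge).numpy())                       # same value (0.45) in both cases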
@@ -132,15 +134,13 @@ def hinge_margin_fct(y_true, y_pred):
@_deel_export
def KR_multiclass_loss():
r"""
Loss to estimate average of W1 distance using Kantorovich-Rubinstein
duality over outputs. In this multiclass setup thr KR term is computed
for each class and then averaged.
Note:
y_true has to be one hot encoded (labels being 1s and 0s ).
Loss to estimate the average W1 distance using Kantorovich-Rubinstein duality over
the outputs. Note y_true should be one-hot encoded (labels being 1s and 0s). In
this multiclass setup the KR term is computed for each class and then averaged.
Returns:
Callable, the function to compute Wasserstein multiclass loss.
#Note y_true has to be one hot encoded
"""

@@ -160,7 +160,7 @@ def KR_multiclass_loss_fct(y_true, y_pred):
- tf.reduce_sum(y_true, axis=0)
)
# compute the differences to have the KR term for each output neuron,
# and compute the average over the classes
# then compute the average over the classes
return tf.reduce_mean(-espNotYtrue + espYtrue)

return KR_multiclass_loss_fct
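A conceptual sketch of the quantity the docstring describes (not the exact implementation, part of which is collapsed above): for each class, the mean output over samples of that class minus the mean over the remaining samples, then averaged across classes.

import tensorflow as tf

y_true = tf.constant([[1.0, 0.0, 0.0],
                      [0.0, 1.0, 0.0],
                      [0.0, 0.0, 1.0],
                      [1.0, 0.0, 0.0]])                    # one-hot labels, 3 classes
y_pred = tf.random.normal((4, 3))

espYtrue = tf.reduce_sum(y_pred * y_true, axis=0) / tf.reduce_sum(y_true, axis=0)
espNotYtrue = tf.reduce_sum(y_pred * (1 - y_true), axis=0) / tf.reduce_sum(1 - y_true, axis=0)
kr = tf.reduce_mean(espYtrue - espNotYtrue)                 # one KR estimate per class, then averaged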
@@ -169,17 +169,15 @@ def KR_multiclass_loss_fct(y_true, y_pred):
@_deel_export
def Hinge_multiclass_loss(min_margin=1):
"""
Loss to estimate the Hinge loss in a multiclass setup. It compute the
elementwise hinge term. Note that this formulation differs from the
one commonly found in tensorflow/pytorch (with marximise the difference
between the two largest logits). This formulation is consistent with the
binary cassification loss used in a multiclass fashion.
Note:
y_true should be one hot encoded. labels in (1,0)
Loss to estimate the Hinge loss in a multiclass setup. It computes the elementwise
hinge term. Note that this formulation differs from the one commonly found in
tensorflow/pytorch (which maximises the difference between the two largest
logits). This formulation is consistent with the binary classification loss used
in a multiclass fashion. Note y_true should be one-hot encoded, with labels in (1, 0).
Returns:
Callable, the function to compute multiclass Hinge loss
#Note y_true has to be one hot encoded
"""

@@ -189,6 +187,8 @@ def Hinge_multiclass_loss_fct(y_true, y_pred):
sign = 2 * y_true - 1
# compute the elementwise hinge term
hinge = tf.nn.relu(min_margin - sign * y_pred)
# reweight positive elements
hinge = tf.where(sign > 0, hinge * (y_true.shape[-1] - 1), hinge)
return K.mean(hinge)

return Hinge_multiclass_loss_fct
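A short sketch of the reweighting added above (values are illustrative): with one-hot labels, each row has one positive entry and K - 1 negative ones, so scaling the positive hinge term by K - 1 gives both sides comparable weight in the elementwise mean.

import tensorflow as tf

min_margin = 1.0
y_true = tf.constant([[1.0, 0.0, 0.0, 0.0]])                 # K = 4 classes, one positive entry per row
y_pred = tf.constant([[0.2, 0.1, -0.3, 0.4]])

sign = 2 * y_true - 1                                        # +1 on the true class, -1 elsewhere
hinge = tf.nn.relu(min_margin - sign * y_pred)               # elementwise hinge term
hinge = tf.where(sign > 0, hinge * (y_true.shape[-1] - 1), hinge)  # positive entry weighs as much as the 3 negatives
loss = tf.reduce_mean(hinge)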
@@ -197,19 +197,16 @@ def Hinge_multiclass_loss_fct(y_true, y_pred):
@_deel_export
def HKR_multiclass_loss(alpha=0.0, min_margin=1):
"""
The multiclass version of HKR. This is done by computing the HKR term
over each class and averaging the results.
The multiclass version of HKR. This is done by computing the HKR term over each
class and averaging the results.
Args:
alpha: regularization factor
min_margin: minimal margin ( see Hinge_multiclass_loss )
Note:
y_true has to be one hot encoded.
Returns:
Callable, the function to compute HKR loss
#Note y_true has to be one hot encoded
"""
hingeloss = Hinge_multiclass_loss(min_margin)
KRloss = KR_multiclass_loss()
@@ -221,7 +218,9 @@ def HKR_multiclass_loss_fct(y_true, y_pred):
elif alpha == 0.0: # alpha = 0 => KR only
return -KRloss(y_true, y_pred)
else:
return -KRloss(y_true, y_pred) / alpha + hingeloss(y_true, y_pred)
a = -KRloss(y_true, y_pred)
b = hingeloss(y_true, y_pred)
return a + alpha * b

return HKR_multiclass_loss_fct
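A small numeric check of the reparameterization above (illustrative numbers): the new -KR + alpha * hinge form is exactly alpha times the old -KR / alpha + hinge, so the balance between the two terms is unchanged while the loss scale now grows with alpha.

alpha, kr, hinge = 25.0, 1.4, 0.45          # illustrative values for the KR and hinge terms
old = -kr / alpha + hinge                   # previous formulation
new = -kr + alpha * hinge                   # current formulation
assert abs(new - alpha * old) < 1e-6        # identical up to the global factor alpha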
