
Commit

Merge pull request #22 from deel-ai/develop
Develop
thib-s authored May 26, 2021
2 parents 843a30c + c09ed37 commit 56e0006
Showing 12 changed files with 239 additions and 197 deletions.
4 changes: 2 additions & 2 deletions .readthedocs.yml
@@ -11,8 +11,8 @@ sphinx:
builder: html

# Optionally build your docs in additional formats such as PDF
formats:
- pdf
#formats:
# - pdf

# Optionally set the version of Python and requirements required to build your docs
python:
27 changes: 17 additions & 10 deletions README.md
@@ -29,7 +29,12 @@ In order to make things simple the following rules have been followed during dev

Here is an example showing how to build and train a 1-Lipschitz network:
```python
from deel.lip.layers import SpectralDense, SpectralConv2D, ScaledL2NormPooling2D
from deel.lip.layers import (
SpectralDense,
SpectralConv2D,
ScaledL2NormPooling2D,
FrobeniusDense,
)
from deel.lip.model import Sequential
from deel.lip.activations import GroupSort
from deel.lip.losses import HKR_multiclass_loss
@@ -40,7 +45,7 @@ from tensorflow.keras.utils import to_categorical
import numpy as np

# Sequential (resp Model) from deel.model has the same properties as any lipschitz model.
# It act only as a container, with features specific to lipschitz
# It act only as a container, with features specific to lipschitz
# functions (condensation, vanilla_exportation...)
model = Sequential(
[
@@ -52,28 +57,28 @@ model = Sequential(
filters=16,
kernel_size=(3, 3),
activation=GroupSort(2),
use_bias=False,
use_bias=True,
kernel_initializer="orthogonal",
),
# usual pooling layers are implemented (avg, max, ...), but new layers are also available
ScaledL2NormPooling2D(pool_size=(2, 2), data_format="channels_last"),
SpectralConv2D(
filters=32,
filters=16,
kernel_size=(3, 3),
activation=GroupSort(2),
use_bias=False,
use_bias=True,
kernel_initializer="orthogonal",
),
ScaledL2NormPooling2D(pool_size=(2, 2), data_format="channels_last"),
# our layers are fully interoperable with existing keras layers
Flatten(),
SpectralDense(
100,
32,
activation=GroupSort(2),
use_bias=False,
use_bias=True,
kernel_initializer="orthogonal",
),
SpectralDense(
FrobeniusDense(
10, activation=None, use_bias=False, kernel_initializer="orthogonal"
),
],
@@ -85,8 +90,10 @@ model = Sequential(

# HKR (hinge-Kantorovich-Rubinstein) optimizes robustness along with accuracy
model.compile(
loss=HKR_multiclass_loss(alpha=5.0, min_margin=0.5),
optimizer=Adam(lr=0.01),
# decreasing alpha and increasing min_margin improve robustness (at the cost of accuracy)
# note also that in the case of Lipschitz networks, more robustness requires more parameters.
loss=HKR_multiclass_loss(alpha=25, min_margin=0.25),
optimizer=Adam(lr=0.005),
metrics=["accuracy"],
)

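# A minimal training sketch (an assumption, not shown in this diff): the standard
# Keras MNIST loader and illustrative batch size / epoch count are used here;
# HKR_multiclass_loss expects one-hot encoded labels.
from tensorflow.keras.datasets import mnist

(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train = x_train.reshape((-1, 28, 28, 1)).astype("float32") / 255.0
x_test = x_test.reshape((-1, 28, 28, 1)).astype("float32") / 255.0
y_train = to_categorical(y_train, 10)  # labels must be one-hot encoded
y_test = to_categorical(y_test, 10)

model.fit(
    x_train,
    y_train,
    batch_size=256,
    epochs=5,
    validation_data=(x_test, y_test),
)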
9 changes: 6 additions & 3 deletions deel/lip/layers.py
@@ -588,7 +588,10 @@ def vanilla_export(self):
@_deel_export
class FrobeniusDense(Dense, LipschitzLayer, Condensable):
"""
Same a SpectralDense, but in the case of a single output.
Same as SpectralDense, but in the case of a single output. In the multiclass
setting, the behaviour of this layer is similar to a stack of 1-Lipschitz
layers (each output is 1-Lipschitz, but no orthogonality is enforced between
the outputs).
"""

def __init__(
@@ -632,7 +635,7 @@ def _compute_lip_coef(self, input_shape=None):
return 1.0

def call(self, x):
W_bar = self.kernel / tf.norm(self.kernel) * self._get_coef()
W_bar = self.kernel / tf.norm(self.kernel, axis=0) * self._get_coef()
kernel = self.kernel
self.kernel = W_bar
outputs = Dense.call(self, x)
@@ -647,7 +650,7 @@ def get_config(self):
return dict(list(base_config.items()) + list(config.items()))

def condense(self):
W_bar = self.kernel / tf.norm(self.kernel)
W_bar = self.kernel / tf.norm(self.kernel, axis=0)
self.kernel.assign(W_bar)

def vanilla_export(self):
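As a quick illustration of the behaviour described in the new docstring (a sketch with an arbitrary kernel shape, not taken from the library): dividing the kernel by its column-wise norm gives every column unit L2 norm, so each output neuron taken on its own is 1-Lipschitz, while nothing forces the columns to be orthogonal to each other.

import tensorflow as tf

kernel = tf.random.normal((64, 10))        # (input_dim, n_outputs), arbitrary example shape
w_bar = kernel / tf.norm(kernel, axis=0)   # column-wise normalization, as in call()/condense()
print(tf.norm(w_bar, axis=0).numpy())      # every column has norm ~1.0 => each output is 1-Lipschitz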
55 changes: 27 additions & 28 deletions deel/lip/losses.py
@@ -39,7 +39,9 @@ def KR_loss_fct(y_true, y_pred):
S0 = tf.equal(y_true, 1)
S1 = tf.not_equal(y_true, 1)
# compute the KR dual representation
return K.mean(tf.boolean_mask(y_pred, S0)) - K.mean(tf.boolean_mask(y_pred, S1))
return tf.reduce_mean(tf.boolean_mask(y_pred, S0)) - tf.reduce_mean(
tf.boolean_mask(y_pred, S1)
)

return KR_loss_fct
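The returned quantity is the Kantorovich-Rubinstein dual estimate, i.e. the mean output over positive samples minus the mean output over negative samples; a small numeric sketch with illustrative values:

import tensorflow as tf

y_true = tf.constant([1.0, 1.0, -1.0, -1.0])
y_pred = tf.constant([0.8, 0.6, -0.5, -0.9])
S0 = tf.equal(y_true, 1)
S1 = tf.not_equal(y_true, 1)
kr = tf.reduce_mean(tf.boolean_mask(y_pred, S0)) - tf.reduce_mean(tf.boolean_mask(y_pred, S1))
# kr = mean(0.8, 0.6) - mean(-0.5, -0.9) = 0.7 - (-0.7) = 1.4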

@@ -85,17 +87,17 @@ def HKR_loss(alpha, min_margin=1):
a function that computes the regularized Wasserstein loss
"""
KR = KR_loss()
hinge = hinge_margin_loss(min_margin)

@tf.function
def HKR_loss_fct(y_true, y_pred):
if alpha == np.inf: # if alpha infinite, use hinge only
return hinge_margin_loss(min_margin)(y_true, y_pred)
return hinge(y_true, y_pred)
else:
# true value: positive value should be the first to be coherent with the
# hinge loss (positive y_pred)
return alpha * hinge_margin_loss(min_margin)(y_true, y_pred) - KR_loss()(
y_true, y_pred
)
return alpha * hinge(y_true, y_pred) - KR(y_true, y_pred)

return HKR_loss_fct
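A short self-contained sketch of how the refactored closures compose (the numeric values are illustrative, not from the repository): the returned function evaluates alpha * hinge - KR on binary labels, which may be given in (1, -1) or (1, 0) form.

import tensorflow as tf
from deel.lip.losses import HKR_loss

loss_fn = HKR_loss(alpha=10.0, min_margin=1.0)
y_true = tf.constant([1.0, 1.0, -1.0, -1.0])
y_pred = tf.constant([0.8, 0.6, -0.5, -0.9])
print(loss_fn(y_true, y_pred).numpy())      # alpha * hinge(y_true, y_pred) - KR(y_true, y_pred)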

@@ -121,7 +123,7 @@ def hinge_margin_loss(min_margin=1):
@tf.function
def hinge_margin_fct(y_true, y_pred):
sign = K.sign(
y_true - eps
tf.cast(y_true, y_pred.dtype) - eps
) # subtracting a small eps makes the loss work for (1,0) and (1,-1) labels
hinge = K.maximum(0.0, min_margin - sign * y_pred)
return K.mean(hinge)
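A small sketch of why the cast and the eps are there (values are illustrative): casting lets integer-encoded labels be subtracted from the float eps, and subtracting eps sends a 0 label to sign -1, so the same expression handles both (1, 0) and (1, -1) label conventions.

import tensorflow as tf
from tensorflow.keras import backend as K

eps = 1e-7
y_pred = tf.constant([0.3, -0.2, 0.8, -0.9])
for y_true in (tf.constant([1, 0, 1, 0]),             # (1, 0) labels, integer dtype
               tf.constant([1.0, -1.0, 1.0, -1.0])):  # (1, -1) labels, float dtype
    sign = K.sign(tf.cast(y_true, y_pred.dtype) - eps)  # both conventions give [1, -1, 1, -1]
    hinge = K.maximum(0.0, 1.0 - sign * y_pred)
    print(K.mean(hinge).numpy())                       # same value (0.45) in both cases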
@@ -132,15 +134,13 @@ def hinge_margin_fct(y_true, y_pred):
@_deel_export
def KR_multiclass_loss():
r"""
Loss to estimate average of W1 distance using Kantorovich-Rubinstein
duality over outputs. In this multiclass setup thr KR term is computed
for each class and then averaged.
Note:
y_true has to be one hot encoded (labels being 1s and 0s ).
Loss to estimate the average W1 distance using Kantorovich-Rubinstein duality over
the outputs. Note y_true should be one-hot encoded (labels being 1s and 0s). In
this multiclass setup the KR term is computed for each class and then averaged.
Returns:
Callable, the function to compute Wasserstein multiclass loss.
#Note y_true has to be one hot encoded
"""

@@ -160,7 +160,7 @@ def KR_multiclass_loss_fct(y_true, y_pred):
- tf.reduce_sum(y_true, axis=0)
)
# compute the differences to have the KR term for each output neuron,
# and compute the average over the classes
# then compute the average over the classes
return tf.reduce_mean(-espNotYtrue + espYtrue)

return KR_multiclass_loss_fct
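A conceptual sketch of the quantity the docstring describes (not the exact implementation, part of which is collapsed above): for each class, the mean output over samples of that class minus the mean over the remaining samples, then averaged across classes.

import tensorflow as tf

y_true = tf.constant([[1.0, 0.0, 0.0],
                      [0.0, 1.0, 0.0],
                      [0.0, 0.0, 1.0],
                      [1.0, 0.0, 0.0]])                    # one-hot labels, 3 classes
y_pred = tf.random.normal((4, 3))

espYtrue = tf.reduce_sum(y_pred * y_true, axis=0) / tf.reduce_sum(y_true, axis=0)
espNotYtrue = tf.reduce_sum(y_pred * (1 - y_true), axis=0) / tf.reduce_sum(1 - y_true, axis=0)
kr = tf.reduce_mean(espYtrue - espNotYtrue)                 # one KR estimate per class, then averaged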
@@ -169,17 +169,15 @@ def KR_multiclass_loss_fct(y_true, y_pred):
@_deel_export
def Hinge_multiclass_loss(min_margin=1):
"""
Loss to estimate the Hinge loss in a multiclass setup. It compute the
elementwise hinge term. Note that this formulation differs from the
one commonly found in tensorflow/pytorch (with marximise the difference
between the two largest logits). This formulation is consistent with the
binary cassification loss used in a multiclass fashion.
Note:
y_true should be one hot encoded. labels in (1,0)
Loss to estimate the Hinge loss in a multiclass setup. It computes the elementwise
hinge term. Note that this formulation differs from the one commonly found in
tensorflow/pytorch (which maximises the difference between the two largest
logits). This formulation is consistent with the binary classification loss used
in a multiclass fashion. Note y_true should be one-hot encoded, with labels in (1, 0).
Returns:
Callable, the function to compute multiclass Hinge loss
#Note y_true has to be one hot encoded
"""

@@ -189,6 +187,8 @@ def Hinge_multiclass_loss_fct(y_true, y_pred):
sign = 2 * y_true - 1
# compute the elementwise hinge term
hinge = tf.nn.relu(min_margin - sign * y_pred)
# reweight positive elements
hinge = tf.where(sign > 0, hinge * (y_true.shape[-1] - 1), hinge)
return K.mean(hinge)

return Hinge_multiclass_loss_fct
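A short sketch of the reweighting added above (values are illustrative): with one-hot labels, each row has one positive entry and K - 1 negative ones, so scaling the positive hinge term by K - 1 gives both sides comparable weight in the elementwise mean.

import tensorflow as tf

min_margin = 1.0
y_true = tf.constant([[1.0, 0.0, 0.0, 0.0]])                 # K = 4 classes, one positive entry per row
y_pred = tf.constant([[0.2, 0.1, -0.3, 0.4]])

sign = 2 * y_true - 1                                        # +1 on the true class, -1 elsewhere
hinge = tf.nn.relu(min_margin - sign * y_pred)               # elementwise hinge term
hinge = tf.where(sign > 0, hinge * (y_true.shape[-1] - 1), hinge)  # positive entry weighs as much as the 3 negatives
loss = tf.reduce_mean(hinge)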
@@ -197,19 +197,16 @@ def Hinge_multiclass_loss_fct(y_true, y_pred):
@_deel_export
def HKR_multiclass_loss(alpha=0.0, min_margin=1):
"""
The multiclass version of HKR. This is done by computing the HKR term
over each class and averaging the results.
The multiclass version of HKR. This is done by computing the HKR term over each
class and averaging the results.
Args:
alpha: regularization factor
min_margin: minimal margin ( see Hinge_multiclass_loss )
Note:
y_true has to be one hot encoded.
Returns:
Callable, the function to compute HKR loss
#Note y_true has to be one hot encoded
"""
hingeloss = Hinge_multiclass_loss(min_margin)
KRloss = KR_multiclass_loss()
@@ -221,7 +218,9 @@ def HKR_multiclass_loss_fct(y_true, y_pred):
elif alpha == 0.0: # alpha = 0 => KR only
return -KRloss(y_true, y_pred)
else:
return -KRloss(y_true, y_pred) / alpha + hingeloss(y_true, y_pred)
a = -KRloss(y_true, y_pred)
b = hingeloss(y_true, y_pred)
return a + alpha * b

return HKR_multiclass_loss_fct
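A small numeric check of the reparameterization above (illustrative numbers): the new -KR + alpha * hinge form is exactly alpha times the old -KR / alpha + hinge, so the balance between the two terms is unchanged while the loss scale now grows with alpha.

alpha, kr, hinge = 25.0, 1.4, 0.45          # illustrative values for the KR and hinge terms
old = -kr / alpha + hinge                   # previous formulation
new = -kr + alpha * hinge                   # current formulation
assert abs(new - alpha * old) < 1e-6        # identical up to the global factor alpha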
