Add quadratic layers and enhance ICNNs, update tutorial #477

Merged: 30 commits, Dec 20, 2023
Changes from 9 commits

Commits (30)
b2b7fa6
fix a bug when bias is False
nvesseron Nov 19, 2023
b7674b8
update the PosDefPotentials class
nvesseron Nov 21, 2023
3bdfbc0
update PosDefPotentials
nvesseron Nov 21, 2023
4724bf0
added icnn adjustments
lucaeyring Nov 21, 2023
cd5c573
neuraldual fix freezee weights
lucaeyring Nov 22, 2023
3b6bb61
Merge branch 'ott-jax:main' into fix_bug_quad_layer
nvesseron Nov 28, 2023
051b222
use relu by default as activation function and rectifier_fn
nvesseron Nov 30, 2023
b47cb66
updates
nvesseron Dec 1, 2023
c86e135
solved conflicts
nvesseron Dec 1, 2023
73e4599
Update neural layers
michalk8 Dec 19, 2023
deda6a2
Clean ICNN impl.
michalk8 Dec 19, 2023
ada8983
Revert changes in the potentials
michalk8 Dec 19, 2023
7aa580f
Fix D102
michalk8 Dec 19, 2023
9afdd4f
Fix indentation
michalk8 Dec 19, 2023
a36a014
Remove `;`
michalk8 Dec 19, 2023
6b5f73b
Use tensordot
michalk8 Dec 19, 2023
ad84878
Update docs
michalk8 Dec 19, 2023
1f9d886
First rounds of test fixing
michalk8 Dec 19, 2023
95844b6
Fix rest of the tests
michalk8 Dec 19, 2023
f971859
Revert assertion
michalk8 Dec 19, 2023
092906c
Polish more docs
michalk8 Dec 19, 2023
9e8fe14
Fix docs linter
michalk8 Dec 19, 2023
e05d54d
Fix links in neuraldual notebook
michalk8 Dec 19, 2023
b9650a9
Fix links in the rest of the neural docs
michalk8 Dec 19, 2023
a5febbb
Update docs
michalk8 Dec 19, 2023
c453cb9
Allow ranks to be a tuple
michalk8 Dec 19, 2023
b702b27
Remvoe note
michalk8 Dec 19, 2023
353d0b4
Fix MetaMLP
michalk8 Dec 20, 2023
94df50f
Rerun neural notebooks
michalk8 Dec 20, 2023
20bfa12
Fix rendering
michalk8 Dec 20, 2023
73 changes: 37 additions & 36 deletions docs/tutorials/neural_dual.ipynb

Large diffs are not rendered by default.

118 changes: 69 additions & 49 deletions src/ott/neural/layers.py
@@ -29,20 +29,17 @@ class PositiveDense(nn.Module):
"""A linear transformation using a weight matrix with all entries positive.

Args:
dim_hidden: the number of output dim_hidden.
rectifier_fn: choice of rectifier function (default: softplus function).
inv_rectifier_fn: choice of inverse rectifier function
(default: inverse softplus function).
dtype: the dtype of the computation (default: float32).
precision: numerical precision of computation see `jax.lax.Precision`
for details.
kernel_init: initializer function for the weight matrix.
bias_init: initializer function for the bias.
dim_hidden: the number of output dim_hidden.
rectifier_fn: choice of rectifier function (default: softplus function).
dtype: the dtype of the computation (default: float32).
precision: numerical precision of computation see `jax.lax.Precision`
for details.
kernel_init: initializer function for the weight matrix.
bias_init: initializer function for the bias.
"""

dim_hidden: int
rectifier_fn: Callable[[jnp.ndarray], jnp.ndarray] = nn.softplus
inv_rectifier_fn: Callable[[jnp.ndarray],
jnp.ndarray] = lambda x: jnp.log(jnp.exp(x) - 1)
rectifier_fn: Callable[[jnp.ndarray], jnp.ndarray] = nn.relu
use_bias: bool = True
dtype: Any = jnp.float32
precision: Any = None
@@ -54,10 +51,10 @@ def __call__(self, inputs: jnp.ndarray) -> jnp.ndarray:
"""Applies a linear transformation to inputs along the last dimension.

Args:
inputs: Array to be transformed.
inputs: Array to be transformed.

Returns:
The transformed input.
The transformed input.
"""
kernel_init = nn.initializers.lecun_normal(
) if self.kernel_init is None else self.kernel_init
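The change above removes the inverse-rectifier parametrization and makes ReLU the default rectifier_fn. For reference, below is a minimal flax sketch of a dense layer that keeps its effective weights nonnegative by rectifying the kernel at call time; the class name, initializers, and forward pass are illustrative assumptions, not the library's actual implementation.

import jax
import jax.numpy as jnp
import flax.linen as nn
from typing import Callable

class PositiveDenseSketch(nn.Module):
  """Dense layer with nonnegative effective weights (illustrative sketch)."""
  dim_hidden: int
  rectifier_fn: Callable[[jnp.ndarray], jnp.ndarray] = nn.relu
  use_bias: bool = True

  @nn.compact
  def __call__(self, x: jnp.ndarray) -> jnp.ndarray:
    kernel = self.param(
        "kernel", nn.initializers.lecun_normal(),
        (x.shape[-1], self.dim_hidden)
    )
    # Rectify the kernel so every weight actually applied is >= 0.
    y = x @ self.rectifier_fn(kernel)
    if self.use_bias:
      bias = self.param("bias", nn.initializers.zeros, (self.dim_hidden,))
      y = y + bias
    return y

# Nonnegative weights on the hidden path are what let ICNNs stay convex in
# their input when combined with convex, nondecreasing activations.
layer = PositiveDenseSketch(dim_hidden=8)
params = layer.init(jax.random.PRNGKey(0), jnp.ones((4, 3)))
out = layer.apply(params, jnp.ones((4, 3)))  # shape (4, 8)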
@@ -81,59 +78,82 @@ def __call__(self, inputs: jnp.ndarray) -> jnp.ndarray:


class PosDefPotentials(nn.Module):
r"""A layer to output :math:`\frac{1}{2} ||A_i^T (x - b_i)||^2_i` potentials.
"""A layer to output 0.5 x^T(A_i A_i^T + Diag(d_i^2))x + b_i^T x + c_i potentials.

Args:
use_bias: whether to add a bias to the output.
dtype: the dtype of the computation.
precision: numerical precision of computation see `jax.lax.Precision`
for details.
kernel_init: initializer function for the weight matrix.
bias_init: initializer function for the bias.
"""
dim_data: int
num_potentials: the dimension of the output
rank: the rank of the matrix used for the quadratic layer
use_linear: whether to add a linear layer to the output
use_bias: whether to add a bias to the output.
dtype: the dtype of the computation.
precision: numerical precision of computation see `jax.lax.Precision` for details.
kernel_quadratic_init: initializer function for the weight matrix of the quadratic layer.
kernel_linear_init: initializer function for the weight matrix of the linear layer.
bias_init: initializer function for the bias.
""" # noqa: E501

num_potentials: int
rank: int = 0
use_linear: bool = True
use_bias: bool = True
dtype: Any = jnp.float32
precision: Any = None
kernel_init: Optional[Callable[[PRNGKey, Shape, Dtype], Array]] = None
bias_init: Callable[[PRNGKey, Shape, Dtype], Array] = nn.initializers.zeros
kernel_quadratic_init: Callable[[PRNGKey, Shape, Dtype],
Array] = nn.initializers.lecun_normal()
kernel_diagonal_init: Callable[[PRNGKey, Shape, Dtype],
Array] = nn.initializers.ones
kernel_linear_init: Callable[[PRNGKey, Shape, Dtype],
Array] = nn.initializers.lecun_normal()
bias_init: Callable[[PRNGKey, Shape, Dtype],
Array] = nn.initializers.lecun_normal()

@nn.compact
def __call__(self, inputs: jnp.ndarray) -> jnp.ndarray:
"""Apply a few quadratic forms.

Args:
inputs: Array to be transformed (possibly batched).
inputs: Array to be transformed (possibly batched).

Returns:
The transformed input.
The transformed input.
"""
kernel_init = nn.initializers.lecun_normal(
) if self.kernel_init is None else self.kernel_init
dim_data = inputs.shape[-1]
inputs = jnp.asarray(inputs, self.dtype)
kernel = self.param(
"kernel", kernel_init,
(self.num_potentials, inputs.shape[-1], inputs.shape[-1])
)
inputs = inputs.reshape((-1, dim_data))

if self.use_bias:
bias = self.param(
"bias", self.bias_init, (self.num_potentials, self.dim_data)
diag_kernel = self.param(
"diag_kernel", self.kernel_diagonal_init,
(1, dim_data, self.num_potentials)
)
# ensures the diag_kernel parameter stays non-negative
diag_kernel = nn.activation.relu(diag_kernel)
y = 0.5 * jnp.sum(jnp.multiply(inputs[..., None], diag_kernel) ** 2, axis=1)

if self.rank > 0:
quadratic_kernel = self.param(
"quad_kernel", self.kernel_quadratic_init,
(self.num_potentials, dim_data, self.rank)
)
bias = jnp.asarray(bias, self.dtype)

y = inputs.reshape((-1, inputs.shape[-1])) if inputs.ndim == 1 else inputs
y = y[..., None] - bias.T[None, ...]
y = jax.lax.dot_general(
y, kernel, (((1,), (1,)), ((2,), (0,))), precision=self.precision
y += jnp.sum(
0.5 * jnp.tensordot(
inputs,
quadratic_kernel,
axes=((inputs.ndim - 1,), (1,)),
precision=self.precision
) ** 2,
axis=2,
)
else:
y = jax.lax.dot_general(
inputs,
kernel, (((inputs.ndim - 1,), (0,)), ((), ())),
precision=self.precision

if self.use_linear:
linear_kernel = self.param(
"lin_kernel", self.kernel_linear_init,
(dim_data, self.num_potentials)
)
y = y + jnp.dot(inputs, linear_kernel, precision=self.precision)

y = 0.5 * y * y
return jnp.sum(y.reshape((-1, self.num_potentials, self.dim_data)), axis=2)
if self.use_bias:
bias = self.param("bias", self.bias_init, (1, self.num_potentials))
bias = jnp.asarray(bias, self.dtype)
y = y + bias

return y
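Put together, the reworked PosDefPotentials evaluates, for each potential i, 0.5 x^T (A_i A_i^T + Diag(d_i^2)) x + b_i^T x + c_i: a rank-constrained factor A_i plus a squared diagonal keep the quadratic part positive semi-definite by construction. Below is a minimal pure-JAX sketch of that computation, checked against the closed form; the parameter names (d, A, b, c) and shapes are illustrative, and the ReLU the layer applies to its diagonal factor is omitted.

import jax
import jax.numpy as jnp

def pos_def_potentials(x, d, A, b, c):
  """x: (batch, dim), d: (dim, p), A: (p, dim, rank), b: (dim, p), c: (p,)."""
  # 0.5 * x^T Diag(d_i^2) x for every potential i.
  diag_term = 0.5 * jnp.sum((x[..., None] * d[None, ...]) ** 2, axis=1)
  # A_i^T x for every potential i, shape (batch, p, rank).
  low_rank = jnp.tensordot(x, A, axes=((1,), (1,)))
  # 0.5 * ||A_i^T x||^2 = 0.5 * x^T A_i A_i^T x.
  quad_term = 0.5 * jnp.sum(low_rank ** 2, axis=2)
  linear_term = x @ b
  return diag_term + quad_term + linear_term + c[None, :]

# Sanity check against the closed form for potential i = 0.
keys = jax.random.split(jax.random.PRNGKey(0), 5)
batch, dim, p, rank = 4, 3, 2, 2
x = jax.random.normal(keys[0], (batch, dim))
d = jax.random.normal(keys[1], (dim, p))
A = jax.random.normal(keys[2], (p, dim, rank))
b = jax.random.normal(keys[3], (dim, p))
c = jax.random.normal(keys[4], (p,))

M0 = A[0] @ A[0].T + jnp.diag(d[:, 0] ** 2)  # positive semi-definite
ref = 0.5 * jnp.einsum("bi,ij,bj->b", x, M0, x) + x @ b[:, 0] + c[0]
assert jnp.allclose(pos_def_potentials(x, d, A, b, c)[:, 0], ref, atol=1e-4)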