deel-ai · Agustin-Picard · Apr 10, 2024 · May 2, 2023 · May 2, 2023 · Jul 26, 2023
diff --git a/.bumpversion.cfg b/.bumpversion.cfg
@@ -1,18 +1,7 @@
 [bumpversion]
-current_version = 0.2.0
-parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(-(?P<prerelease>\d+))?
-serialize = 
-	{major}.{minor}.{patch}-{prerelease}
-	{major}.{minor}.{patch}
+current_version = 0.3.0
 commit = True
-tag = True
-
-[bumpversion:part:prerelease]
-optional_value = regular
-values = 
-	beta
-	alpha
-	regular
+tag = False
 
 [bumpversion:file:setup.py]
 

diff --git a/README.md b/README.md
@@ -37,9 +37,11 @@ We propose some hands-on tutorials to get familiar with the library and it's API
 - [**Benchmarking with Mislabeled sample detection**](https://colab.research.google.com/drive/1_5-RC_YBHptVCElBbjxWfWQ1LMU20vOp?usp=sharing) <sub> [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1_5-RC_YBHptVCElBbjxWfWQ1LMU20vOp?usp=sharing) </sub>
 - [**Using the first order influence calculator**](https://colab.research.google.com/drive/1WlYcQNu5obhVjhonN2QYi8ybKyZJl4iY?usp=sharing) <sub> [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1WlYcQNu5obhVjhonN2QYi8ybKyZJl4iY?usp=sharing) </sub>
 - [**Using the second order influence calculator**](https://colab.research.google.com/drive/1qNvKiU3-aZWhRA0rxS6X3ebeNkoznJJe?usp=sharing) <sub> [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1qNvKiU3-aZWhRA0rxS6X3ebeNkoznJJe?usp=sharing) </sub>
+- [**Using the Arnoldi influence calculator**](https://colab.research.google.com/drive/1rQU33sbD0YW1cZMRlJmS15EW5O16yoDE?usp=sharing) <sub> [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1rQU33sbD0YW1cZMRlJmS15EW5O16yoDE?usp=sharing) </sub>
 - [**Using TracIn**](https://colab.research.google.com/drive/1E94cGF46SUQXcCTNwQ4VGSjXEKm7g21c?usp=sharing) <sub> [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1E94cGF46SUQXcCTNwQ4VGSjXEKm7g21c?usp=sharing) </sub>
 - [**Using Representer Point Selection - L2 (RPS_L2)**](https://colab.research.google.com/drive/17W5s30LbxABbDd8hbdwYE56abyWjSC4u?usp=sharing) <sub> [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/17W5s30LbxABbDd8hbdwYE56abyWjSC4u?usp=sharing) </sub>
 - [**Using Representer Point Selection - Local Jacobian Expansion (RPS_LJE)**](https://colab.research.google.com/drive/14e7wwFRQJhY-huVYmJ7ri355kfLJgAPA?usp=sharing) <sub> [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/14e7wwFRQJhY-huVYmJ7ri355kfLJgAPA?usp=sharing) </sub>
+- [**Using Boundary-based Influence**](https://colab.research.google.com/drive/1785eHgT91FfqG1f25s7ovqd6JhP5uklh?usp=sharing) <sub> [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1785eHgT91FfqG1f25s7ovqd6JhP5uklh?usp=sharing) </sub>
 
 ## 🚀 Quick Start
 
@@ -63,7 +65,7 @@ from deel.influenciae.utils import ORDER
 
 # load the model, the training loss (without reduction) and the training data (with the labels and in a batched TF dataset)
 
-influence_model = InfluenceModel(model, target_layer, loss_function)
+influence_model = InfluenceModel(model, start_layer=target_layer, loss_function=loss_function)
 ihvp_calculator = ExactIHVP(influence_model, train_dataset)
 influence_calculator = FirstOrderInfluenceCalculator(influence_model, train_dataset, ihvp_calculator)
 data_and_influence_dataset = influence_calculator.compute_influence_values(train_dataset)
@@ -85,7 +87,7 @@ from deel.influenciae.utils import ORDER
 # load the model, the training loss (without reduction), the training data and
 # the data to explain (with the labels and in batched a TF dataset)
 
-influence_model = InfluenceModel(model, target_layer, loss_function)
+influence_model = InfluenceModel(model, start_layer=target_layer, loss_function=loss_function)
 ihvp_calculator = ExactIHVP(influence_model, train_dataset)
 influence_calculator = FirstOrderInfluenceCalculator(influence_model, train_dataset, ihvp_calculator)
 data_and_influence_dataset = influence_calculator.estimate_influence_values_in_batches(samples_to_explain, train_dataset)
@@ -108,7 +110,7 @@ from deel.influenciae.influence import SecondOrderInfluenceCalculator
 # load the model, the training loss (without reduction), the training data and
 # the data to explain (with the labels and in a batched TF dataset)
 
-influence_model = InfluenceModel(model, target_layer, loss_function)
+influence_model = InfluenceModel(model, start_layer=target_layer, loss_function=loss_function)
 ihvp_calculator = ExactIHVP(influence_model, train_dataset)
 influence_calculator = SecondOrderInfluenceCalculator(influence_model, train_dataset, ihvp_calculator)  # or FirstOrderInfluenceCalculator
 data_and_influence_dataset = influence_calculator.estimate_influence_values_group(groups_train, groups_to_explain)
@@ -123,7 +125,7 @@ from deel.influenciae.influence import SecondOrderInfluenceCalculator
 # load the model, the training loss (without reduction), the training data and
 # the data to explain (with the labels and in a batched TF dataset)
 
-influence_model = InfluenceModel(model, target_layer, loss_function)
+influence_model = InfluenceModel(model, start_layer=target_layer, loss_function=loss_function)
 ihvp_calculator = ExactIHVP(influence_model, train_dataset)
 influence_calculator = SecondOrderInfluenceCalculator(influence_model, train_dataset, ihvp_calculator)  # or FirstOrderInfluenceCalculator
 data_and_influence_dataset = influence_calculator.estimate_influence_values_group(groups_train)
@@ -139,11 +141,11 @@ All the influence calculation methods work on Tensorflow models trained for any
 | RelatIF                                                 | [Paper](https://arxiv.org/pdf/2003.11630.pdf)                                                      | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1WlYcQNu5obhVjhonN2QYi8ybKyZJl4iY?usp=sharing) |
 | Influence Functions  (first order, groups)              | [Paper](https://arxiv.org/abs/1905.13289)                                                          | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1WlYcQNu5obhVjhonN2QYi8ybKyZJl4iY?usp=sharing) |
 | Influence Functions  (second order, groups)             | [Paper](https://arxiv.org/abs/1911.00418)                                                          | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1qNvKiU3-aZWhRA0rxS6X3ebeNkoznJJe?usp=sharing) |
-| Arnoldi (Scaling Up Influence Functions)                | [Paper](https://arxiv.org/abs/2112.03052)                                                          |                                                                                 WIP                                                                                 |
+| Arnoldi iteration (Scaling Up Influence Functions)      | [Paper](https://arxiv.org/abs/2112.03052)                                                          | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1rQU33sbD0YW1cZMRlJmS15EW5O16yoDE?usp=sharing) |
+| Trac-In                                                 | [Paper](https://arxiv.org/abs/2002.08484)                                                          | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1E94cGF46SUQXcCTNwQ4VGSjXEKm7g21c?usp=sharing) |
 | Representer Point Selection  (L2)                       | [Paper](https://arxiv.org/abs/1811.09720)                                                          | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/17W5s30LbxABbDd8hbdwYE56abyWjSC4u?usp=sharing) |
 | Representer Point Selection  (Local Jacobian Expansion) | [Paper](https://proceedings.neurips.cc/paper/2021/file/c460dc0f18fc309ac07306a4a55d2fd6-Paper.pdf) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/14e7wwFRQJhY-huVYmJ7ri355kfLJgAPA?usp=sharing) |
-| Trac-In                                                 | [Paper](https://arxiv.org/abs/2002.08484)                                                          | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1E94cGF46SUQXcCTNwQ4VGSjXEKm7g21c?usp=sharing) |
-| Boundary-based influence                                | --                                                                                                 |                                                                                 WIP                                                                                 |
+| Boundary-based influence                                | --                                                                                                 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1785eHgT91FfqG1f25s7ovqd6JhP5uklh?usp=sharing) |
 
 ## 👀 See Also
 
@@ -170,6 +172,25 @@ This project received funding from the French ”Investing for the Future – PI
 
 This library was first created as a research tool by [Agustin Martin PICARD](mailto:[email protected]) in the context of the DEEL project with the help of [David Vigouroux](mailto:[email protected]) and [Thomas FEL](http://thomasfel.fr). Later on, [Lucas Hervier](https://github.com/lucashervier) joined the team to transform the code base as a practical user-(almost)-friendly and efficient tool.
 
+## 🗞️ Citation
+
+If you use Influenciae as part of your workflow in a scientific publication, please consider citing the 🗞️ [official paper](https://hal.science/hal-04284178/):
+
+```
+@unpublished{picard:hal-04284178,
+  TITLE = {{Influenci{\ae}: A library for tracing the influence back to the data-points}},
+  AUTHOR = {Picard, Agustin Martin and Hervier, Lucas and Fel, Thomas and Vigouroux, David},
+  URL = {https://hal.science/hal-04284178},
+  NOTE = {working paper or preprint},
+  YEAR = {2023},
+  MONTH = Nov,
+  KEYWORDS = {Data-centric ai ; XAI ; Explainability ; Influence Functions ; Open-source toolbox},
+  PDF = {https://hal.science/hal-04284178/file/ms.pdf},
+  HAL_ID = {hal-04284178},
+  HAL_VERSION = {v1},
+}
+```
+
 ## 📝 License
 
 The package is released under <a href="https://choosealicense.com/licenses/mit"> MIT license</a>.
diff --git a/benchmark_runner.py b/benchmark_runner.py
@@ -75,10 +75,13 @@
 
     args = parser.parse_args()
 
+    use_bias = False if args.method_name == "rps_lje" or args.method_name == "rps_l2" else True
+
     cifar10_evaluator = Cifar10MislabelingDetectorEvaluator(epochs=args.epochs,
                                                             model_type=args.model_type,
                                                             mislabeling_ratio=args.mislabeling_ratio,
                                                             use_regu=args.use_regu,
+                                                            use_bias=use_bias,
                                                             force_overfit=args.force_overfit,
                                                             train_batch_size=args.train_batch_size,
                                                             test_batch_size=args.test_batch_size,

diff --git a/deel/influenciae/__init__.py b/deel/influenciae/__init__.py
@@ -10,7 +10,7 @@
 techniques
 """
 
-__version__ = '0.2.0'
+__version__ = '0.3.0'
 
 from . import influence
 from . import common

diff --git a/deel/influenciae/benchmark/__init__.py b/deel/influenciae/benchmark/__init__.py
@@ -7,5 +7,13 @@
 """
 
 from .base_benchmark import BaseTrainingProcedure, MislabelingDetectorEvaluator
-from .influence_factory import InfluenceCalculatorFactory, FirstOrderFactory, RPSLJEFactory, TracInFactory
+from .influence_factory import (
+    InfluenceCalculatorFactory,
+    FirstOrderFactory,
+    RPSLJEFactory,
+    TracInFactory,
+    WeightsBoundaryCalculatorFactory,
+    SampleBoundaryCalculatorFactory,
+    ArnoldiCalculatorFactory
+)
 from .cifar10_benchmark import Cifar10TrainingProcedure, Cifar10MislabelingDetectorEvaluator
diff --git a/deel/influenciae/benchmark/cifar10_benchmark.py b/deel/influenciae/benchmark/cifar10_benchmark.py
@@ -44,7 +44,7 @@ class ConvNetCIFAR(Sequential):
     use_regularization
         A boolean indicating whether to add regularization on the final model's last layer.
     """
-    def __init__(self, model: Union[str, Model], use_regularization: bool = True, **kwargs):
+    def __init__(self, model: Union[str, Model], use_regularization: bool = True, use_bias: bool = True, **kwargs):
         super().__init__(**kwargs)
         if isinstance(model, Model):
             base_model = model
@@ -73,9 +73,14 @@ def __init__(self, model: Union[str, Model], use_regularization: bool = True, **
         self.add(tf.keras.layers.LeakyReLU())
 
         if use_regularization:
-            dense_2 = Dense(10, kernel_regularizer=L1L2(l1=1e-4, l2=1e-4), kernel_initializer="he_normal")
+            dense_2 = Dense(
+                10,
+                kernel_regularizer=L1L2(l1=1e-4, l2=1e-4),
+                kernel_initializer="he_normal",
+                use_bias=use_bias
+            )
         else:
-            dense_2 = Dense(10)
+            dense_2 = Dense(10, use_bias=use_bias)
 
         self.add(dense_2)
 
@@ -92,6 +97,8 @@ class Cifar10TrainingProcedure(BaseTrainingProcedure):
         A string with the type of model to use. Either 'resnet', 'efficient_net' or 'vgg19'.
     use_regu
         A boolean indicating whether L1L2 regularization should be used on the last layer.
+    use_bias
+        A boolean indicating whether the last layer should include a bias term.
     force_overfit
         A boolean for if the training schedule to be used should try to overfit the model or not.
     epochs_to_save
@@ -107,6 +114,7 @@ def __init__(
             epochs: int = 60,
             model_type: str = 'resnet',
             use_regu: bool = True,
+            use_bias: bool = True,
             force_overfit: bool = False,
             epochs_to_save: Optional[List[int]] = None,
             verbose: bool = True,
@@ -115,6 +123,7 @@ def __init__(
         self.epochs = epochs
         self.model_type = model_type
         self.use_regu = use_regu
+        self.use_bias = use_bias
         self.force_overfit = force_overfit
         self.epochs_to_save = epochs_to_save
         self.verbose = verbose
@@ -171,7 +180,7 @@ def preprocess(x):
 
         test_dataset = test_dataset.batch(test_batch_size).prefetch(100)
 
-        model = ConvNetCIFAR(self.model_type, self.use_regu)
+        model = ConvNetCIFAR(self.model_type, self.use_regu, self.use_bias)
 
         loss = CategoricalCrossentropy(from_logits=True)
 
@@ -266,6 +275,7 @@ def __init__(
             model_type: str = 'resnet',
             mislabeling_ratio: float = 0.0005,
             use_regu: bool = True,
+            use_bias: bool = True,
             force_overfit: bool = False,
             train_batch_size: int = 128,
             test_batch_size: int = 128,
@@ -281,6 +291,7 @@ def __init__(
             "model_type": model_type,
             "mislabeling_ratio": mislabeling_ratio,
             "use_regularization": use_regu,
+            "use_bias": use_bias,
             "optimizer": 'sgd' if force_overfit else 'adam',
             "train_batch_size": train_batch_size,
             "test_batch_size": test_batch_size,
@@ -301,8 +312,8 @@ def __init__(
         if take_batch is not None:
             training_dataset = training_dataset.take(take_batch)
             test_dataset = test_dataset.take(take_batch)
-        training_procedure = Cifar10TrainingProcedure(epochs, model_type, use_regu, force_overfit, epochs_to_save,
-                                                      verbose_training, use_tensorboard)
+        training_procedure = Cifar10TrainingProcedure(epochs, model_type, use_regu, use_bias, force_overfit,
+                                                      epochs_to_save, verbose_training, use_tensorboard)
         super().__init__(training_dataset, test_dataset, training_procedure,
                          nb_classes=10, mislabeling_ratio=mislabeling_ratio,
                          train_batch_size=train_batch_size,

diff --git a/deel/influenciae/benchmark/influence_factory.py b/deel/influenciae/benchmark/influence_factory.py
@@ -202,9 +202,7 @@ def build(self, training_dataset: tf.data.Dataset, model: tf.keras.Model,
             dataset_hessian = training_dataset
         else:
             batch_size = training_dataset._batch_size.numpy()  # pylint: disable=W0212
-            take_size = int(
-                np.ceil(float(self.dataset_hessian_size) / batch_size)) * batch_size
-            dataset_hessian = training_dataset.take(take_size)
+            dataset_hessian = training_dataset.unbatch().take(self.dataset_hessian_size).batch(batch_size)
 
         if self.ihvp_mode == 'exact':
             ihvp_calculator_factory = ExactIHVPFactory()

diff --git a/deel/influenciae/boundary_based/sample_boundary.py b/deel/influenciae/boundary_based/sample_boundary.py
@@ -75,7 +75,7 @@ def __delta_to_index(indexes_1: tf.Tensor, indexes_2: tf.Tensor, x: tf.Tensor):
         return delta_x
 
     @tf.function
-    def __step(self, x: tf.Tensor, y_pred: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
+    def _step(self, x: tf.Tensor, y_pred: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
         """
         The optimization step to find the distance between the boundary and a given sample x.
 
@@ -163,7 +163,7 @@ def __compute_single_sample_score(self, x: tf.Tensor) -> tf.Tensor:
         y_pred = self.model(x)
 
         def body(index, x_current):
-            computation, _, x_new = self.__step(x_current, y_pred)
+            computation, _, x_new = self._step(x_current, y_pred)
             return computation, index + 1, x_new
 
         _, _, x_adversarial = tf.while_loop(

diff --git a/deel/influenciae/boundary_based/weights_boundary.py b/deel/influenciae/boundary_based/weights_boundary.py
@@ -98,7 +98,7 @@ def __delta_to_index(self, indexes_1: tf.Tensor, indexes_2: tf.Tensor, x: tf.Ten
         return delta_x
 
     @tf.function
-    def __step(self, x: tf.Tensor, y_pred: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
+    def _step(self, x: tf.Tensor, y_pred: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
         """
         The optimization step to find the distance between the boundary and a given sample x.
         To see more details about the optimization procedure for multi-class classifiers,
@@ -211,7 +211,7 @@ def __compute_single_sample_score(self, x: tf.Tensor) -> tf.Tensor:
         y_pred = self.model(x)
 
         tf.while_loop(lambda cond, index: tf.logical_and(cond, index < self.step_nbr),
-                      lambda cond, index: (self.__step(x, y_pred)[0], index + 1),
+                      lambda cond, index: (self._step(x, y_pred)[0], index + 1),
                       [tf.constant(True), tf.constant(0, dtype=tf.int32)])
 
         score = self.__delta_weights()

diff --git a/deel/influenciae/common/base_influence.py b/deel/influenciae/common/base_influence.py
@@ -269,30 +269,6 @@ def compute_influence_vector(
 
         return inf_vect_ds
 
-    @abstractmethod
-    def _estimate_individual_influence_values_from_batch(
-            self,
-            train_samples: Tuple[tf.Tensor, ...],
-            samples_to_evaluate: Tuple[tf.Tensor, ...]
-    ) -> tf.Tensor:
-        """
-        Estimate the (individual) influence scores of a single batch of samples with respect to
-        a batch of samples belonging to the model's training dataset.
-
-        Parameters
-        ----------
-        train_samples
-            A single batch of training samples (and their target values).
-        samples_to_evaluate
-            A single batch of samples of which we wish to compute the influence of removing the training
-            samples.
-
-        Returns
-        -------
-        A tensor containing the individual influence scores.
-        """
-        raise NotImplementedError()
-
     def estimate_influence_values_in_batches(
             self,
             dataset_to_evaluate: tf.data.Dataset,