Log evaluation results to MLflow #2337

Merged
merged 53 commits from eval_to_mlflow into master on Apr 25, 2022
Changes from 25 commits
Commits
53 commits
5debe7a
track eval results in mlflow
tstadel Mar 21, 2022
e1734bf
Update Documentation & Code Style
github-actions[bot] Mar 21, 2022
4f3b116
add pipeline.yaml and environment info
tstadel Mar 21, 2022
84270e7
Merge branch 'eval_to_mlflow' of github.com:deepset-ai/haystack into …
tstadel Mar 21, 2022
eb42da6
Merge branch 'master' into eval_to_mlflow
tstadel Mar 21, 2022
58c89a0
improve logging to mlflow
tstadel Mar 22, 2022
959b667
Update Documentation & Code Style
github-actions[bot] Mar 22, 2022
ff4882f
introduce ExperimentTracker
tstadel Mar 22, 2022
b99eff6
Update Documentation & Code Style
github-actions[bot] Mar 22, 2022
0224199
move modeling.utils.logger to utils.experiment_tracking
tstadel Mar 22, 2022
dc0441b
Merge branch 'eval_to_mlflow' of github.com:deepset-ai/haystack into …
tstadel Mar 22, 2022
94092e2
renaming: tracker and TrackingHead
tstadel Mar 22, 2022
6da446f
Update Documentation & Code Style
github-actions[bot] Mar 22, 2022
2c4baf5
refactor env tracking
tstadel Mar 22, 2022
7e7020a
fix pylint findings
tstadel Mar 22, 2022
bae230e
Update Documentation & Code Style
github-actions[bot] Mar 23, 2022
f31795f
rename MLFlowTrackingHead to MLflowTrackingHead
tstadel Mar 23, 2022
b0f6752
implement dataset hash
tstadel Mar 23, 2022
210a36b
Update Documentation & Code Style
github-actions[bot] Mar 23, 2022
93fe45f
set docstrings
tstadel Mar 23, 2022
c5cf743
Update Documentation & Code Style
github-actions[bot] Mar 23, 2022
85cf978
introduce PipelineBundle and Corpus
tstadel Mar 24, 2022
8aafbdb
Update Documentation & Code Style
github-actions[bot] Mar 24, 2022
b410f6d
support reusing index
tstadel Mar 24, 2022
b5242d0
Update Documentation & Code Style
github-actions[bot] Mar 24, 2022
e09a9e1
rename Corpus to FileCorpus
tstadel Mar 25, 2022
206dc99
Merge branch 'master' into eval_to_mlflow
tstadel Mar 29, 2022
cac951a
fix Corpus -> FileCorpus
tstadel Apr 11, 2022
13d59f8
Update Documentation & Code Style
github-actions[bot] Apr 11, 2022
ff25920
Merge branch 'master' into eval_to_mlflow
tstadel Apr 20, 2022
04d042e
Merge branch 'eval_to_mlflow' of github.com:deepset-ai/haystack into …
tstadel Apr 20, 2022
7e12328
resolve cyclic dependencies
tstadel Apr 20, 2022
167067c
fix linter issues
tstadel Apr 20, 2022
34b7868
Update Documentation & Code Style
github-actions[bot] Apr 20, 2022
1c353d9
remove helper classes
tstadel Apr 20, 2022
de30677
Merge branch 'eval_to_mlflow' of github.com:deepset-ai/haystack into …
tstadel Apr 20, 2022
4180521
Update Documentation & Code Style
github-actions[bot] Apr 20, 2022
8c7ce4f
fix imports
tstadel Apr 20, 2022
fa98cd9
fix another unused import
tstadel Apr 21, 2022
94fcb52
update docstrings
tstadel Apr 21, 2022
fbf2f0a
Update Documentation & Code Style
github-actions[bot] Apr 21, 2022
1059bab
simplify usage of experiment tracking tools
tstadel Apr 21, 2022
7e4e48f
fix Literal import
tstadel Apr 21, 2022
46c9fee
revert schema changes
tstadel Apr 21, 2022
ddbeda2
Update Documentation & Code Style
github-actions[bot] Apr 21, 2022
9ebad51
always end run
tstadel Apr 21, 2022
7b9639d
Update Documentation & Code Style
github-actions[bot] Apr 21, 2022
6b81230
fix mypy issue
tstadel Apr 25, 2022
44858ea
rename to execute_eval_run
tstadel Apr 25, 2022
ea6b512
Update Documentation & Code Style
github-actions[bot] Apr 25, 2022
edde846
fix merge of get_or_create_env_meta_data
tstadel Apr 25, 2022
7dc3e7f
improve docstrings
tstadel Apr 25, 2022
e099a59
Update Documentation & Code Style
github-actions[bot] Apr 25, 2022
58 changes: 56 additions & 2 deletions docs/_src/api/api/pipelines.md
@@ -440,7 +440,7 @@ then be found in the dict returned by this method under the key "_debug"

```python
@classmethod
def eval_beir(cls, index_pipeline: Pipeline, query_pipeline: Pipeline, index_params: dict = {}, query_params: dict = {}, dataset: str = "scifact", dataset_dir: Path = Path("."), top_k_values: List[int] = [1, 3, 5, 10, 100, 1000], keep_index: bool = False) -> Tuple[Dict[str, float], Dict[str, float], Dict[str, float], Dict[str, float]]
def eval_beir(cls, pipeline_bundle: PipelineBundle, dataset: str = "scifact", dataset_dir: Path = Path("."), top_k_values: List[int] = [1, 3, 5, 10, 100, 1000], keep_index: bool = False) -> Tuple[Dict[str, float], Dict[str, float], Dict[str, float], Dict[str, float]]
```

Runs information retrieval evaluation of a pipeline using BEIR on a specified BEIR dataset.
@@ -463,6 +463,60 @@ If True the index will be kept after beir evaluation. Otherwise it will be delet
Returns a tuple containing the ndcg, map, recall and precision scores.
Each metric is represented by a dictionary containing the scores for each top_k value.
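
For orientation, here is a minimal usage sketch of the new PipelineBundle-based signature above. It is illustrative only: PipelineBundle's import path and constructor arguments are assumptions inferred from the old signature (index_pipeline, query_pipeline, index_params, query_params), and a later commit in this PR removes the helper classes again, so this does not reflect the final API.

```python
# Illustrative sketch, not part of this diff. Pipeline.load_from_yaml is existing Haystack API;
# the PipelineBundle kwargs are assumed, and the class is removed again in a later commit of this PR.
from pathlib import Path

from haystack import Pipeline

index_pipeline = Pipeline.load_from_yaml(Path("indexing_pipeline.yml"))
query_pipeline = Pipeline.load_from_yaml(Path("query_pipeline.yml"))

bundle = PipelineBundle(index_pipeline=index_pipeline, query_pipeline=query_pipeline)  # hypothetical kwargs

ndcg, map_, recall, precision = Pipeline.eval_beir(
    pipeline_bundle=bundle,
    dataset="scifact",            # BEIR dataset name, matching the default above
    top_k_values=[1, 3, 5, 10],
)
```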

<a id="base.Pipeline.run_eval_experiment"></a>

#### run\_eval\_experiment

```python
@classmethod
def run_eval_experiment(cls, pipeline_bundle: PipelineBundle, dataset: EvaluationDataset, corpus: Corpus, experiment_name: str, experiment_run_name: str, experiment_tracking_uri: str, sas_model_name_or_path: str = None, sas_batch_size: int = 32, sas_use_gpu: bool = True, add_isolated_node_eval: bool = False, reuse_index: bool = False) -> EvaluationResult
```

Starts an experiment run that evaluates the pipeline bundle using pipeline.eval(): it indexes the corpus, runs the query pipeline once per query in debug mode, and gathers all data needed for evaluation, e.g. by calculating metrics.
The resulting data is collected and tracked by an experiment tracking tool (currently only MLflow is supported).

This method starts an experiment run. Each experiment run is part of at least one experiment.
An experiment typically consists of multiple runs that you can compare against each other within the experiment tracking tool.
For example, you can call run_eval_experiment() multiple times with different parameters and their respective experiment_run_name values, and later compare the results in MLflow (a usage sketch follows the argument list below).

**Arguments**:

- `dataset`: The dataset containing the labels to evaluate on
- `experiment_name`: The name of the experiment
- `experiment_run_name`: The name of the experiment run
- `experiment_tracking_uri`: The uri of the experiment tracking server to track the results to.
- `params`: Dictionary of parameters to be dispatched to the nodes.
If you want to pass a param to all nodes, you can just use: {"top_k":10}
If you want to pass it to targeted nodes, you can do:
{"Retriever": {"top_k": 10}, "Reader": {"top_k": 3, "debug": True}}
- `sas_model_name_or_path`: Name or path of "Semantic Answer Similarity (SAS) model". When set, the model will be used to calculate similarity between predictions and labels and generate the SAS metric.
The SAS metric correlates better with human judgement of correct answers as it does not rely on string overlaps.
Example: Prediction = "30%", Label = "thirty percent", EM and F1 would be overly pessimistic with both being 0, while SAS paints a more realistic picture.
More info in the paper: https://arxiv.org/abs/2108.06130
Models:
- You can use bi-encoders (Sentence Transformers) or cross-encoders trained on Semantic Textual Similarity (STS) data.
Not all cross-encoders can be used because of different return types.
If you use custom cross-encoders, please make sure they work with the sentence_transformers.CrossEncoder class.
- Good default for multiple languages: "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
- Large, powerful, but slow model for English only: "cross-encoder/stsb-roberta-large"
- Large model for German only: "deepset/gbert-large-sts"
- `sas_batch_size`: Number of prediction label pairs to encode at once by CrossEncoder or SentenceTransformer while calculating SAS.
- `sas_use_gpu`: Whether to use a GPU or the CPU for calculating semantic answer similarity.
Falls back to CPU if no GPU is available.
- `add_isolated_node_eval`: If set to True, in addition to the integrated evaluation of the pipeline, each node is evaluated in isolated evaluation mode.
This mode helps to understand the bottlenecks of a pipeline in terms of output quality of each individual node.
If a node performs much better in the isolated evaluation than in the integrated evaluation, the previous node needs to be optimized to improve the pipeline's performance.
If a node's performance is similar in both modes, this node itself needs to be optimized to improve the pipeline's performance.
The isolated evaluation calculates the upper bound of each node's evaluation metrics under the assumption that it received perfect inputs from the previous node.
To this end, labels are used as input to the node instead of the output of the previous node in the pipeline.
The generated dataframes in the EvaluationResult then contain additional rows, which can be distinguished from the integrated evaluation results based on the
values "integrated" or "isolated" in the column "eval_mode" and the evaluation report then additionally lists the upper bound of each node's evaluation metrics.
- `reuse_index`: Whether to reuse an existing non-empty index and to keep the index after evaluation.
If True, the index is kept after evaluation and no indexing takes place if the index already contains documents. Otherwise, the index is deleted immediately afterwards.
Defaults to False.
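
Based on the signature above, a hedged usage sketch follows. Class names such as PipelineBundle, EvaluationDataset, and Corpus reflect this intermediate state of the branch ("Changes from 25 commits"); later commits rename the method to execute_eval_run and drop some helper classes, so treat the exact call below as an assumption rather than the final API.

```python
# Illustrative sketch against the intermediate API shown in this diff; values are made up.
from haystack import EvaluationDataset, Pipeline

eval_dataset = EvaluationDataset(
    name="nq-dev-subset",        # hypothetical dataset name
    labels=my_multilabels,       # List[MultiLabel] prepared beforehand (placeholder)
)

result = Pipeline.run_eval_experiment(
    pipeline_bundle=bundle,                           # PipelineBundle from the earlier sketch
    dataset=eval_dataset,
    corpus=corpus,                                    # Corpus helper pointing at the files to index (constructor assumed)
    experiment_name="qa-pipeline-tuning",
    experiment_run_name="bm25-top10",
    experiment_tracking_uri="http://localhost:5000",  # MLflow tracking server (hypothetical URI)
    sas_model_name_or_path="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
    add_isolated_node_eval=True,
    reuse_index=False,
)
print(result.calculate_metrics())
```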

<a id="base.Pipeline.eval"></a>

#### eval
@@ -877,7 +931,7 @@ class _HaystackBeirRetrieverAdapter()
#### \_\_init\_\_

```python
def __init__(index_pipeline: Pipeline, query_pipeline: Pipeline, index_params: dict, query_params: dict)
def __init__(pipeline_bundle: PipelineBundle)
```

Adapter mimicking a BEIR retriever used by BEIR's EvaluateRetrieval class to run BEIR evaluations on Haystack Pipelines.
18 changes: 18 additions & 0 deletions docs/_src/api/api/primitives.md
@@ -431,3 +431,21 @@ Loads the evaluation result from disk. Expects one csv file per node. See save()

- `load_dir`: The directory containing the csv files.

<a id="schema.EvaluationDataset"></a>

## EvaluationDataset

```python
class EvaluationDataset()
```

<a id="schema.EvaluationDataset.__init__"></a>

#### \_\_init\_\_

```python
def __init__(name: str, labels: List[MultiLabel], meta: Dict[str, Any] = {}) -> None
```

A set of labels that belong together, forming a well-specified and referenceable evaluation dataset.
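
As a quick illustration of the constructor above, the sketch below builds a tiny dataset. The Label, MultiLabel, Answer, and Document fields come from Haystack's existing schema; the values are invented for the example.

```python
# Minimal sketch using Haystack's existing schema classes; values are made up.
from haystack import Answer, Document, EvaluationDataset, Label, MultiLabel

label = Label(
    query="Who wrote Faust?",
    answer=Answer(answer="Johann Wolfgang von Goethe"),
    document=Document(content="Faust is a tragic play written by Johann Wolfgang von Goethe."),
    is_correct_answer=True,
    is_correct_document=True,
    origin="gold-label",
)

dataset = EvaluationDataset(
    name="faust-demo",                     # referenceable dataset name
    labels=[MultiLabel(labels=[label])],
    meta={"source": "manual annotation"},  # optional metadata
)
```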

6 changes: 1 addition & 5 deletions haystack/__init__.py
@@ -19,7 +19,7 @@
logging.getLogger("haystack").setLevel(logging.INFO)

from haystack import pipelines
from haystack.schema import Document, Answer, Label, MultiLabel, Span
from haystack.schema import Document, Answer, Label, MultiLabel, Span, EvaluationDataset, EvaluationResult
from haystack.nodes import BaseComponent
from haystack.pipelines import Pipeline

@@ -101,7 +101,6 @@ def __getattr__(self, attr):
pass

from haystack.modeling.evaluation import eval
from haystack.modeling.logger import MLFlowLogger, StdoutLogger, TensorBoardLogger
from haystack.nodes.other import JoinDocuments, Docs2Answers, JoinAnswers, RouteDocuments
from haystack.nodes.query_classifier import SklearnQueryClassifier, TransformersQueryClassifier
from haystack.nodes.file_classifier import FileTypeClassifier
@@ -175,9 +174,6 @@ def __getattr__(self, attr):
# Adding them to sys.modules would enable `import haystack.pipelines.JoinDocuments`,
# which I believe it's a very rare import style.
setattr(file_converter, "FileTypeClassifier", FileTypeClassifier)
setattr(modeling_utils, "MLFlowLogger", MLFlowLogger)
setattr(modeling_utils, "StdoutLogger", StdoutLogger)
setattr(modeling_utils, "TensorBoardLogger", TensorBoardLogger)
setattr(pipelines, "JoinDocuments", JoinDocuments)
setattr(pipelines, "Docs2Answers", Docs2Answers)
setattr(pipelines, "SklearnQueryClassifier", SklearnQueryClassifier)
4 changes: 2 additions & 2 deletions haystack/modeling/data_handler/data_silo.py
@@ -23,7 +23,7 @@
from haystack.nodes import FARMReader
from haystack.modeling.data_handler.dataloader import NamedDataLoader
from haystack.modeling.data_handler.processor import Processor
from haystack.modeling.logger import MLFlowLogger as MlLogger
from haystack.utils.experiment_tracking import Tracker as tracker
from haystack.modeling.utils import log_ascii_workers, grouper, calc_chunksize
from haystack.modeling.visual import TRACTOR_SMALL

@@ -500,7 +500,7 @@ def _calculate_statistics(self):
logger.info("Average passage length after clipping: {}".format(ave_len[1]))
logger.info("Proportion passages clipped: {}".format(clipped[1]))

MlLogger.log_params(
tracker.track_params(
{
"n_samples_train": self.counts["train"],
"n_samples_dev": self.counts["dev"],
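
The same MlLogger-to-Tracker substitution recurs in processor.py, eval.py, adaptive_model.py, and biadaptive_model.py below. For orientation, here is a short sketch of the new facade in use; the import path and the track_params/track_metrics calls are taken from this diff, while the parameter values are made up:

```python
# Sketch of the experiment-tracking facade introduced by this PR; values are illustrative only.
from haystack.utils.experiment_tracking import Tracker as tracker

# Replaces MlLogger.log_params(...)
tracker.track_params({"n_samples_train": 8000, "batch_size": 32})

# Replaces MlLogger.log_metrics(...); metric names follow the f"{dataset}_{metric}_{task}" pattern used in eval.py
tracker.track_metrics(metrics={"dev_f1_question_answering": 0.82}, step=500)
```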
4 changes: 2 additions & 2 deletions haystack/modeling/data_handler/processor.py
@@ -34,7 +34,7 @@
offset_to_token_idx_vecorized,
)
from haystack.modeling.data_handler.input_features import sample_to_features_text
from haystack.modeling.logger import MLFlowLogger as MlLogger
from haystack.utils.experiment_tracking import Tracker as tracker


DOWNSTREAM_TASK_MAP = {
@@ -362,7 +362,7 @@ def _log_params(self):
for name in names:
value = getattr(self, name)
params.update({name: str(value)})
MlLogger.log_params(params)
tracker.track_params(params)


class SquadProcessor(Processor):
6 changes: 3 additions & 3 deletions haystack/modeling/evaluation/eval.py
@@ -8,7 +8,7 @@

from haystack.modeling.evaluation.metrics import compute_metrics, compute_report_metrics
from haystack.modeling.model.adaptive_model import AdaptiveModel
from haystack.modeling.logger import MLFlowLogger as MlLogger
from haystack.utils.experiment_tracking import Tracker as tracker
from haystack.modeling.visual import BUSH_SEP


@@ -147,11 +147,11 @@ def log_results(
for head_num, head in enumerate(results):
logger.info("\n _________ {} _________".format(head["task_name"]))
for metric_name, metric_val in head.items():
# log with ML framework (e.g. Mlflow)
# log with experiment tracking framework (e.g. Mlflow)
if logging:
if not metric_name in ["preds", "labels"] and not metric_name.startswith("_"):
if isinstance(metric_val, numbers.Number):
MlLogger.log_metrics(
tracker.track_metrics(
metrics={f"{dataset_name}_{metric_name}_{head['task_name']}": metric_val}, step=steps
)
# print via standard python logger
3 changes: 0 additions & 3 deletions haystack/modeling/infer.py
@@ -21,7 +21,6 @@
)
from haystack.modeling.data_handler.inputs import QAInput
from haystack.modeling.model.adaptive_model import AdaptiveModel, BaseAdaptiveModel
from haystack.modeling.logger import MLFlowLogger
from haystack.modeling.model.predictions import QAPred


@@ -74,8 +73,6 @@ def __init__(
:return: An instance of the Inferencer.

"""
MLFlowLogger.disable()

# Init device and distributed settings
self.devices, n_gpu = initialize_device_settings(use_cuda=gpu, multi_gpu=False)

145 changes: 0 additions & 145 deletions haystack/modeling/logger.py

This file was deleted.

4 changes: 2 additions & 2 deletions haystack/modeling/model/adaptive_model.py
@@ -16,7 +16,7 @@
from haystack.modeling.data_handler.processor import Processor
from haystack.modeling.model.language_model import LanguageModel
from haystack.modeling.model.prediction_head import PredictionHead
from haystack.modeling.logger import MLFlowLogger as MlLogger
from haystack.utils.experiment_tracking import Tracker as tracker


logger = logging.getLogger(__name__)
@@ -450,7 +450,7 @@ def log_params(self):
"lm_output_types": ",".join(self.lm_output_types),
}
try:
MlLogger.log_params(params)
tracker.track_params(params)
except Exception as e:
logger.warning(f"ML logging didn't work: {e}")

4 changes: 2 additions & 2 deletions haystack/modeling/model/biadaptive_model.py
@@ -9,7 +9,7 @@
from haystack.modeling.data_handler.processor import Processor
from haystack.modeling.model.language_model import LanguageModel
from haystack.modeling.model.prediction_head import PredictionHead, TextSimilarityHead
from haystack.modeling.logger import MLFlowLogger as MlLogger
from haystack.utils.experiment_tracking import Tracker as tracker


logger = logging.getLogger(__name__)
@@ -334,7 +334,7 @@ def log_params(self):
"prediction_heads": ",".join([head.__class__.__name__ for head in self.prediction_heads]),
}
try:
MlLogger.log_params(params)
tracker.track_params(params)
except Exception as e:
logger.warning(f"ML logging didn't work: {e}")
