deepset-ai · tholor · Mar 19, 2021 · Mar 19, 2021
diff --git a/haystack/retriever/dense.py b/haystack/retriever/dense.py
@@ -270,6 +270,7 @@ def train(self,
               train_filename: str,
               dev_filename: str = None,
               test_filename: str = None,
+              max_processes: int = 128,
               dev_split: float = 0,
               batch_size: int = 2,
               embed_title: bool = True,
@@ -295,6 +296,8 @@ def train(self,
         :param train_filename: training filename
         :param dev_filename: development set filename, file to be used by model in eval step of training
         :param test_filename: test set filename, file to be used by model in test step after training
+        :param max_processes: the maximum number of processes to spawn in the multiprocessing.Pool used in DataSilo.
+                              It can be set to 1 to disable the use of multiprocessing or make debugging easier.
         :param dev_split: The proportion of the train set that will sliced. Only works if dev_filename is set to None
         :param batch_size: total number of samples in 1 batch of data
         :param embed_title: whether to concatenate passage title with each passage. The default setting in official DPR embeds passage title with the corresponding passage
@@ -333,7 +336,7 @@ def train(self,
 
         self.model.connect_heads_with_processor(self.processor.tasks, require_labels=True)
 
-        data_silo = DataSilo(processor=self.processor, batch_size=batch_size, distributed=False)
+        data_silo = DataSilo(processor=self.processor, batch_size=batch_size, distributed=False, max_processes=max_processes)
 
         # 5. Create an optimizer
         self.model, optimizer, lr_schedule = initialize_optimizer(