From c9d3146fae89d578cd9867c8b5e699b973cbe917 Mon Sep 17 00:00:00 2001
From: Malte Pietsch
Date: Wed, 15 Jul 2020 18:34:55 +0200
Subject: [PATCH] Fix multi-gpu training via DataParallel (#234)

---
 haystack/__init__.py                              |  1 +
 haystack/reader/farm.py                           | 14 +++++++++++++-
 .../Tutorial2_Finetune_a_model_on_your_data.ipynb |  4 ++--
 .../Tutorial2_Finetune_a_model_on_your_data.py    |  4 ++--
 4 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/haystack/__init__.py b/haystack/__init__.py
index 26c3bc9856..f5b96f9aaf 100644
--- a/haystack/__init__.py
+++ b/haystack/__init__.py
@@ -12,5 +12,6 @@
 logging.getLogger('farm.infer').setLevel(logging.INFO)
 logging.getLogger('transformers').setLevel(logging.WARNING)
 logging.getLogger('farm.eval').setLevel(logging.INFO)
+logging.getLogger('farm.modeling.optimization').setLevel(logging.INFO)
diff --git a/haystack/reader/farm.py b/haystack/reader/farm.py
index a1ca2e10bf..10e060fd3d 100644
--- a/haystack/reader/farm.py
+++ b/haystack/reader/farm.py
@@ -10,10 +10,12 @@
 from farm.infer import QAInferencer
 from farm.modeling.optimization import initialize_optimizer
 from farm.modeling.predictions import QAPred, QACandidate
+from farm.modeling.adaptive_model import BaseAdaptiveModel
 from farm.train import Trainer
 from farm.eval import Evaluator
 from farm.utils import set_all_seeds, initialize_device_settings
 from scipy.special import expit
+import shutil
 
 from haystack.database.base import Document
 from haystack.database.elasticsearch import ElasticsearchDocumentStore
@@ -177,9 +179,17 @@ def train(
         # and calculates a few descriptive statistics of our datasets
         data_silo = DataSilo(processor=processor, batch_size=batch_size, distributed=False)
 
+        # Quick-fix until this is fixed upstream in FARM:
+        # We must avoid applying DataParallel twice (once when loading the inferencer,
+        # once when calling initialize_optimizer)
+        self.inferencer.model.save("tmp_model")
+        model = BaseAdaptiveModel.load(load_dir="tmp_model", device=device, strict=True)
+        shutil.rmtree('tmp_model')
+
         # 3. Create an optimizer and pass the already initialized model
         model, optimizer, lr_schedule = initialize_optimizer(
-            model=self.inferencer.model,
+            model=model,
+            # model=self.inferencer.model,
             learning_rate=learning_rate,
             schedule_opts={"name": "LinearWarmup", "warmup_proportion": warmup_proportion},
             n_batches=len(data_silo.loaders["train"]),
@@ -197,6 +207,8 @@
             evaluate_every=evaluate_every,
             device=device,
         )
+
+        # 5. Let it grow!
         self.inferencer.model = trainer.train()
 
         self.save(Path(save_dir))
diff --git a/tutorials/Tutorial2_Finetune_a_model_on_your_data.ipynb b/tutorials/Tutorial2_Finetune_a_model_on_your_data.ipynb
index e8533e4cff..2c30e80c60 100644
--- a/tutorials/Tutorial2_Finetune_a_model_on_your_data.ipynb
+++ b/tutorials/Tutorial2_Finetune_a_model_on_your_data.ipynb
@@ -90,10 +90,10 @@
     }
    ],
    "source": [
-    "reader = FARMReader(model_name_or_path=\"distilbert-base-uncased-distilled-squad\", use_gpu=False)\n",
+    "reader = FARMReader(model_name_or_path=\"distilbert-base-uncased-distilled-squad\", use_gpu=True)\n",
     "train_data = \"data/squad20\"\n",
     "# train_data = \"PATH/TO_YOUR/TRAIN_DATA\" \n",
-    "reader.train(data_dir=train_data, train_filename=\"dev-v2.0.json\", use_gpu=False, n_epochs=1, save_dir=\"my_model\")"
+    "reader.train(data_dir=train_data, train_filename=\"dev-v2.0.json\", use_gpu=True, n_epochs=1, save_dir=\"my_model\")"
    ]
   },
  {
diff --git a/tutorials/Tutorial2_Finetune_a_model_on_your_data.py b/tutorials/Tutorial2_Finetune_a_model_on_your_data.py
index 2ad0f92769..5e199f656c 100755
--- a/tutorials/Tutorial2_Finetune_a_model_on_your_data.py
+++ b/tutorials/Tutorial2_Finetune_a_model_on_your_data.py
@@ -34,10 +34,10 @@
 
 #**Recommendation: Run training on a GPU. To do so change the `use_gpu` arguments below to `True`
 
-reader = FARMReader(model_name_or_path="distilbert-base-uncased-distilled-squad", use_gpu=False)
+reader = FARMReader(model_name_or_path="distilbert-base-uncased-distilled-squad", use_gpu=True)
 train_data = "data/squad20"
 # train_data = "PATH/TO_YOUR/TRAIN_DATA"
-reader.train(data_dir=train_data, train_filename="dev-v2.0.json", use_gpu=False, n_epochs=1, save_dir="my_model")
+reader.train(data_dir=train_data, train_filename="dev-v2.0.json", use_gpu=True, n_epochs=1, save_dir="my_model")
 
 # Saving the model happens automatically at the end of training into the `save_dir` you specified
 # However, you could also save a reader manually again via:
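
Note on the quick-fix in haystack/reader/farm.py: when more than one GPU is visible, the model is already wrapped in DataParallel by the time train() runs, and FARM's initialize_optimizer wraps the model it receives once more. The sketch below is a minimal reproduction of that failure mode in plain PyTorch plus an equivalent in-memory unwrap; it is illustrative only, and the Linear stand-in and variable names are assumptions, not part of this patch or of FARM's API.

    # Illustrative sketch only (plain PyTorch); Linear is a stand-in for the reader model.
    from torch.nn import DataParallel, Linear

    model = Linear(10, 2)            # stand-in for the reader's underlying model
    wrapped = DataParallel(model)    # first wrap, e.g. when the inferencer loads the model

    # Wrapping again nests one DataParallel inside another, so code that expects
    # the real model at `wrapped.module` finds another wrapper instead:
    double = DataParallel(wrapped)
    assert double.module is wrapped          # not the underlying model
    assert double.module.module is model     # the model sits one `.module` too deep

    # The patch sidesteps the double wrap by saving the model to "tmp_model" and
    # reloading it, which yields an unwrapped model. An equivalent in-memory unwrap:
    plain = wrapped.module if isinstance(wrapped, DataParallel) else wrapped
    assert plain is model

Round-tripping through disk rather than unwrapping in memory keeps the workaround agnostic to how (or whether) the inferencer wrapped the model; the temporary save directory is cleaned up with shutil.rmtree once the reloaded model is in hand.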