diff --git a/docs/source/models.rst b/docs/source/models.rst index fa3c22e3e15dd..2aafda4be8c37 100644 --- a/docs/source/models.rst +++ b/docs/source/models.rst @@ -659,6 +659,10 @@ produced by :py:func:`mlflow.pytorch.save_model()` and :py:func:`mlflow.pytorch. the ``python_function`` flavor, allowing you to load them as generic Python functions for inference via :py:func:`mlflow.pyfunc.load_model()`. +.. note:: + In case of multi gpu training, ensure to save the model only with global rank 0 gpu. This avoids + logging multiple copies of the same model. + For more information, see :py:mod:`mlflow.pytorch`. Scikit-learn (``sklearn``) diff --git a/examples/pytorch/BertNewsClassification/bert_classification.py b/examples/pytorch/BertNewsClassification/bert_classification.py index bbea6e330fd69..d8d5d3f74a175 100644 --- a/examples/pytorch/BertNewsClassification/bert_classification.py +++ b/examples/pytorch/BertNewsClassification/bert_classification.py @@ -6,6 +6,7 @@ import os from argparse import ArgumentParser +import logging import numpy as np import pandas as pd import pytorch_lightning as pl @@ -424,8 +425,6 @@ def configure_optimizers(self): parser = BertNewsClassifier.add_model_specific_args(parent_parser=parser) parser = BertDataModule.add_model_specific_args(parent_parser=parser) - mlflow.pytorch.autolog() - args = parser.parse_args() dict_args = vars(args) @@ -455,5 +454,23 @@ def configure_optimizers(self): enable_checkpointing=True, ) + # It is safe to use `mlflow.pytorch.autolog` in DDP training, as below condition invokes + # autolog with only rank 0 gpu. + + # For CPU Training + if dict_args["gpus"] is None or int(dict_args["gpus"]) == 0: + mlflow.pytorch.autolog() + elif int(dict_args["gpus"]) >= 1 and trainer.global_rank == 0: + # In case of multi gpu training, the training script is invoked multiple times, + # The following condition is needed to avoid multiple copies of mlflow runs. + # When one or more gpus are used for training, it is enough to save + # the model and its parameters using rank 0 gpu. + mlflow.pytorch.autolog() + else: + # This condition is met only for multi-gpu training when the global rank is non zero. + # Since the parameters are already logged using global rank 0 gpu, it is safe to ignore + # this condition. + logging.info("Active run exists.. ") + trainer.fit(model, dm) trainer.test(model, datamodule=dm) diff --git a/examples/pytorch/MNIST/mnist_autolog_example.py b/examples/pytorch/MNIST/mnist_autolog_example.py index 231cd58f20396..c65da9a631c6d 100644 --- a/examples/pytorch/MNIST/mnist_autolog_example.py +++ b/examples/pytorch/MNIST/mnist_autolog_example.py @@ -10,6 +10,7 @@ # pylint: disable=abstract-method import pytorch_lightning as pl import mlflow.pytorch +import logging import os import torch from argparse import ArgumentParser @@ -274,8 +275,6 @@ def configure_optimizers(self): parser = pl.Trainer.add_argparse_args(parent_parser=parser) parser = LightningMNISTClassifier.add_model_specific_args(parent_parser=parser) - mlflow.pytorch.autolog() - args = parser.parse_args() dict_args = vars(args) @@ -303,5 +302,24 @@ def configure_optimizers(self): trainer = pl.Trainer.from_argparse_args( args, callbacks=[lr_logger, early_stopping, checkpoint_callback], checkpoint_callback=True ) + + # It is safe to use `mlflow.pytorch.autolog` in DDP training, as below condition invokes + # autolog with only rank 0 gpu. + + # For CPU Training + if dict_args["gpus"] is None or int(dict_args["gpus"]) == 0: + mlflow.pytorch.autolog() + elif int(dict_args["gpus"]) >= 1 and trainer.global_rank == 0: + # In case of multi gpu training, the training script is invoked multiple times, + # The following condition is needed to avoid multiple copies of mlflow runs. + # When one or more gpus are used for training, it is enough to save + # the model and its parameters using rank 0 gpu. + mlflow.pytorch.autolog() + else: + # This condition is met only for multi-gpu training when the global rank is non zero. + # Since the parameters are already logged using global rank 0 gpu, it is safe to ignore + # this condition. + logging.info("Active run exists.. ") + trainer.fit(model, dm) trainer.test(datamodule=dm)