Revert "Multi gpu PyTorch autolog - creates multiple runs (mlflow#5837)"
This reverts commit 816e035.

Signed-off-by: Diogo Santos <[email protected]>
drsantos89 committed Jun 13, 2022
1 parent 6ed1601 commit c638fcf
Showing 3 changed files with 4 additions and 43 deletions.
docs/source/models.rst (4 changes: 0 additions & 4 deletions)
@@ -659,10 +659,6 @@ produced by :py:func:`mlflow.pytorch.save_model()` and :py:func:`mlflow.pytorch.
the ``python_function`` flavor, allowing you to load them as generic Python functions for inference
via :py:func:`mlflow.pyfunc.load_model()`.

-.. note::
-    In case of multi gpu training, ensure to save the model only with global rank 0 gpu. This avoids
-    logging multiple copies of the same model.
-
For more information, see :py:mod:`mlflow.pytorch`.

Scikit-learn (``sklearn``)
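The context lines above point PyTorch models at the ``python_function`` flavor and :py:func:`mlflow.pyfunc.load_model()`. As a rough illustration of that loading path (not part of this commit; the run ID and artifact path below are placeholders), a logged PyTorch model can be consumed like this:

    import numpy as np
    import mlflow.pyfunc

    # Placeholder URI: substitute the run ID and artifact path of a logged PyTorch model.
    model_uri = "runs:/<run_id>/model"
    loaded_model = mlflow.pyfunc.load_model(model_uri)

    # pyfunc models accept a pandas DataFrame or numpy array and return predictions.
    predictions = loaded_model.predict(np.random.rand(2, 10).astype(np.float32))
    print(predictions)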
examples/pytorch/BertNewsClassification/bert_classification.py (21 changes: 2 additions & 19 deletions)
@@ -6,7 +6,6 @@
import os
from argparse import ArgumentParser

-import logging
import numpy as np
import pandas as pd
import pytorch_lightning as pl
@@ -425,6 +424,8 @@ def configure_optimizers(self):
parser = BertNewsClassifier.add_model_specific_args(parent_parser=parser)
parser = BertDataModule.add_model_specific_args(parent_parser=parser)

+mlflow.pytorch.autolog()
+
args = parser.parse_args()
dict_args = vars(args)

@@ -454,23 +455,5 @@ def configure_optimizers(self):
enable_checkpointing=True,
)
-
-# It is safe to use `mlflow.pytorch.autolog` in DDP training, as below condition invokes
-# autolog with only rank 0 gpu.
-
-# For CPU Training
-if dict_args["gpus"] is None or int(dict_args["gpus"]) == 0:
-    mlflow.pytorch.autolog()
-elif int(dict_args["gpus"]) >= 1 and trainer.global_rank == 0:
-    # In case of multi gpu training, the training script is invoked multiple times,
-    # The following condition is needed to avoid multiple copies of mlflow runs.
-    # When one or more gpus are used for training, it is enough to save
-    # the model and its parameters using rank 0 gpu.
-    mlflow.pytorch.autolog()
-else:
-    # This condition is met only for multi-gpu training when the global rank is non zero.
-    # Since the parameters are already logged using global rank 0 gpu, it is safe to ignore
-    # this condition.
-    logging.info("Active run exists.. ")

trainer.fit(model, dm)
trainer.test(model, datamodule=dm)
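After this revert, the Bert example enables autologging with the single unconditional call shown in the added lines above, before the Trainer is built. A minimal, self-contained sketch of that flow on toy data (TinyClassifier is a stand-in for the example's BertNewsClassifier, not code from this repository):

    import mlflow.pytorch
    import pytorch_lightning as pl
    import torch
    from torch.utils.data import DataLoader, TensorDataset


    class TinyClassifier(pl.LightningModule):
        """Minimal stand-in for the example's BertNewsClassifier."""

        def __init__(self):
            super().__init__()
            self.layer = torch.nn.Linear(8, 2)

        def forward(self, x):
            return self.layer(x)

        def training_step(self, batch, batch_idx):
            x, y = batch
            loss = torch.nn.functional.cross_entropy(self(x), y)
            self.log("train_loss", loss)
            return loss

        def configure_optimizers(self):
            return torch.optim.Adam(self.parameters(), lr=1e-3)


    if __name__ == "__main__":
        # Single unconditional call, mirroring the lines this revert adds back.
        mlflow.pytorch.autolog()

        features = torch.randn(64, 8)
        labels = torch.randint(0, 2, (64,))
        loader = DataLoader(TensorDataset(features, labels), batch_size=16)

        trainer = pl.Trainer(max_epochs=1)
        trainer.fit(TinyClassifier(), loader)  # autolog records params, metrics, and the model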
examples/pytorch/MNIST/mnist_autolog_example.py (22 changes: 2 additions & 20 deletions)
@@ -10,7 +10,6 @@
# pylint: disable=abstract-method
import pytorch_lightning as pl
import mlflow.pytorch
-import logging
import os
import torch
from argparse import ArgumentParser
@@ -275,6 +274,8 @@ def configure_optimizers(self):
parser = pl.Trainer.add_argparse_args(parent_parser=parser)
parser = LightningMNISTClassifier.add_model_specific_args(parent_parser=parser)

+mlflow.pytorch.autolog()
+
args = parser.parse_args()
dict_args = vars(args)

@@ -302,24 +303,5 @@ def configure_optimizers(self):
trainer = pl.Trainer.from_argparse_args(
args, callbacks=[lr_logger, early_stopping, checkpoint_callback], checkpoint_callback=True
)
-
-# It is safe to use `mlflow.pytorch.autolog` in DDP training, as below condition invokes
-# autolog with only rank 0 gpu.
-
-# For CPU Training
-if dict_args["gpus"] is None or int(dict_args["gpus"]) == 0:
-    mlflow.pytorch.autolog()
-elif int(dict_args["gpus"]) >= 1 and trainer.global_rank == 0:
-    # In case of multi gpu training, the training script is invoked multiple times,
-    # The following condition is needed to avoid multiple copies of mlflow runs.
-    # When one or more gpus are used for training, it is enough to save
-    # the model and its parameters using rank 0 gpu.
-    mlflow.pytorch.autolog()
-else:
-    # This condition is met only for multi-gpu training when the global rank is non zero.
-    # Since the parameters are already logged using global rank 0 gpu, it is safe to ignore
-    # this condition.
-    logging.info("Active run exists.. ")
-
trainer.fit(model, dm)
trainer.test(datamodule=dm)
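The blocks removed from both scripts, together with the note dropped from models.rst, describe the same guard: in DDP training every worker runs the script, so MLflow calls were restricted to global rank 0 to avoid duplicate runs and duplicate model copies. A small sketch of that guard, kept here only to document what the revert removes (the function name and artifact path are illustrative):

    import mlflow.pytorch


    def log_model_from_rank_zero(trainer, model):
        # Every DDP worker executes the training script, but only the
        # global-rank-0 process logs the model, so the run ends up with one copy.
        if trainer.global_rank == 0:
            mlflow.pytorch.log_model(model, "model")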
