Fix sdxl inference and add compatible changes for launcher support (#9995)

* Fix open_clip_torch version

Signed-off-by: Mingyuan Ma <[email protected]>

* Update sdxl inference scripts

Signed-off-by: Mingyuan Ma <[email protected]>

* Apply isort and black reformatting

Signed-off-by: Victor49152 <[email protected]>

* set ddp to false

Signed-off-by: Mingyuan Ma <[email protected]>

---------

Signed-off-by: Mingyuan Ma <[email protected]>
Signed-off-by: Victor49152 <[email protected]>
Co-authored-by: Victor49152 <[email protected]>
2 people authored and monica-sekoyan committed Oct 11, 2024
1 parent b2f77e4 commit b50b30e
Showing 4 changed files with 37 additions and 36 deletions.
@@ -153,6 +153,7 @@ model:
   resume_from_checkpoint: null # manually set the checkpoint file to load from
   apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this
   gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)
+  ddp_overlap: False # True for using PyTorch DDP overlap.
 
   optim:
     name: fused_adam
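
The new `ddp_overlap` flag is read when the launcher assembles the distributed training strategy. A hedged sketch of the usual pattern (the exact NeMo wiring may differ; `build_strategy` is an invented helper, while the two strategy classes are the real PyTorch Lightning and NeMo ones):

    # Hedged sketch, not verbatim NeMo code: branch on the config flag to
    # pick a distributed strategy. Plain DDPStrategy relies on PyTorch's
    # built-in overlap of gradient all-reduce with the backward pass;
    # NLPDDPStrategy is NeMo's Megatron-aware DDP wrapper.
    from pytorch_lightning.strategies import DDPStrategy
    from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy

    def build_strategy(model_cfg):
        if model_cfg.get("ddp_overlap", False):
            return DDPStrategy(gradient_as_bucket_view=True, find_unused_parameters=False)
        return NLPDDPStrategy(no_ddp_communication_hook=True, find_unused_parameters=False)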
@@ -26,10 +26,10 @@ def model_cfg_modifier(model_cfg):
     model_cfg.precision = cfg.trainer.precision
     model_cfg.ckpt_path = None
     model_cfg.inductor = False
-    model_cfg.unet_config.from_pretrained = "/opt/nemo-aligner/checkpoints/sdxl/unet_nemo.ckpt"
-    model_cfg.unet_config.from_NeMo = True
-    model_cfg.first_stage_config.from_pretrained = "/opt/nemo-aligner/checkpoints/sdxl/vae_nemo.ckpt"
-    model_cfg.first_stage_config.from_NeMo = True
+    # model_cfg.unet_config.from_pretrained = "/opt/nemo-aligner/checkpoints/sdxl/unet_nemo.ckpt"
+    # model_cfg.unet_config.from_NeMo = True
+    # model_cfg.first_stage_config.from_pretrained = "/opt/nemo-aligner/checkpoints/sdxl/vae_nemo.ckpt"
+    # model_cfg.first_stage_config.from_NeMo = True
     model_cfg.first_stage_config._target_ = 'nemo.collections.multimodal.models.text_to_image.stable_diffusion.ldm.autoencoder.AutoencoderKLInferenceWrapper'
     # model_cfg.fsdp = True
 
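
With the hard-coded container paths commented out, the UNet and VAE weights now come from the restored .nemo checkpoint itself rather than from separately exported files. If external weights are still wanted, the same overrides could be driven by the inference config instead; a hypothetical variant (the keys `unet_ckpt` and `vae_ckpt` are invented, and `cfg` is the script-level Hydra config exactly as in the original modifier):

    # Hypothetical: take optional override paths from the inference config
    # instead of hard-coding /opt/nemo-aligner paths in the script.
    def model_cfg_modifier(model_cfg):
        model_cfg.precision = cfg.trainer.precision
        model_cfg.ckpt_path = None
        model_cfg.inductor = False
        unet_ckpt = cfg.model.get("unet_ckpt")  # invented config key
        if unet_ckpt is not None:
            model_cfg.unet_config.from_pretrained = unet_ckpt
            model_cfg.unet_config.from_NeMo = True
        vae_ckpt = cfg.model.get("vae_ckpt")  # invented config key
        if vae_ckpt is not None:
            model_cfg.first_stage_config.from_pretrained = vae_ckpt
            model_cfg.first_stage_config.from_NeMo = True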
62 changes: 31 additions & 31 deletions nemo/collections/multimodal/parts/utils.py
@@ -303,41 +303,41 @@ def setup_trainer_and_model_for_inference(
 
     # Create the NLPSaveRestoreConnector object for model saving and restoring.
     save_restore_connector = NLPSaveRestoreConnector()
-    if cfg.model.restore_from_path.endswith(".nemo") or os.path.isdir(cfg.model.restore_from_path):
-        # Set the model_extracted_dir attribute if the restore path is a directory.
-        if os.path.isdir(cfg.model.restore_from_path):
-            save_restore_connector.model_extracted_dir = cfg.model.restore_from_path
+    if cfg.model.restore_from_path is not None:
+        if cfg.model.restore_from_path.endswith(".nemo") or os.path.isdir(cfg.model.restore_from_path):
+            # Set the model_extracted_dir attribute if the restore path is a directory.
+            if os.path.isdir(cfg.model.restore_from_path):
+                save_restore_connector.model_extracted_dir = cfg.model.restore_from_path
 
-        # Restore the model configuration from the specified path and modify it for inference.
-        model_cfg = model_provider.restore_from(
-            restore_path=cfg.model.restore_from_path,
-            trainer=trainer,
-            save_restore_connector=save_restore_connector,
-            return_config=True,
-        )
-        with open_dict(model_cfg):
-            model_cfg_modifier(model_cfg)  # modify the configuration for inference
+            # Restore the model configuration from the specified path and modify it for inference.
+            model_cfg = model_provider.restore_from(
+                restore_path=cfg.model.restore_from_path,
+                trainer=trainer,
+                save_restore_connector=save_restore_connector,
+                return_config=True,
+            )
+            with open_dict(model_cfg):
+                model_cfg_modifier(model_cfg)  # modify the configuration for inference
 
-        # Restore the model from the specified path and configuration, and set it up for inference.
-        model = model_provider.restore_from(
-            restore_path=cfg.model.restore_from_path,
-            trainer=trainer,
-            override_config_path=model_cfg,
-            save_restore_connector=save_restore_connector,
-            strict=True,
-        )
+            # Restore the model from the specified path and configuration, and set it up for inference.
+            model = model_provider.restore_from(
+                restore_path=cfg.model.restore_from_path,
+                trainer=trainer,
+                override_config_path=model_cfg,
+                save_restore_connector=save_restore_connector,
+                strict=True,
+            )
 
-    elif cfg.model.restore_from_path.endswith(".ckpt"):
-        logging.warning(
-            "Loading from .ckpt checkpoint for inference is experimental! It doesn't support models with model parallelism!"
-        )
+        elif cfg.model.restore_from_path.endswith(".ckpt"):
+            logging.warning(
+                "Loading from .ckpt checkpoint for inference is experimental! It doesn't support models with model parallelism!"
+            )
 
-        model = model_provider.load_from_checkpoint(
-            cfg.model.restore_from_path,
-            hparams_file=cfg.model.get("hparams_file"),
-            trainer=trainer,
-        )
+            model = model_provider.load_from_checkpoint(
+                cfg.model.restore_from_path,
+                hparams_file=cfg.model.get("hparams_file"),
+                trainer=trainer,
+            )
 
     else:
         # load a model from scratch
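
The practical effect of the new outer guard: a launcher that leaves `model.restore_from_path` unset no longer crashes on `None.endswith(".nemo")`; a `None` path now falls through to the load-from-scratch branch. A sketch of a typical call site (assuming `MegatronDiffusionEngine` as the SDXL model class and a Hydra-provided `cfg`; not verbatim NeMo code):

    # Sketch of an inference script driving the patched helper.
    from nemo.collections.multimodal.models.text_to_image.stable_diffusion.diffusion_engine import (
        MegatronDiffusionEngine,  # assumed SDXL model class
    )
    from nemo.collections.multimodal.parts.utils import setup_trainer_and_model_for_inference

    trainer, model = setup_trainer_and_model_for_inference(
        model_provider=MegatronDiffusionEngine,
        cfg=cfg,  # cfg.model.restore_from_path may now legitimately be None
        model_cfg_modifier=model_cfg_modifier,  # e.g. the modifier shown earlier
    )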
2 changes: 1 addition & 1 deletion requirements/requirements_multimodal.txt
@@ -6,7 +6,7 @@ einops_exts
 imageio
 kornia
 nerfacc>=0.5.3
-open_clip_torch
+open_clip_torch==2.24.0
 PyMCubes
 taming-transformers
 torchdiffeq
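
Pinning `open_clip_torch` to an exact version (the "Fix open_clip_torch version" commit above) keeps a newer open_clip release with changed APIs from breaking the SDXL inference path. A quick post-install sanity check (the package installs under the import name `open_clip`):

    # Verify the pinned requirement is what actually got installed.
    import open_clip

    assert open_clip.__version__ == "2.24.0", open_clip.__version__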
