From 8696fbb43a5ab52796cea68f50aaab300b453d72 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Sat, 2 Nov 2024 10:04:00 +0530 Subject: [PATCH 01/27] allow big lora tests to run on the CI. --- tests/lora/test_lora_layers_flux.py | 11 +++++++++-- tests/lora/test_lora_layers_sd3.py | 12 +++++++++++- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/tests/lora/test_lora_layers_flux.py b/tests/lora/test_lora_layers_flux.py index b58525cc7a6f..760b33ddb2e8 100644 --- a/tests/lora/test_lora_layers_flux.py +++ b/tests/lora/test_lora_layers_flux.py @@ -31,9 +31,12 @@ numpy_cosine_similarity_distance, require_peft_backend, require_torch_gpu, + require_big_gpu_with_torch_cuda, slow, torch_device, + print_tensor_test ) +import pytest if is_peft_available(): @@ -169,8 +172,8 @@ def test_modify_padding_mode(self): @nightly @require_torch_gpu @require_peft_backend -@unittest.skip("We cannot run inference on this model with the current CI hardware") -# TODO (DN6, sayakpaul): move these tests to a beefier GPU +@require_big_gpu_with_torch_cuda +@pytest.mark.big_gpu_with_torch_cuda class FluxLoRAIntegrationTests(unittest.TestCase): """internal note: The integration slices were obtained on audace. @@ -211,6 +214,7 @@ def test_flux_the_last_ben(self): generator=torch.manual_seed(self.seed), ).images out_slice = out[0, -3:, -3:, -1].flatten() + print_tensor_test(out_slice) expected_slice = np.array([0.1855, 0.1855, 0.1836, 0.1855, 0.1836, 0.1875, 0.1777, 0.1758, 0.2246]) max_diff = numpy_cosine_similarity_distance(expected_slice.flatten(), out_slice) @@ -233,6 +237,7 @@ def test_flux_kohya(self): ).images out_slice = out[0, -3:, -3:, -1].flatten() + print_tensor_test(out_slice) expected_slice = np.array([0.6367, 0.6367, 0.6328, 0.6367, 0.6328, 0.6289, 0.6367, 0.6328, 0.6484]) max_diff = numpy_cosine_similarity_distance(expected_slice.flatten(), out_slice) @@ -255,6 +260,7 @@ def test_flux_kohya_with_text_encoder(self): ).images out_slice = out[0, -3:, -3:, -1].flatten() + print_tensor_test(out_slice) expected_slice = np.array([0.4023, 0.4023, 0.4023, 0.3965, 0.3984, 0.3965, 0.3926, 0.3906, 0.4219]) max_diff = numpy_cosine_similarity_distance(expected_slice.flatten(), out_slice) @@ -277,6 +283,7 @@ def test_flux_xlabs(self): generator=torch.manual_seed(self.seed), ).images out_slice = out[0, -3:, -3:, -1].flatten() + print_tensor_test(out_slice) expected_slice = np.array([0.3965, 0.4180, 0.4434, 0.4082, 0.4375, 0.4590, 0.4141, 0.4375, 0.4980]) max_diff = numpy_cosine_similarity_distance(expected_slice.flatten(), out_slice) diff --git a/tests/lora/test_lora_layers_sd3.py b/tests/lora/test_lora_layers_sd3.py index 78d4b786d21b..d86d731774ff 100644 --- a/tests/lora/test_lora_layers_sd3.py +++ b/tests/lora/test_lora_layers_sd3.py @@ -34,7 +34,12 @@ require_peft_backend, require_torch_gpu, torch_device, + slow, + nightly, + require_big_gpu_with_torch_cuda, + print_tensor_test ) +import pytest if is_peft_available(): @@ -130,9 +135,13 @@ def test_modify_padding_mode(self): pass +@slow +@nightly @require_torch_gpu @require_peft_backend -class LoraSD3IntegrationTests(unittest.TestCase): +@require_big_gpu_with_torch_cuda +@pytest.mark.big_gpu_with_torch_cuda +class SD3LoraIntegrationTests(unittest.TestCase): pipeline_class = StableDiffusion3Img2ImgPipeline repo_id = "stabilityai/stable-diffusion-3-medium-diffusers" @@ -173,6 +182,7 @@ def test_sd3_img2img_lora(self): image = pipe(**inputs).images[0] image_slice = image[0, :10, :10] + print_tensor_test(image[0, -3:, -3:, -1].flatten()) expected_slice = 
np.array( [ 0.47827148, From 06b3919c37dedace8f812b988741d3e425fd3aec Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Sat, 2 Nov 2024 14:56:13 +0530 Subject: [PATCH 02/27] print --- src/diffusers/pipelines/flux/pipeline_flux.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/diffusers/pipelines/flux/pipeline_flux.py b/src/diffusers/pipelines/flux/pipeline_flux.py index 040d935f1b88..19175b56cadf 100644 --- a/src/diffusers/pipelines/flux/pipeline_flux.py +++ b/src/diffusers/pipelines/flux/pipeline_flux.py @@ -776,7 +776,9 @@ def __call__( image = self.image_processor.postprocess(image, output_type=output_type) # Offload all models + print("before maybe") self.maybe_free_model_hooks() + print("after maybe") if not return_dict: return (image,) From b062bd9b9161230f5655a9928125da85425791fa Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Sat, 2 Nov 2024 15:04:00 +0530 Subject: [PATCH 03/27] print. --- src/diffusers/pipelines/pipeline_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index 2e1858b16148..aa2c7d18fb4d 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -983,6 +983,7 @@ def remove_all_hooks(self): r""" Removes all hooks that were added when using `enable_sequential_cpu_offload` or `enable_model_cpu_offload`. """ + print("Within remove_all_hooks().") for _, model in self.components.items(): if isinstance(model, torch.nn.Module) and hasattr(model, "_hf_hook"): accelerate.hooks.remove_hook_from_module(model, recurse=True) From 360935c04ff80757a5cdd637301f8181538ed006 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Sat, 2 Nov 2024 15:10:47 +0530 Subject: [PATCH 04/27] print --- src/diffusers/pipelines/pipeline_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index aa2c7d18fb4d..9e2dab68fa60 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -986,6 +986,7 @@ def remove_all_hooks(self): print("Within remove_all_hooks().") for _, model in self.components.items(): if isinstance(model, torch.nn.Module) and hasattr(model, "_hf_hook"): + print(f"{model.__class__.__name__=}") accelerate.hooks.remove_hook_from_module(model, recurse=True) self._all_hooks = [] From 1d1248ac1783fc519c1da04980585dcb2557371b Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Sat, 2 Nov 2024 15:19:53 +0530 Subject: [PATCH 05/27] print --- src/diffusers/pipelines/pipeline_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index 9e2dab68fa60..007870d81fa1 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -988,6 +988,7 @@ def remove_all_hooks(self): if isinstance(model, torch.nn.Module) and hasattr(model, "_hf_hook"): print(f"{model.__class__.__name__=}") accelerate.hooks.remove_hook_from_module(model, recurse=True) + print("Done removing from the current model.") self._all_hooks = [] def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"): From f5550e35a2503c3f9ad764aa8ee45b99cd49a1b0 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Sat, 2 Nov 2024 15:28:59 +0530 Subject: [PATCH 06/27] print --- src/diffusers/pipelines/pipeline_utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git 
a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index 007870d81fa1..9084cb635f17 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -990,6 +990,7 @@ def remove_all_hooks(self): accelerate.hooks.remove_hook_from_module(model, recurse=True) print("Done removing from the current model.") self._all_hooks = [] + print("Done in remove.") def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"): r""" @@ -1064,6 +1065,7 @@ def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[t _, hook = cpu_offload_with_hook(model, device, prev_module_hook=hook) self._all_hooks.append(hook) + print("Initial hooks appended.") # CPU offload models that are not in the seq chain unless they are explicitly excluded # these models will stay on CPU until maybe_free_model_hooks is called @@ -1077,6 +1079,7 @@ def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[t else: _, hook = cpu_offload_with_hook(model, device) self._all_hooks.append(hook) + print("Done second time.") def maybe_free_model_hooks(self): r""" From 4cd5a3cd629143143a162831193c07e22dcb027b Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Sat, 2 Nov 2024 15:37:27 +0530 Subject: [PATCH 07/27] print --- src/diffusers/pipelines/pipeline_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index 9084cb635f17..c8e193c2cf37 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -1050,6 +1050,7 @@ def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[t self._all_hooks = [] hook = None for model_str in self.model_cpu_offload_seq.split("->"): + print(f"Entering with {model_str}") model = all_model_components.pop(model_str, None) if not isinstance(model, torch.nn.Module): @@ -1079,8 +1080,7 @@ def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[t else: _, hook = cpu_offload_with_hook(model, device) self._all_hooks.append(hook) - print("Done second time.") - + def maybe_free_model_hooks(self): r""" Function that offloads all components, removes all model hooks that were added when using From a901420bedfb8b6a1988a901e62f7229ef747bbe Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Sat, 2 Nov 2024 15:45:48 +0530 Subject: [PATCH 08/27] more --- src/diffusers/pipelines/pipeline_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index c8e193c2cf37..64d74a3bd286 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -1044,7 +1044,7 @@ def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[t device_mod = getattr(torch, device.type, None) if hasattr(device_mod, "empty_cache") and device_mod.is_available(): device_mod.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - + print("Empty cache called.") all_model_components = {k: v for k, v in self.components.items() if isinstance(v, torch.nn.Module)} self._all_hooks = [] From d659f1ccf0a61887579e642d8fc12f653b447ed6 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Sat, 2 Nov 2024 15:52:31 +0530 Subject: [PATCH 09/27] print --- src/diffusers/pipelines/pipeline_utils.py | 3 +++ 1 file changed, 3 insertions(+) diff 
--git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index 64d74a3bd286..0de84c8e02fd 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -1039,9 +1039,12 @@ def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[t device_type = torch_device.type device = torch.device(f"{device_type}:{self._offload_gpu_id}") self._offload_device = device + print("Initial assignments done.") self.to("cpu", silence_dtype_warnings=True) + print("placed on CPU.") device_mod = getattr(torch, device.type, None) + print(f"{device=}") if hasattr(device_mod, "empty_cache") and device_mod.is_available(): device_mod.empty_cache() # otherwise we don't see the memory savings (but they probably exist) print("Empty cache called.") From 96d27ff0ec600f44050c0b8ca6b74640b2c76f60 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Sat, 2 Nov 2024 16:10:49 +0530 Subject: [PATCH 10/27] remove print. --- src/diffusers/pipelines/pipeline_utils.py | 13 ++----------- tests/lora/test_lora_layers_flux.py | 6 +++--- tests/lora/test_lora_layers_sd3.py | 10 +++++----- 3 files changed, 10 insertions(+), 19 deletions(-) diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index 0de84c8e02fd..b56f9f472f01 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -983,14 +983,11 @@ def remove_all_hooks(self): r""" Removes all hooks that were added when using `enable_sequential_cpu_offload` or `enable_model_cpu_offload`. """ - print("Within remove_all_hooks().") for _, model in self.components.items(): if isinstance(model, torch.nn.Module) and hasattr(model, "_hf_hook"): - print(f"{model.__class__.__name__=}") accelerate.hooks.remove_hook_from_module(model, recurse=True) - print("Done removing from the current model.") + self._all_hooks = [] - print("Done in remove.") def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"): r""" @@ -1039,21 +1036,16 @@ def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[t device_type = torch_device.type device = torch.device(f"{device_type}:{self._offload_gpu_id}") self._offload_device = device - print("Initial assignments done.") self.to("cpu", silence_dtype_warnings=True) - print("placed on CPU.") device_mod = getattr(torch, device.type, None) - print(f"{device=}") if hasattr(device_mod, "empty_cache") and device_mod.is_available(): device_mod.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - print("Empty cache called.") all_model_components = {k: v for k, v in self.components.items() if isinstance(v, torch.nn.Module)} self._all_hooks = [] hook = None for model_str in self.model_cpu_offload_seq.split("->"): - print(f"Entering with {model_str}") model = all_model_components.pop(model_str, None) if not isinstance(model, torch.nn.Module): @@ -1069,7 +1061,6 @@ def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[t _, hook = cpu_offload_with_hook(model, device, prev_module_hook=hook) self._all_hooks.append(hook) - print("Initial hooks appended.") # CPU offload models that are not in the seq chain unless they are explicitly excluded # these models will stay on CPU until maybe_free_model_hooks is called @@ -1083,7 +1074,7 @@ def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[t else: _, hook = cpu_offload_with_hook(model, device) 
self._all_hooks.append(hook) - + def maybe_free_model_hooks(self): r""" Function that offloads all components, removes all model hooks that were added when using diff --git a/tests/lora/test_lora_layers_flux.py b/tests/lora/test_lora_layers_flux.py index 760b33ddb2e8..fa7136601c69 100644 --- a/tests/lora/test_lora_layers_flux.py +++ b/tests/lora/test_lora_layers_flux.py @@ -19,6 +19,7 @@ import unittest import numpy as np +import pytest import safetensors.torch import torch from transformers import AutoTokenizer, CLIPTextModel, CLIPTokenizer, T5EncoderModel @@ -29,14 +30,13 @@ is_peft_available, nightly, numpy_cosine_similarity_distance, + print_tensor_test, + require_big_gpu_with_torch_cuda, require_peft_backend, require_torch_gpu, - require_big_gpu_with_torch_cuda, slow, torch_device, - print_tensor_test ) -import pytest if is_peft_available(): diff --git a/tests/lora/test_lora_layers_sd3.py b/tests/lora/test_lora_layers_sd3.py index d86d731774ff..d718c236d69d 100644 --- a/tests/lora/test_lora_layers_sd3.py +++ b/tests/lora/test_lora_layers_sd3.py @@ -17,6 +17,7 @@ import unittest import numpy as np +import pytest import torch from transformers import AutoTokenizer, CLIPTextModelWithProjection, CLIPTokenizer, T5EncoderModel @@ -30,16 +31,15 @@ from diffusers.utils.import_utils import is_accelerate_available from diffusers.utils.testing_utils import ( is_peft_available, + nightly, numpy_cosine_similarity_distance, + print_tensor_test, + require_big_gpu_with_torch_cuda, require_peft_backend, require_torch_gpu, - torch_device, slow, - nightly, - require_big_gpu_with_torch_cuda, - print_tensor_test + torch_device, ) -import pytest if is_peft_available(): From 8510f98b8d175c310b26134447121b802e91a99e Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Sat, 2 Nov 2024 16:11:21 +0530 Subject: [PATCH 11/27] remove print --- src/diffusers/pipelines/flux/pipeline_flux.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/diffusers/pipelines/flux/pipeline_flux.py b/src/diffusers/pipelines/flux/pipeline_flux.py index 19175b56cadf..ebaa6bbdc359 100644 --- a/src/diffusers/pipelines/flux/pipeline_flux.py +++ b/src/diffusers/pipelines/flux/pipeline_flux.py @@ -776,10 +776,8 @@ def __call__( image = self.image_processor.postprocess(image, output_type=output_type) # Offload all models - print("before maybe") self.maybe_free_model_hooks() - print("after maybe") - + if not return_dict: return (image,) From 9fe7b91b42bd06b1716f19c19c596cc5cce82f48 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Sat, 2 Nov 2024 16:16:36 +0530 Subject: [PATCH 12/27] directly place on cuda. 
--- src/diffusers/pipelines/flux/pipeline_flux.py | 2 +- tests/lora/test_lora_layers_flux.py | 11 +++++++---- tests/lora/test_lora_layers_sd3.py | 4 +++- tests/pipelines/flux/test_pipeline_flux.py | 3 +-- 4 files changed, 12 insertions(+), 8 deletions(-) diff --git a/src/diffusers/pipelines/flux/pipeline_flux.py b/src/diffusers/pipelines/flux/pipeline_flux.py index ebaa6bbdc359..040d935f1b88 100644 --- a/src/diffusers/pipelines/flux/pipeline_flux.py +++ b/src/diffusers/pipelines/flux/pipeline_flux.py @@ -777,7 +777,7 @@ def __call__( # Offload all models self.maybe_free_model_hooks() - + if not return_dict: return (image,) diff --git a/tests/lora/test_lora_layers_flux.py b/tests/lora/test_lora_layers_flux.py index fa7136601c69..789db2b5c582 100644 --- a/tests/lora/test_lora_layers_flux.py +++ b/tests/lora/test_lora_layers_flux.py @@ -202,7 +202,7 @@ def test_flux_the_last_ben(self): self.pipeline.load_lora_weights("TheLastBen/Jon_Snow_Flux_LoRA", weight_name="jon_snow.safetensors") self.pipeline.fuse_lora() self.pipeline.unload_lora_weights() - self.pipeline.enable_model_cpu_offload() + self.pipeline = self.pipeline.to("cuda") prompt = "jon snow eating pizza with ketchup" @@ -225,7 +225,10 @@ def test_flux_kohya(self): self.pipeline.load_lora_weights("Norod78/brain-slug-flux") self.pipeline.fuse_lora() self.pipeline.unload_lora_weights() - self.pipeline.enable_model_cpu_offload() + # Instead of calling `enable_model_cpu_offload()`, we do a cuda placement here because the CI + # run supports it. We have about 34GB RAM in the CI runner which kills the test when run with + # `enable_model_cpu_offload()`. + self.pipeline = self.pipeline.to("cuda") prompt = "The cat with a brain slug earring" out = self.pipeline( @@ -248,7 +251,7 @@ def test_flux_kohya_with_text_encoder(self): self.pipeline.load_lora_weights("cocktailpeanut/optimus", weight_name="optimus.safetensors") self.pipeline.fuse_lora() self.pipeline.unload_lora_weights() - self.pipeline.enable_model_cpu_offload() + self.pipeline = self.pipeline.to("cuda") prompt = "optimus is cleaning the house with broomstick" out = self.pipeline( @@ -271,7 +274,7 @@ def test_flux_xlabs(self): self.pipeline.load_lora_weights("XLabs-AI/flux-lora-collection", weight_name="disney_lora.safetensors") self.pipeline.fuse_lora() self.pipeline.unload_lora_weights() - self.pipeline.enable_model_cpu_offload() + self.pipeline = self.pipeline.to("cuda") prompt = "A blue jay standing on a large basket of rainbow macarons, disney style" diff --git a/tests/lora/test_lora_layers_sd3.py b/tests/lora/test_lora_layers_sd3.py index d718c236d69d..19fe8cbb732b 100644 --- a/tests/lora/test_lora_layers_sd3.py +++ b/tests/lora/test_lora_layers_sd3.py @@ -176,7 +176,9 @@ def get_inputs(self, device, seed=0): def test_sd3_img2img_lora(self): pipe = self.pipeline_class.from_pretrained(self.repo_id, torch_dtype=torch.float16) pipe.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors") - pipe.enable_sequential_cpu_offload() + pipe.fuse_lora() + pipe.unload_lora_weights() + pipe = pipe.to("cuda") inputs = self.get_inputs(torch_device) diff --git a/tests/pipelines/flux/test_pipeline_flux.py b/tests/pipelines/flux/test_pipeline_flux.py index 3ccf3f80ba3c..88997de5f5fb 100644 --- a/tests/pipelines/flux/test_pipeline_flux.py +++ b/tests/pipelines/flux/test_pipeline_flux.py @@ -236,8 +236,7 @@ def get_inputs(self, device, seed=0): def test_flux_inference(self): pipe = self.pipeline_class.from_pretrained( self.repo_id, torch_dtype=torch.bfloat16, 
text_encoder=None, text_encoder_2=None - ) - pipe.enable_model_cpu_offload() + ).to("cuda") inputs = self.get_inputs(torch_device) From 286af0e87d44388417725622dd5c01f2edd245e7 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Sat, 2 Nov 2024 16:24:31 +0530 Subject: [PATCH 13/27] remove pipeline. --- tests/lora/test_lora_layers_flux.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/lora/test_lora_layers_flux.py b/tests/lora/test_lora_layers_flux.py index 789db2b5c582..390a3deed491 100644 --- a/tests/lora/test_lora_layers_flux.py +++ b/tests/lora/test_lora_layers_flux.py @@ -195,6 +195,7 @@ def setUp(self): def tearDown(self): super().tearDown() + del self.pipeline gc.collect() torch.cuda.empty_cache() From 741a44fb5dc822421ebdcbe7d25e65577bf1505f Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Sat, 2 Nov 2024 16:36:49 +0530 Subject: [PATCH 14/27] remove --- tests/lora/test_lora_layers_flux.py | 4 ---- tests/lora/test_lora_layers_sd3.py | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/tests/lora/test_lora_layers_flux.py b/tests/lora/test_lora_layers_flux.py index 390a3deed491..b3b2e9d222d7 100644 --- a/tests/lora/test_lora_layers_flux.py +++ b/tests/lora/test_lora_layers_flux.py @@ -215,7 +215,6 @@ def test_flux_the_last_ben(self): generator=torch.manual_seed(self.seed), ).images out_slice = out[0, -3:, -3:, -1].flatten() - print_tensor_test(out_slice) expected_slice = np.array([0.1855, 0.1855, 0.1836, 0.1855, 0.1836, 0.1875, 0.1777, 0.1758, 0.2246]) max_diff = numpy_cosine_similarity_distance(expected_slice.flatten(), out_slice) @@ -241,7 +240,6 @@ def test_flux_kohya(self): ).images out_slice = out[0, -3:, -3:, -1].flatten() - print_tensor_test(out_slice) expected_slice = np.array([0.6367, 0.6367, 0.6328, 0.6367, 0.6328, 0.6289, 0.6367, 0.6328, 0.6484]) max_diff = numpy_cosine_similarity_distance(expected_slice.flatten(), out_slice) @@ -264,7 +262,6 @@ def test_flux_kohya_with_text_encoder(self): ).images out_slice = out[0, -3:, -3:, -1].flatten() - print_tensor_test(out_slice) expected_slice = np.array([0.4023, 0.4023, 0.4023, 0.3965, 0.3984, 0.3965, 0.3926, 0.3906, 0.4219]) max_diff = numpy_cosine_similarity_distance(expected_slice.flatten(), out_slice) @@ -287,7 +284,6 @@ def test_flux_xlabs(self): generator=torch.manual_seed(self.seed), ).images out_slice = out[0, -3:, -3:, -1].flatten() - print_tensor_test(out_slice) expected_slice = np.array([0.3965, 0.4180, 0.4434, 0.4082, 0.4375, 0.4590, 0.4141, 0.4375, 0.4980]) max_diff = numpy_cosine_similarity_distance(expected_slice.flatten(), out_slice) diff --git a/tests/lora/test_lora_layers_sd3.py b/tests/lora/test_lora_layers_sd3.py index 19fe8cbb732b..04ba923a9d8e 100644 --- a/tests/lora/test_lora_layers_sd3.py +++ b/tests/lora/test_lora_layers_sd3.py @@ -184,7 +184,7 @@ def test_sd3_img2img_lora(self): image = pipe(**inputs).images[0] image_slice = image[0, :10, :10] - print_tensor_test(image[0, -3:, -3:, -1].flatten()) + print_tensor_test(image[0, :10, :10].flatten()) expected_slice = np.array( [ 0.47827148, From e818907ece93960cb54ba727abd39e4ab69d6a75 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Sat, 2 Nov 2024 16:39:02 +0530 Subject: [PATCH 15/27] fix --- tests/lora/test_lora_layers_flux.py | 3 +-- tests/lora/test_lora_layers_sd3.py | 2 -- tests/pipelines/flux/test_pipeline_flux.py | 2 ++ 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/lora/test_lora_layers_flux.py b/tests/lora/test_lora_layers_flux.py index b3b2e9d222d7..880fe70e8531 100644 --- a/tests/lora/test_lora_layers_flux.py +++ 
b/tests/lora/test_lora_layers_flux.py @@ -30,7 +30,6 @@ is_peft_available, nightly, numpy_cosine_similarity_distance, - print_tensor_test, require_big_gpu_with_torch_cuda, require_peft_backend, require_torch_gpu, @@ -195,7 +194,7 @@ def setUp(self): def tearDown(self): super().tearDown() - del self.pipeline + del self.pipeline gc.collect() torch.cuda.empty_cache() diff --git a/tests/lora/test_lora_layers_sd3.py b/tests/lora/test_lora_layers_sd3.py index 04ba923a9d8e..584a42fc7879 100644 --- a/tests/lora/test_lora_layers_sd3.py +++ b/tests/lora/test_lora_layers_sd3.py @@ -33,7 +33,6 @@ is_peft_available, nightly, numpy_cosine_similarity_distance, - print_tensor_test, require_big_gpu_with_torch_cuda, require_peft_backend, require_torch_gpu, @@ -184,7 +183,6 @@ def test_sd3_img2img_lora(self): image = pipe(**inputs).images[0] image_slice = image[0, :10, :10] - print_tensor_test(image[0, :10, :10].flatten()) expected_slice = np.array( [ 0.47827148, diff --git a/tests/pipelines/flux/test_pipeline_flux.py b/tests/pipelines/flux/test_pipeline_flux.py index 88997de5f5fb..23e0f18c0009 100644 --- a/tests/pipelines/flux/test_pipeline_flux.py +++ b/tests/pipelines/flux/test_pipeline_flux.py @@ -9,6 +9,7 @@ from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler, FluxPipeline, FluxTransformer2DModel from diffusers.utils.testing_utils import ( + nightly, numpy_cosine_similarity_distance, require_big_gpu_with_torch_cuda, slow, @@ -193,6 +194,7 @@ def test_fused_qkv_projections(self): @slow +@nightly @require_big_gpu_with_torch_cuda @pytest.mark.big_gpu_with_torch_cuda class FluxPipelineSlowTests(unittest.TestCase): From 3ed98a12994267866572b2f56751509f9151fa72 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Sat, 2 Nov 2024 16:45:11 +0530 Subject: [PATCH 16/27] fix --- tests/lora/test_lora_layers_flux.py | 8 ++++---- tests/lora/test_lora_layers_sd3.py | 2 +- tests/pipelines/flux/test_pipeline_flux.py | 11 ++++------- 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/tests/lora/test_lora_layers_flux.py b/tests/lora/test_lora_layers_flux.py index 880fe70e8531..63aed8bbce2a 100644 --- a/tests/lora/test_lora_layers_flux.py +++ b/tests/lora/test_lora_layers_flux.py @@ -202,7 +202,7 @@ def test_flux_the_last_ben(self): self.pipeline.load_lora_weights("TheLastBen/Jon_Snow_Flux_LoRA", weight_name="jon_snow.safetensors") self.pipeline.fuse_lora() self.pipeline.unload_lora_weights() - self.pipeline = self.pipeline.to("cuda") + self.pipeline = self.pipeline.to(torch_device) prompt = "jon snow eating pizza with ketchup" @@ -227,7 +227,7 @@ def test_flux_kohya(self): # Instead of calling `enable_model_cpu_offload()`, we do a cuda placement here because the CI # run supports it. We have about 34GB RAM in the CI runner which kills the test when run with # `enable_model_cpu_offload()`. 
- self.pipeline = self.pipeline.to("cuda") + self.pipeline = self.pipeline.to(torch_device) prompt = "The cat with a brain slug earring" out = self.pipeline( @@ -249,7 +249,7 @@ def test_flux_kohya_with_text_encoder(self): self.pipeline.load_lora_weights("cocktailpeanut/optimus", weight_name="optimus.safetensors") self.pipeline.fuse_lora() self.pipeline.unload_lora_weights() - self.pipeline = self.pipeline.to("cuda") + self.pipeline = self.pipeline.to(torch_device) prompt = "optimus is cleaning the house with broomstick" out = self.pipeline( @@ -271,7 +271,7 @@ def test_flux_xlabs(self): self.pipeline.load_lora_weights("XLabs-AI/flux-lora-collection", weight_name="disney_lora.safetensors") self.pipeline.fuse_lora() self.pipeline.unload_lora_weights() - self.pipeline = self.pipeline.to("cuda") + self.pipeline = self.pipeline.to(torch_device) prompt = "A blue jay standing on a large basket of rainbow macarons, disney style" diff --git a/tests/lora/test_lora_layers_sd3.py b/tests/lora/test_lora_layers_sd3.py index 584a42fc7879..87e9ded3f1c4 100644 --- a/tests/lora/test_lora_layers_sd3.py +++ b/tests/lora/test_lora_layers_sd3.py @@ -177,7 +177,7 @@ def test_sd3_img2img_lora(self): pipe.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors") pipe.fuse_lora() pipe.unload_lora_weights() - pipe = pipe.to("cuda") + pipe = pipe.to(torch_device) inputs = self.get_inputs(torch_device) diff --git a/tests/pipelines/flux/test_pipeline_flux.py b/tests/pipelines/flux/test_pipeline_flux.py index 23e0f18c0009..cb8146160f36 100644 --- a/tests/pipelines/flux/test_pipeline_flux.py +++ b/tests/pipelines/flux/test_pipeline_flux.py @@ -212,19 +212,16 @@ def tearDown(self): torch.cuda.empty_cache() def get_inputs(self, device, seed=0): - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device="cpu").manual_seed(seed) + generator = torch.Generator(device="cpu").manual_seed(seed) prompt_embeds = torch.load( hf_hub_download(repo_id="diffusers/test-slices", repo_type="dataset", filename="flux/prompt_embeds.pt") - ) + ).to(torch_device) pooled_prompt_embeds = torch.load( hf_hub_download( repo_id="diffusers/test-slices", repo_type="dataset", filename="flux/pooled_prompt_embeds.pt" ) - ) + ).to(torch_device) return { "prompt_embeds": prompt_embeds, "pooled_prompt_embeds": pooled_prompt_embeds, @@ -238,7 +235,7 @@ def get_inputs(self, device, seed=0): def test_flux_inference(self): pipe = self.pipeline_class.from_pretrained( self.repo_id, torch_dtype=torch.bfloat16, text_encoder=None, text_encoder_2=None - ).to("cuda") + ).to(torch_device) inputs = self.get_inputs(torch_device) From 9124f28bd10fbb8811fc050e8428fe829fc2cddc Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Sat, 2 Nov 2024 17:14:57 +0530 Subject: [PATCH 17/27] spaces --- src/diffusers/pipelines/pipeline_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index b56f9f472f01..a275a7e92969 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -986,7 +986,6 @@ def remove_all_hooks(self): for _, model in self.components.items(): if isinstance(model, torch.nn.Module) and hasattr(model, "_hf_hook"): accelerate.hooks.remove_hook_from_module(model, recurse=True) - self._all_hooks = [] def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"): @@ -1041,6 +1040,7 @@ def 
enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[t device_mod = getattr(torch, device.type, None) if hasattr(device_mod, "empty_cache") and device_mod.is_available(): device_mod.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + all_model_components = {k: v for k, v in self.components.items() if isinstance(v, torch.nn.Module)} self._all_hooks = [] From a938831dcb3186622a7041a60699b708a97d2bd1 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Sat, 2 Nov 2024 17:15:26 +0530 Subject: [PATCH 18/27] quality --- src/diffusers/pipelines/pipeline_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index a275a7e92969..2e1858b16148 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -1040,7 +1040,7 @@ def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[t device_mod = getattr(torch, device.type, None) if hasattr(device_mod, "empty_cache") and device_mod.is_available(): device_mod.empty_cache() # otherwise we don't see the memory savings (but they probably exist) - + all_model_components = {k: v for k, v in self.components.items() if isinstance(v, torch.nn.Module)} self._all_hooks = [] From bd94852dde5fbb10191833cc9e641a301904986b Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Sun, 3 Nov 2024 06:56:48 +0530 Subject: [PATCH 19/27] updates --- tests/lora/test_lora_layers_flux.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/lora/test_lora_layers_flux.py b/tests/lora/test_lora_layers_flux.py index 63aed8bbce2a..cc7913cd611e 100644 --- a/tests/lora/test_lora_layers_flux.py +++ b/tests/lora/test_lora_layers_flux.py @@ -202,6 +202,9 @@ def test_flux_the_last_ben(self): self.pipeline.load_lora_weights("TheLastBen/Jon_Snow_Flux_LoRA", weight_name="jon_snow.safetensors") self.pipeline.fuse_lora() self.pipeline.unload_lora_weights() + # Instead of calling `enable_model_cpu_offload()`, we do a cuda placement here because the CI + # run supports it. We have about 34GB RAM in the CI runner which kills the test when run with + # `enable_model_cpu_offload()`. We repeat this for the other tests, too. self.pipeline = self.pipeline.to(torch_device) prompt = "jon snow eating pizza with ketchup" @@ -224,9 +227,6 @@ def test_flux_kohya(self): self.pipeline.load_lora_weights("Norod78/brain-slug-flux") self.pipeline.fuse_lora() self.pipeline.unload_lora_weights() - # Instead of calling `enable_model_cpu_offload()`, we do a cuda placement here because the CI - # run supports it. We have about 34GB RAM in the CI runner which kills the test when run with - # `enable_model_cpu_offload()`. self.pipeline = self.pipeline.to(torch_device) prompt = "The cat with a brain slug earring" From 176041904e07ab2451748f08abf3af01205f782f Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Sun, 3 Nov 2024 06:58:34 +0530 Subject: [PATCH 20/27] directly place flux controlnet pipeline on cuda. 
--- tests/pipelines/controlnet_flux/test_controlnet_flux.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/pipelines/controlnet_flux/test_controlnet_flux.py b/tests/pipelines/controlnet_flux/test_controlnet_flux.py index 89540232f9cf..06144ce67b42 100644 --- a/tests/pipelines/controlnet_flux/test_controlnet_flux.py +++ b/tests/pipelines/controlnet_flux/test_controlnet_flux.py @@ -35,6 +35,7 @@ numpy_cosine_similarity_distance, require_big_gpu_with_torch_cuda, slow, + nightly, torch_device, ) from diffusers.utils.torch_utils import randn_tensor @@ -183,6 +184,7 @@ def test_xformers_attention_forwardGenerator_pass(self): @slow +@nightly @require_big_gpu_with_torch_cuda @pytest.mark.big_gpu_with_torch_cuda class FluxControlNetPipelineSlowTests(unittest.TestCase): @@ -208,8 +210,7 @@ def test_canny(self): text_encoder_2=None, controlnet=controlnet, torch_dtype=torch.bfloat16, - ) - pipe.enable_model_cpu_offload() + ).to("cuda") pipe.set_progress_bar_config(disable=None) generator = torch.Generator(device="cpu").manual_seed(0) From 021f0deb9b4ecd5ed4b50b3cc3b78c48a3f759ce Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Sun, 3 Nov 2024 06:58:58 +0530 Subject: [PATCH 21/27] torch_device instead of cuda. --- tests/pipelines/controlnet_flux/test_controlnet_flux.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/pipelines/controlnet_flux/test_controlnet_flux.py b/tests/pipelines/controlnet_flux/test_controlnet_flux.py index 06144ce67b42..eb67b5f41b78 100644 --- a/tests/pipelines/controlnet_flux/test_controlnet_flux.py +++ b/tests/pipelines/controlnet_flux/test_controlnet_flux.py @@ -210,7 +210,7 @@ def test_canny(self): text_encoder_2=None, controlnet=controlnet, torch_dtype=torch.bfloat16, - ).to("cuda") + ).to(torch_device) pipe.set_progress_bar_config(disable=None) generator = torch.Generator(device="cpu").manual_seed(0) From ee662cfd3917d1fef092e92bd55ac0311829ceb3 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Sun, 3 Nov 2024 07:06:25 +0530 Subject: [PATCH 22/27] style --- tests/pipelines/controlnet_flux/test_controlnet_flux.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/pipelines/controlnet_flux/test_controlnet_flux.py b/tests/pipelines/controlnet_flux/test_controlnet_flux.py index eb67b5f41b78..f7e1df9f8ca2 100644 --- a/tests/pipelines/controlnet_flux/test_controlnet_flux.py +++ b/tests/pipelines/controlnet_flux/test_controlnet_flux.py @@ -32,10 +32,10 @@ from diffusers.utils import load_image from diffusers.utils.testing_utils import ( enable_full_determinism, + nightly, numpy_cosine_similarity_distance, require_big_gpu_with_torch_cuda, slow, - nightly, torch_device, ) from diffusers.utils.torch_utils import randn_tensor From c46331fe13a0e72dcf60d859449bbe25afa218a7 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Sun, 3 Nov 2024 07:20:41 +0530 Subject: [PATCH 23/27] device placement. 
--- tests/pipelines/controlnet_flux/test_controlnet_flux.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/pipelines/controlnet_flux/test_controlnet_flux.py b/tests/pipelines/controlnet_flux/test_controlnet_flux.py index f7e1df9f8ca2..fe1f192623c5 100644 --- a/tests/pipelines/controlnet_flux/test_controlnet_flux.py +++ b/tests/pipelines/controlnet_flux/test_controlnet_flux.py @@ -220,12 +220,12 @@ def test_canny(self): prompt_embeds = torch.load( hf_hub_download(repo_id="diffusers/test-slices", repo_type="dataset", filename="flux/prompt_embeds.pt") - ) + ).to(torch_device) pooled_prompt_embeds = torch.load( hf_hub_download( repo_id="diffusers/test-slices", repo_type="dataset", filename="flux/pooled_prompt_embeds.pt" ) - ) + ).to(torch_device) output = pipe( prompt_embeds=prompt_embeds, From 207579b529df93718f5b7fdcc7c1f84b1539b9e3 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 5 Nov 2024 23:35:27 +0100 Subject: [PATCH 24/27] fixes --- tests/lora/test_lora_layers_sd3.py | 39 +++--------------------------- 1 file changed, 3 insertions(+), 36 deletions(-) diff --git a/tests/lora/test_lora_layers_sd3.py b/tests/lora/test_lora_layers_sd3.py index 87e9ded3f1c4..c8728c8f7fe8 100644 --- a/tests/lora/test_lora_layers_sd3.py +++ b/tests/lora/test_lora_layers_sd3.py @@ -174,7 +174,7 @@ def get_inputs(self, device, seed=0): def test_sd3_img2img_lora(self): pipe = self.pipeline_class.from_pretrained(self.repo_id, torch_dtype=torch.float16) - pipe.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors") + pipe.load_lora_weights("zwloong/sd3-lora-training-rank16-v2") pipe.fuse_lora() pipe.unload_lora_weights() pipe = pipe.to(torch_device) @@ -182,41 +182,8 @@ def test_sd3_img2img_lora(self): inputs = self.get_inputs(torch_device) image = pipe(**inputs).images[0] - image_slice = image[0, :10, :10] - expected_slice = np.array( - [ - 0.47827148, - 0.5, - 0.71972656, - 0.3955078, - 0.4194336, - 0.69628906, - 0.37036133, - 0.40820312, - 0.6923828, - 0.36450195, - 0.40429688, - 0.6904297, - 0.35595703, - 0.39257812, - 0.68652344, - 0.35498047, - 0.3984375, - 0.68310547, - 0.34716797, - 0.3996582, - 0.6855469, - 0.3388672, - 0.3959961, - 0.6816406, - 0.34033203, - 0.40429688, - 0.6845703, - 0.34228516, - 0.4086914, - 0.6870117, - ] - ) + image_slice = image[0, -3:, -3:] + expected_slice = np.array([0.5396, 0.5776, 0.7432, 0.5151, 0.5586, 0.7383, 0.5537, 0.5933, 0.7153]) max_diff = numpy_cosine_similarity_distance(expected_slice.flatten(), image_slice.flatten()) From 295315c6435d36a33588e91d42321b27db8dbf12 Mon Sep 17 00:00:00 2001 From: Aryan Date: Wed, 20 Nov 2024 04:53:23 +0100 Subject: [PATCH 25/27] add big gpu marker for mochi; rename test correctly --- tests/pipelines/mochi/test_mochi.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/pipelines/mochi/test_mochi.py b/tests/pipelines/mochi/test_mochi.py index 2192c171aa22..dfbe77f04eb7 100644 --- a/tests/pipelines/mochi/test_mochi.py +++ b/tests/pipelines/mochi/test_mochi.py @@ -17,13 +17,16 @@ import unittest import numpy as np +import pytest import torch from transformers import AutoTokenizer, T5EncoderModel from diffusers import AutoencoderKLMochi, FlowMatchEulerDiscreteScheduler, MochiPipeline, MochiTransformer3DModel from diffusers.utils.testing_utils import ( enable_full_determinism, + nightly, numpy_cosine_similarity_distance, + require_big_gpu_with_torch_cuda, require_torch_gpu, slow, torch_device, @@ -261,7 +264,10 @@ def test_vae_tiling(self, 
expected_diff_max: float = 0.2): @slow +@nightly @require_torch_gpu +@require_big_gpu_with_torch_cuda +@pytest.mark.big_gpu_with_torch_cuda class MochiPipelineIntegrationTests(unittest.TestCase): prompt = "A painting of a squirrel eating a burger." @@ -275,7 +281,7 @@ def tearDown(self): gc.collect() torch.cuda.empty_cache() - def test_cogvideox(self): + def test_mochi(self): generator = torch.Generator("cpu").manual_seed(0) pipe = MochiPipeline.from_pretrained("genmo/mochi-1-preview", torch_dtype=torch.float16) @@ -293,7 +299,7 @@ def test_cogvideox(self): ).frames video = videos[0] - expected_video = torch.randn(1, 16, 480, 848, 3).numpy() + expected_video = torch.randn(1, 19, 480, 848, 3).numpy() max_diff = numpy_cosine_similarity_distance(video, expected_video) assert max_diff < 1e-3, f"Max diff is too high. got {video}" From 58b79f2c12cfb4ea06d14d7425564eea54cf0c9c Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 2 Dec 2024 14:24:22 +0530 Subject: [PATCH 26/27] address feedback --- tests/lora/test_lora_layers_sd3.py | 2 -- tests/pipelines/controlnet_flux/test_controlnet_flux.py | 2 -- tests/pipelines/flux/test_pipeline_flux.py | 2 -- tests/pipelines/mochi/test_mochi.py | 2 -- 4 files changed, 8 deletions(-) diff --git a/tests/lora/test_lora_layers_sd3.py b/tests/lora/test_lora_layers_sd3.py index c8728c8f7fe8..992a13b22604 100644 --- a/tests/lora/test_lora_layers_sd3.py +++ b/tests/lora/test_lora_layers_sd3.py @@ -36,7 +36,6 @@ require_big_gpu_with_torch_cuda, require_peft_backend, require_torch_gpu, - slow, torch_device, ) @@ -134,7 +133,6 @@ def test_modify_padding_mode(self): pass -@slow @nightly @require_torch_gpu @require_peft_backend diff --git a/tests/pipelines/controlnet_flux/test_controlnet_flux.py b/tests/pipelines/controlnet_flux/test_controlnet_flux.py index 9de7b4b0fa9b..5e856b125f32 100644 --- a/tests/pipelines/controlnet_flux/test_controlnet_flux.py +++ b/tests/pipelines/controlnet_flux/test_controlnet_flux.py @@ -35,7 +35,6 @@ nightly, numpy_cosine_similarity_distance, require_big_gpu_with_torch_cuda, - slow, torch_device, ) from diffusers.utils.torch_utils import randn_tensor @@ -205,7 +204,6 @@ def test_flux_image_output_shape(self): assert (output_height, output_width) == (expected_height, expected_width) -@slow @nightly @require_big_gpu_with_torch_cuda @pytest.mark.big_gpu_with_torch_cuda diff --git a/tests/pipelines/flux/test_pipeline_flux.py b/tests/pipelines/flux/test_pipeline_flux.py index 100c8afb211c..c6905d8a7f22 100644 --- a/tests/pipelines/flux/test_pipeline_flux.py +++ b/tests/pipelines/flux/test_pipeline_flux.py @@ -12,7 +12,6 @@ nightly, numpy_cosine_similarity_distance, require_big_gpu_with_torch_cuda, - slow, torch_device, ) @@ -207,7 +206,6 @@ def test_flux_image_output_shape(self): assert (output_height, output_width) == (expected_height, expected_width) -@slow @nightly @require_big_gpu_with_torch_cuda @pytest.mark.big_gpu_with_torch_cuda diff --git a/tests/pipelines/mochi/test_mochi.py b/tests/pipelines/mochi/test_mochi.py index dfbe77f04eb7..c9df5785897c 100644 --- a/tests/pipelines/mochi/test_mochi.py +++ b/tests/pipelines/mochi/test_mochi.py @@ -28,7 +28,6 @@ numpy_cosine_similarity_distance, require_big_gpu_with_torch_cuda, require_torch_gpu, - slow, torch_device, ) @@ -263,7 +262,6 @@ def test_vae_tiling(self, expected_diff_max: float = 0.2): ) -@slow @nightly @require_torch_gpu @require_big_gpu_with_torch_cuda From 44db42338af37c06ff83344488ab8ca2cc57e63a Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Wed, 25 Dec 2024 16:58:15 +0530 
Subject: [PATCH 27/27] fix --- tests/pipelines/flux/test_pipeline_flux.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/pipelines/flux/test_pipeline_flux.py b/tests/pipelines/flux/test_pipeline_flux.py index a0b2274adcff..ab36333c4056 100644 --- a/tests/pipelines/flux/test_pipeline_flux.py +++ b/tests/pipelines/flux/test_pipeline_flux.py @@ -12,6 +12,7 @@ nightly, numpy_cosine_similarity_distance, require_big_gpu_with_torch_cuda, + slow, torch_device, )
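For reference, a minimal sketch of the test-gating pattern this series converges on. The decorators and helpers (`nightly`, `require_big_gpu_with_torch_cuda`, `numpy_cosine_similarity_distance`, `torch_device`) are the ones imported from `diffusers.utils.testing_utils` in the hunks above, and the `pytest.mark.big_gpu_with_torch_cuda` marker mirrors what the patches add; the class name, checkpoint id, prompt, and expected slice below are illustrative placeholders, not values taken from these patches.

import unittest

import numpy as np
import pytest
import torch

from diffusers import FluxPipeline
from diffusers.utils.testing_utils import (
    nightly,
    numpy_cosine_similarity_distance,
    require_big_gpu_with_torch_cuda,
    torch_device,
)


@nightly
@require_big_gpu_with_torch_cuda
@pytest.mark.big_gpu_with_torch_cuda  # lets `pytest -m big_gpu_with_torch_cuda` select these tests
class ExampleBigGPUIntegrationTests(unittest.TestCase):
    def test_example_inference(self):
        # Illustrative checkpoint; the patches use repo-specific checkpoints per test file.
        pipe = FluxPipeline.from_pretrained(
            "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16
        )
        # Direct device placement instead of enable_model_cpu_offload(): the big-GPU
        # runner has enough VRAM, while offloading exhausted host RAM on the CI machine.
        pipe = pipe.to(torch_device)

        out = pipe(
            "jon snow eating pizza with ketchup",
            num_inference_steps=2,
            guidance_scale=4.0,
            output_type="np",
            generator=torch.manual_seed(0),
        ).images
        out_slice = out[0, -3:, -3:, -1].flatten()

        # Placeholder values; the real tests pin slices obtained on the CI runner.
        expected_slice = np.full(9, 0.5)
        max_diff = numpy_cosine_similarity_distance(expected_slice, out_slice)
        assert max_diff < 1e-3

The design choice the later patches settle on is direct `.to(torch_device)` placement rather than `enable_model_cpu_offload()`: the big-GPU runner has enough VRAM for full placement, whereas model offloading killed the tests given the roughly 34GB of host RAM available on the runner, as noted in the comments added to the Flux LoRA tests.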