From 6123837e2515d6435a6724f91431642c88021a4e Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Tue, 14 Feb 2023 00:53:47 +0900 Subject: [PATCH 001/122] add scaffold - copied convert_controlnet_to_diffusers.py from convert_original_stable_diffusion_to_diffusers.py --- scripts/convert_controlnet_to_diffusers.py | 118 ++++++++++++++++++ .../models/unet_2d_condition_controlnet.py | 8 ++ .../pipeline_stable_diffusion_controlnet.py | 4 + 3 files changed, 130 insertions(+) create mode 100644 scripts/convert_controlnet_to_diffusers.py create mode 100644 src/diffusers/models/unet_2d_condition_controlnet.py create mode 100644 src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py diff --git a/scripts/convert_controlnet_to_diffusers.py b/scripts/convert_controlnet_to_diffusers.py new file mode 100644 index 000000000000..fd6a69441628 --- /dev/null +++ b/scripts/convert_controlnet_to_diffusers.py @@ -0,0 +1,118 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Conversion script for the ControlNet checkpoints. """ + +import argparse + +from diffusers.pipelines.stable_diffusion.convert_from_ckpt import load_pipeline_from_original_stable_diffusion_ckpt + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "--checkpoint_path", default=None, type=str, required=True, help="Path to the checkpoint to convert." + ) + # !wget https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml + parser.add_argument( + "--original_config_file", + default=None, + type=str, + help="The YAML config file corresponding to the original architecture.", + ) + parser.add_argument( + "--num_in_channels", + default=None, + type=int, + help="The number of input channels. If `None` number of input channels will be automatically inferred.", + ) + parser.add_argument( + "--scheduler_type", + default="pndm", + type=str, + help="Type of scheduler to use. Should be one of ['pndm', 'lms', 'ddim', 'euler', 'euler-ancestral', 'dpm']", + ) + parser.add_argument( + "--pipeline_type", + default=None, + type=str, + help=( + "The pipeline type. One of 'FrozenOpenCLIPEmbedder', 'FrozenCLIPEmbedder', 'PaintByExample'" + ". If `None` pipeline will be automatically inferred." + ), + ) + parser.add_argument( + "--image_size", + default=None, + type=int, + help=( + "The image size that the model was trained on. Use 512 for Stable Diffusion v1.X and Stable Siffusion v2" + " Base. Use 768 for Stable Diffusion v2." + ), + ) + parser.add_argument( + "--prediction_type", + default=None, + type=str, + help=( + "The prediction type that the model was trained on. Use 'epsilon' for Stable Diffusion v1.X and Stable" + " Diffusion v2 Base. Use 'v_prediction' for Stable Diffusion v2." + ), + ) + parser.add_argument( + "--extract_ema", + action="store_true", + help=( + "Only relevant for checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights" + " or not. 
Defaults to `False`. Add `--extract_ema` to extract the EMA weights. EMA weights usually yield" + " higher quality images for inference. Non-EMA weights are usually better to continue fine-tuning." + ), + ) + parser.add_argument( + "--upcast_attention", + action="store_true", + help=( + "Whether the attention computation should always be upcasted. This is necessary when running stable" + " diffusion 2.1." + ), + ) + parser.add_argument( + "--from_safetensors", + action="store_true", + help="If `--checkpoint_path` is in `safetensors` format, load checkpoint with safetensors instead of PyTorch.", + ) + parser.add_argument( + "--to_safetensors", + action="store_true", + help="Whether to store pipeline in safetensors format or not.", + ) + parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.") + parser.add_argument("--device", type=str, help="Device to use (e.g. cpu, cuda:0, cuda:1, etc.)") + args = parser.parse_args() + + pipe = load_pipeline_from_original_stable_diffusion_ckpt( + checkpoint_path=args.checkpoint_path, + original_config_file=args.original_config_file, + image_size=args.image_size, + prediction_type=args.prediction_type, + model_type=args.pipeline_type, + extract_ema=args.extract_ema, + scheduler_type=args.scheduler_type, + num_in_channels=args.num_in_channels, + upcast_attention=args.upcast_attention, + from_safetensors=args.from_safetensors, + device=args.device, + ) + pipe.save_pretrained(args.dump_path, safe_serialization=args.to_safetensors) diff --git a/src/diffusers/models/unet_2d_condition_controlnet.py b/src/diffusers/models/unet_2d_condition_controlnet.py new file mode 100644 index 000000000000..b9d080f2d247 --- /dev/null +++ b/src/diffusers/models/unet_2d_condition_controlnet.py @@ -0,0 +1,8 @@ +from . import UNet2DConditionModel + +class ControlledUNet2DConditionModel(UNet2DConditionModel): + pass + +class ControlNetModel(UNet2DConditionModel): + pass + diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py new file mode 100644 index 000000000000..009824b5026e --- /dev/null +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py @@ -0,0 +1,4 @@ +from . 
import StableDiffusionPipeline + +class StableDiffusionPipelineControlNet(StableDiffusionPipeline): + pass From d382f933e48bae14071010b6a6e9ef821449cadb Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Tue, 14 Feb 2023 02:15:17 +0900 Subject: [PATCH 002/122] Add support to load ControlNet (WIP) - this makes Missking Key error on ControlNetModel --- scripts/convert_controlnet_to_diffusers.py | 6 +- src/diffusers/__init__.py | 3 + src/diffusers/models/__init__.py | 1 + .../models/unet_2d_condition_controlnet.py | 3 +- src/diffusers/pipelines/__init__.py | 1 + .../pipelines/stable_diffusion/__init__.py | 1 + .../stable_diffusion/convert_from_ckpt.py | 416 +++++++++++++++++- .../pipeline_stable_diffusion_controlnet.py | 3 +- 8 files changed, 426 insertions(+), 8 deletions(-) diff --git a/scripts/convert_controlnet_to_diffusers.py b/scripts/convert_controlnet_to_diffusers.py index fd6a69441628..b73c089cce28 100644 --- a/scripts/convert_controlnet_to_diffusers.py +++ b/scripts/convert_controlnet_to_diffusers.py @@ -16,7 +16,7 @@ import argparse -from diffusers.pipelines.stable_diffusion.convert_from_ckpt import load_pipeline_from_original_stable_diffusion_ckpt +from diffusers.pipelines.stable_diffusion.convert_from_ckpt import load_pipeline_from_control_net_ckpt if __name__ == "__main__": @@ -40,7 +40,7 @@ ) parser.add_argument( "--scheduler_type", - default="pndm", + default="ddim", type=str, help="Type of scheduler to use. Should be one of ['pndm', 'lms', 'ddim', 'euler', 'euler-ancestral', 'dpm']", ) @@ -102,7 +102,7 @@ parser.add_argument("--device", type=str, help="Device to use (e.g. cpu, cuda:0, cuda:1, etc.)") args = parser.parse_args() - pipe = load_pipeline_from_original_stable_diffusion_ckpt( + pipe = load_pipeline_from_control_net_ckpt( checkpoint_path=args.checkpoint_path, original_config_file=args.original_config_file, image_size=args.image_size, diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index bc6057eaf2da..5dca2a5d0032 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -34,6 +34,8 @@ else: from .models import ( AutoencoderKL, + ControlledUNet2DConditionModel, + ControlNetModel, ModelMixin, PriorTransformer, Transformer2DModel, @@ -109,6 +111,7 @@ CycleDiffusionPipeline, LDMTextToImagePipeline, PaintByExamplePipeline, + StableDiffusionControlNetPipeline, StableDiffusionDepth2ImgPipeline, StableDiffusionImageVariationPipeline, StableDiffusionImg2ImgPipeline, diff --git a/src/diffusers/models/__init__.py b/src/diffusers/models/__init__.py index 474b8412560e..29c0091c4a82 100644 --- a/src/diffusers/models/__init__.py +++ b/src/diffusers/models/__init__.py @@ -24,6 +24,7 @@ from .unet_1d import UNet1DModel from .unet_2d import UNet2DModel from .unet_2d_condition import UNet2DConditionModel + from .unet_2d_condition_controlnet import ControlledUNet2DConditionModel, ControlNetModel from .vq_model import VQModel if is_flax_available(): diff --git a/src/diffusers/models/unet_2d_condition_controlnet.py b/src/diffusers/models/unet_2d_condition_controlnet.py index b9d080f2d247..7b550ac4f51a 100644 --- a/src/diffusers/models/unet_2d_condition_controlnet.py +++ b/src/diffusers/models/unet_2d_condition_controlnet.py @@ -1,8 +1,9 @@ from . 
import UNet2DConditionModel + class ControlledUNet2DConditionModel(UNet2DConditionModel): pass + class ControlNetModel(UNet2DConditionModel): pass - diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index dfb2fd83cb71..a17aeaa922c4 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -46,6 +46,7 @@ from .paint_by_example import PaintByExamplePipeline from .stable_diffusion import ( CycleDiffusionPipeline, + StableDiffusionControlNetPipeline, StableDiffusionDepth2ImgPipeline, StableDiffusionImageVariationPipeline, StableDiffusionImg2ImgPipeline, diff --git a/src/diffusers/pipelines/stable_diffusion/__init__.py b/src/diffusers/pipelines/stable_diffusion/__init__.py index bf07127cde5b..d03db8619c3f 100644 --- a/src/diffusers/pipelines/stable_diffusion/__init__.py +++ b/src/diffusers/pipelines/stable_diffusion/__init__.py @@ -39,6 +39,7 @@ class StableDiffusionPipelineOutput(BaseOutput): if is_transformers_available() and is_torch_available(): from .pipeline_cycle_diffusion import CycleDiffusionPipeline from .pipeline_stable_diffusion import StableDiffusionPipeline + from .pipeline_stable_diffusion_controlnet import StableDiffusionControlNetPipeline from .pipeline_stable_diffusion_img2img import StableDiffusionImg2ImgPipeline from .pipeline_stable_diffusion_inpaint import StableDiffusionInpaintPipeline from .pipeline_stable_diffusion_inpaint_legacy import StableDiffusionInpaintPipelineLegacy diff --git a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py index a460ecfb77c8..3c4fa4095e6f 100644 --- a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +++ b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py @@ -25,6 +25,8 @@ from diffusers import ( AutoencoderKL, + ControlledUNet2DConditionModel, + ControlNetModel, DDIMScheduler, DPMSolverMultistepScheduler, EulerAncestralDiscreteScheduler, @@ -33,6 +35,7 @@ LDMTextToImagePipeline, LMSDiscreteScheduler, PNDMScheduler, + StableDiffusionControlNetPipeline, StableDiffusionPipeline, UNet2DConditionModel, ) @@ -209,11 +212,15 @@ def conv_attn_to_linear(checkpoint): checkpoint[key] = checkpoint[key][:, :, 0] -def create_unet_diffusers_config(original_config, image_size: int): +def create_unet_diffusers_config(original_config, image_size: int, control_net=False): """ Creates a config for the diffusers based on the config of the LDM model. 
""" - unet_params = original_config.model.params.unet_config.params + if not control_net: + unet_params = original_config.model.params.unet_config.params + else: + unet_params = original_config.model.params.control_stage_config.params + vae_params = original_config.model.params.first_stage_config.params.ddconfig block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult] @@ -246,7 +253,7 @@ def create_unet_diffusers_config(original_config, image_size: int): config = dict( sample_size=image_size // vae_scale_factor, in_channels=unet_params.in_channels, - out_channels=unet_params.out_channels, + # out_channels=unet_params.out_channels, down_block_types=tuple(down_block_types), up_block_types=tuple(up_block_types), block_out_channels=tuple(block_out_channels), @@ -256,6 +263,10 @@ def create_unet_diffusers_config(original_config, image_size: int): use_linear_projection=use_linear_projection, ) + # ControlNet donesn't have out_channels + if not control_net: + config["out_channels"] = unet_params.out_channels + return config @@ -476,6 +487,183 @@ def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False return new_checkpoint +def convert_controlnet_checkpoint(checkpoint, config): + """ + Takes a state dict and a config, and returns a converted checkpoint. + """ + + # extract state_dict for UNet + unet_state_dict = {} + keys = list(checkpoint.keys()) + + unet_key = "control_model." + # at least a 100 parameters have to start with `model_ema` in order for the checkpoint to be EMA + # if sum(k.startswith("model_ema") for k in keys) > 100 and extract_ema: + # print(f"Checkpoint {path} has both EMA and non-EMA weights.") + # print( + # "In this conversion only the EMA weights are extracted. If you want to instead extract the non-EMA" + # " weights (useful to continue fine-tuning), please make sure to remove the `--extract_ema` flag." + # ) + # for key in keys: + # if key.startswith("control_model"): + # flat_ema_key = "model_ema." + "".join(key.split(".")[1:]) + # unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key) + # else: + # if sum(k.startswith("model_ema") for k in keys) > 100: + # print( + # "In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA" + # " weights (usually better for inference), please make sure to add the `--extract_ema` flag." 
+ # ) + + for key in keys: + if key.startswith(unet_key): + unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(key) + + new_checkpoint = {} + + new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"] + new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"] + new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"] + new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"] + + new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"] + new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"] + + # new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"] + # new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"] + # new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"] + # new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"] + + # Retrieves the keys for the input blocks only + num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer}) + input_blocks = { + layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key] + for layer_id in range(num_input_blocks) + } + + # Retrieves the keys for the middle blocks only + num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer}) + middle_blocks = { + layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key] + for layer_id in range(num_middle_blocks) + } + + # Retrieves the keys for the output blocks only + # num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer}) + # output_blocks = { + # layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key] + # for layer_id in range(num_output_blocks) + # } + + for i in range(1, num_input_blocks): + block_id = (i - 1) // (config["layers_per_block"] + 1) + layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1) + + resnets = [ + key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key + ] + attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key] + + if f"input_blocks.{i}.0.op.weight" in unet_state_dict: + new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop( + f"input_blocks.{i}.0.op.weight" + ) + new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop( + f"input_blocks.{i}.0.op.bias" + ) + + paths = renew_resnet_paths(resnets) + meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"} + assign_to_checkpoint( + paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config + ) + + if len(attentions): + paths = renew_attention_paths(attentions) + meta_path = {"old": f"input_blocks.{i}.1", "new": f"down_blocks.{block_id}.attentions.{layer_in_block_id}"} + assign_to_checkpoint( + paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config + ) + + resnet_0 = middle_blocks[0] + attentions = middle_blocks[1] + resnet_1 = middle_blocks[2] + + resnet_0_paths = renew_resnet_paths(resnet_0) + assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config) + + resnet_1_paths = renew_resnet_paths(resnet_1) + assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config) + + 
attentions_paths = renew_attention_paths(attentions) + meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"} + assign_to_checkpoint( + attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config + ) + + # ################################################# + # TODO: add to load input_hint_block and zero_convs + # ################################################# + + # for i in range(num_output_blocks): + # block_id = i // (config["layers_per_block"] + 1) + # layer_in_block_id = i % (config["layers_per_block"] + 1) + # output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]] + # output_block_list = {} + + # for layer in output_block_layers: + # layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1) + # if layer_id in output_block_list: + # output_block_list[layer_id].append(layer_name) + # else: + # output_block_list[layer_id] = [layer_name] + + # if len(output_block_list) > 1: + # resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key] + # attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key] + + # resnet_0_paths = renew_resnet_paths(resnets) + # paths = renew_resnet_paths(resnets) + + # meta_path = {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"} + # assign_to_checkpoint( + # paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config + # ) + + # output_block_list = {k: sorted(v) for k, v in output_block_list.items()} + # if ["conv.bias", "conv.weight"] in output_block_list.values(): + # index = list(output_block_list.values()).index(["conv.bias", "conv.weight"]) + # new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[ + # f"output_blocks.{i}.{index}.conv.weight" + # ] + # new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[ + # f"output_blocks.{i}.{index}.conv.bias" + # ] + + # # Clear attentions as they have been attributed above. 
+ # if len(attentions) == 2: + # attentions = [] + + # if len(attentions): + # paths = renew_attention_paths(attentions) + # meta_path = { + # "old": f"output_blocks.{i}.1", + # "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}", + # } + # assign_to_checkpoint( + # paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config + # ) + # else: + # resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1) + # for path in resnet_0_paths: + # old_path = ".".join(["output_blocks", str(i), path["old"]]) + # new_path = ".".join(["up_blocks", str(block_id), "resnets", str(layer_in_block_id), path["new"]]) + + # new_checkpoint[new_path] = unet_state_dict[old_path] + + return new_checkpoint + + def convert_ldm_vae_checkpoint(checkpoint, config): # extract state dict for VAE vae_state_dict = {} @@ -1019,3 +1207,225 @@ def load_pipeline_from_original_stable_diffusion_ckpt( pipe = LDMTextToImagePipeline(vqvae=vae, bert=text_model, tokenizer=tokenizer, unet=unet, scheduler=scheduler) return pipe + + +def load_pipeline_from_control_net_ckpt( + checkpoint_path: str, + original_config_file: str = None, + image_size: int = 512, + prediction_type: str = None, + model_type: str = None, + extract_ema: bool = False, + scheduler_type: str = "ddim", + num_in_channels: Optional[int] = None, + upcast_attention: Optional[bool] = None, + device: str = None, + from_safetensors: bool = False, +) -> StableDiffusionControlNetPipeline: + """ + Load a Stable Diffusion pipeline object from a ControlNet `.ckpt`/`.safetensors` file and (ideally) a `.yaml` + config file. + + Although many of the arguments can be automatically inferred, some of these rely on brittle checks against the + global step count, which will likely fail for models that have undergone further fine-tuning. Therefore, it is + recommended that you override the default values and/or supply an `original_config_file` wherever possible. + + Args: + checkpoint_path (`str`): Path to `.ckpt` file. + original_config_file (`str`): + Path to `.yaml` config file corresponding to the original architecture. If `None`, will be automatically + inferred by looking for a key that only exists in SD2.0 models. + image_size (`int`, *optional*, defaults to 512): + The image size that the model was trained on. Use 512 for Stable Diffusion v1.X and Stable Diffusion v2 + Base. Use 768 for Stable Diffusion v2. + prediction_type (`str`, *optional*): + The prediction type that the model was trained on. Use `'epsilon'` for Stable Diffusion v1.X and Stable + Diffusion v2 Base. Use `'v_prediction'` for Stable Diffusion v2. + num_in_channels (`int`, *optional*, defaults to None): + The number of input channels. If `None`, it will be automatically inferred. + scheduler_type (`str`, *optional*, defaults to 'pndm'): + Type of scheduler to use. Should be one of `["pndm", "lms", "heun", "euler", "euler-ancestral", "dpm", + "ddim"]`. + model_type (`str`, *optional*, defaults to `None`): + The pipeline type. `None` to automatically infer, or one of `["FrozenOpenCLIPEmbedder", + "FrozenCLIPEmbedder", "PaintByExample"]`. + extract_ema (`bool`, *optional*, defaults to `False`): Only relevant for + checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights or not. Defaults to + `False`. Pass `True` to extract the EMA weights. EMA weights usually yield higher quality images for + inference. Non-EMA weights are usually better to continue fine-tuning. 
+ upcast_attention (`bool`, *optional*, defaults to `None`): + Whether the attention computation should always be upcasted. This is necessary when running stable + diffusion 2.1. + device (`str`, *optional*, defaults to `None`): + The device to use. Pass `None` to determine automatically. :param from_safetensors: If `checkpoint_path` is + in `safetensors` format, load checkpoint with safetensors instead of PyTorch. :return: A + StableDiffusionPipeline object representing the passed-in `.ckpt`/`.safetensors` file. + """ + if prediction_type == "v-prediction": + prediction_type = "v_prediction" + + if not is_omegaconf_available(): + raise ValueError(BACKENDS_MAPPING["omegaconf"][1]) + + from omegaconf import OmegaConf + + if from_safetensors: + if not is_safetensors_available(): + raise ValueError(BACKENDS_MAPPING["safetensors"][1]) + + from safetensors import safe_open + + checkpoint = {} + with safe_open(checkpoint_path, framework="pt", device="cpu") as f: + for key in f.keys(): + checkpoint[key] = f.get_tensor(key) + else: + if device is None: + device = "cuda" if torch.cuda.is_available() else "cpu" + checkpoint = torch.load(checkpoint_path, map_location=device) + else: + checkpoint = torch.load(checkpoint_path, map_location=device) + + # Sometimes models don't have the global_step item + if "global_step" in checkpoint: + global_step = checkpoint["global_step"] + else: + print("global_step key not found in model") + global_step = None + + if "state_dict" in checkpoint: + checkpoint = checkpoint["state_dict"] + + with tempfile.TemporaryDirectory() as tmpdir: + if original_config_file is None: + key_name = "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.attn2.to_k.weight" + + original_config_file = os.path.join(tmpdir, "inference.yaml") + if key_name in checkpoint and checkpoint[key_name].shape[-1] == 1024: + if not os.path.isfile("v2-inference-v.yaml"): + # model_type = "v2" + r = requests.get( + " https://raw.githubusercontent.com/Stability-AI/stablediffusion/main/configs/stable-diffusion/v2-inference-v.yaml" + ) + open(original_config_file, "wb").write(r.content) + + if global_step == 110000: + # v2.1 needs to upcast attention + upcast_attention = True + else: + if not os.path.isfile("v1-inference.yaml"): + # model_type = "v1" + r = requests.get( + " https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml" + ) + open(original_config_file, "wb").write(r.content) + + original_config = OmegaConf.load(original_config_file) + + if num_in_channels is not None: + original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = num_in_channels + + if ( + "parameterization" in original_config["model"]["params"] + and original_config["model"]["params"]["parameterization"] == "v" + ): + if prediction_type is None: + # NOTE: For stable diffusion 2 base it is recommended to pass `prediction_type=="epsilon"` + # as it relies on a brittle global step parameter here + prediction_type = "epsilon" if global_step == 875000 else "v_prediction" + if image_size is None: + # NOTE: For stable diffusion 2 base one has to pass `image_size==512` + # as it relies on a brittle global step parameter here + image_size = 512 if global_step == 875000 else 768 + else: + if prediction_type is None: + prediction_type = "epsilon" + if image_size is None: + image_size = 512 + + num_train_timesteps = original_config.model.params.timesteps + beta_start = original_config.model.params.linear_start + beta_end = original_config.model.params.linear_end + + 
scheduler = DDIMScheduler( + beta_end=beta_end, + beta_schedule="scaled_linear", + beta_start=beta_start, + num_train_timesteps=num_train_timesteps, + steps_offset=1, + clip_sample=False, + set_alpha_to_one=False, + prediction_type=prediction_type, + ) + # make sure scheduler works correctly with DDIM + scheduler.register_to_config(clip_sample=False) + + if scheduler_type == "pndm": + config = dict(scheduler.config) + config["skip_prk_steps"] = True + scheduler = PNDMScheduler.from_config(config) + elif scheduler_type == "lms": + scheduler = LMSDiscreteScheduler.from_config(scheduler.config) + elif scheduler_type == "heun": + scheduler = HeunDiscreteScheduler.from_config(scheduler.config) + elif scheduler_type == "euler": + scheduler = EulerDiscreteScheduler.from_config(scheduler.config) + elif scheduler_type == "euler-ancestral": + scheduler = EulerAncestralDiscreteScheduler.from_config(scheduler.config) + elif scheduler_type == "dpm": + scheduler = DPMSolverMultistepScheduler.from_config(scheduler.config) + elif scheduler_type == "ddim": + scheduler = scheduler + else: + raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") + + # Convert the ControlledUNet2DConditionModel model. + unet_config = create_unet_diffusers_config(original_config, image_size=image_size) + unet_config["upcast_attention"] = upcast_attention + unet = ControlledUNet2DConditionModel(**unet_config) + + converted_unet_checkpoint = convert_ldm_unet_checkpoint( + checkpoint, unet_config, path=checkpoint_path, extract_ema=extract_ema + ) + + unet.load_state_dict(converted_unet_checkpoint) + + # Convert the ControlNetModel model. + ctrlnet_config = create_unet_diffusers_config(original_config, image_size=image_size, control_net=True) + ctrlnet_config["upcast_attention"] = upcast_attention + controlnet = ControlNetModel(**ctrlnet_config) + + converted_ctrl_checkpoint = convert_controlnet_checkpoint(checkpoint, unet_config) + + controlnet.load_state_dict(converted_ctrl_checkpoint) + + # Convert the VAE model. + vae_config = create_vae_diffusers_config(original_config, image_size=image_size) + converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config) + + vae = AutoencoderKL(**vae_config) + vae.load_state_dict(converted_vae_checkpoint) + + # Convert the text model. + if model_type is None: + model_type = original_config.model.params.cond_stage_config.target.split(".")[-1] + logger.debug(f"no `model_type` given, `model_type` inferred as: {model_type}") + + if model_type == "FrozenOpenCLIPEmbedder": + text_model = convert_open_clip_checkpoint(checkpoint) + tokenizer = CLIPTokenizer.from_pretrained("stabilityai/stable-diffusion-2", subfolder="tokenizer") + pipe = StableDiffusionControlNetPipeline( + vae=vae, + text_encoder=text_model, + tokenizer=tokenizer, + unet=unet, + controlnet=controlnet, + scheduler=scheduler, + safety_checker=None, + feature_extractor=None, + requires_safety_checker=False, + ) + else: + raise NotImplementedError() + + return pipe diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py index 009824b5026e..0d9d3d832fca 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py @@ -1,4 +1,5 @@ from . 
import StableDiffusionPipeline -class StableDiffusionPipelineControlNet(StableDiffusionPipeline): + +class StableDiffusionControlNetPipeline(StableDiffusionPipeline): pass From 04a514a63736062ba5e10f24d1663f3825df5284 Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Tue, 14 Feb 2023 02:52:16 +0900 Subject: [PATCH 003/122] Update to convert ControlNet without error msg - init impl for StableDiffusionControlNetPipeline - init impl for ControlNetModel --- .../models/unet_2d_condition_controlnet.py | 253 +++++++++++++++++- .../stable_diffusion/convert_from_ckpt.py | 20 +- .../pipeline_stable_diffusion_controlnet.py | 55 +++- 3 files changed, 314 insertions(+), 14 deletions(-) diff --git a/src/diffusers/models/unet_2d_condition_controlnet.py b/src/diffusers/models/unet_2d_condition_controlnet.py index 7b550ac4f51a..6487a84cece9 100644 --- a/src/diffusers/models/unet_2d_condition_controlnet.py +++ b/src/diffusers/models/unet_2d_condition_controlnet.py @@ -1,9 +1,258 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional, Tuple, Union + +import torch.nn as nn + +from ..configuration_utils import ConfigMixin, register_to_config +from ..loaders import UNet2DConditionLoadersMixin from . 
import UNet2DConditionModel +from .embeddings import GaussianFourierProjection, TimestepEmbedding, Timesteps +from .modeling_utils import ModelMixin +from .unet_2d_blocks import ( + UNetMidBlock2DCrossAttn, + UNetMidBlock2DSimpleCrossAttn, + get_down_block, +) class ControlledUNet2DConditionModel(UNet2DConditionModel): pass -class ControlNetModel(UNet2DConditionModel): - pass +class ControlNetModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin): + _supports_gradient_checkpointing = True + + @register_to_config + def __init__( + self, + sample_size: Optional[int] = None, + in_channels: int = 4, + # out_channels: int = 4, + # center_input_sample: bool = False, + flip_sin_to_cos: bool = True, + freq_shift: int = 0, + down_block_types: Tuple[str] = ( + "CrossAttnDownBlock2D", + "CrossAttnDownBlock2D", + "CrossAttnDownBlock2D", + "DownBlock2D", + ), + mid_block_type: Optional[str] = "UNetMidBlock2DCrossAttn", + # up_block_types: Tuple[str] = ("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"), + only_cross_attention: Union[bool, Tuple[bool]] = False, + block_out_channels: Tuple[int] = (320, 640, 1280, 1280), + layers_per_block: int = 2, + downsample_padding: int = 1, + mid_block_scale_factor: float = 1, + act_fn: str = "silu", + norm_num_groups: Optional[int] = 32, + norm_eps: float = 1e-5, + cross_attention_dim: int = 1280, + attention_head_dim: Union[int, Tuple[int]] = 8, + dual_cross_attention: bool = False, + use_linear_projection: bool = False, + class_embed_type: Optional[str] = None, + num_class_embeds: Optional[int] = None, + upcast_attention: bool = False, + resnet_time_scale_shift: str = "default", + time_embedding_type: str = "positional", # fourier, positional + timestep_post_act: Optional[str] = None, + time_cond_proj_dim: Optional[int] = None, + conv_in_kernel: int = 3, + # conv_out_kernel: int = 3, + ): + super().__init__() + + self.sample_size = sample_size + + # input + conv_in_padding = (conv_in_kernel - 1) // 2 + self.conv_in = nn.Conv2d( + in_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding + ) + + # time + if time_embedding_type == "fourier": + time_embed_dim = block_out_channels[0] * 2 + if time_embed_dim % 2 != 0: + raise ValueError(f"`time_embed_dim` should be divisible by 2, but is {time_embed_dim}.") + self.time_proj = GaussianFourierProjection( + time_embed_dim // 2, set_W_to_weight=False, log=False, flip_sin_to_cos=flip_sin_to_cos + ) + timestep_input_dim = time_embed_dim + elif time_embedding_type == "positional": + time_embed_dim = block_out_channels[0] * 4 + + self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift) + timestep_input_dim = block_out_channels[0] + else: + raise ValueError( + f"{time_embedding_type} does not exist. Pleaes make sure to use one of `fourier` or `positional`." 
+ ) + + self.time_embedding = TimestepEmbedding( + timestep_input_dim, + time_embed_dim, + act_fn=act_fn, + post_act_fn=timestep_post_act, + cond_proj_dim=time_cond_proj_dim, + ) + + # class embedding + if class_embed_type is None and num_class_embeds is not None: + self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim) + elif class_embed_type == "timestep": + self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim) + elif class_embed_type == "identity": + self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim) + else: + self.class_embedding = None + + self.down_blocks = nn.ModuleList([]) + self.up_blocks = nn.ModuleList([]) + + if isinstance(only_cross_attention, bool): + only_cross_attention = [only_cross_attention] * len(down_block_types) + + if isinstance(attention_head_dim, int): + attention_head_dim = (attention_head_dim,) * len(down_block_types) + + # down + output_channel = block_out_channels[0] + for i, down_block_type in enumerate(down_block_types): + input_channel = output_channel + output_channel = block_out_channels[i] + is_final_block = i == len(block_out_channels) - 1 + + down_block = get_down_block( + down_block_type, + num_layers=layers_per_block, + in_channels=input_channel, + out_channels=output_channel, + temb_channels=time_embed_dim, + add_downsample=not is_final_block, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + resnet_groups=norm_num_groups, + cross_attention_dim=cross_attention_dim, + attn_num_head_channels=attention_head_dim[i], + downsample_padding=downsample_padding, + dual_cross_attention=dual_cross_attention, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention[i], + upcast_attention=upcast_attention, + resnet_time_scale_shift=resnet_time_scale_shift, + ) + self.down_blocks.append(down_block) + + # mid + if mid_block_type == "UNetMidBlock2DCrossAttn": + self.mid_block = UNetMidBlock2DCrossAttn( + in_channels=block_out_channels[-1], + temb_channels=time_embed_dim, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + output_scale_factor=mid_block_scale_factor, + resnet_time_scale_shift=resnet_time_scale_shift, + cross_attention_dim=cross_attention_dim, + attn_num_head_channels=attention_head_dim[-1], + resnet_groups=norm_num_groups, + dual_cross_attention=dual_cross_attention, + use_linear_projection=use_linear_projection, + upcast_attention=upcast_attention, + ) + elif mid_block_type == "UNetMidBlock2DSimpleCrossAttn": + self.mid_block = UNetMidBlock2DSimpleCrossAttn( + in_channels=block_out_channels[-1], + temb_channels=time_embed_dim, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + output_scale_factor=mid_block_scale_factor, + cross_attention_dim=cross_attention_dim, + attn_num_head_channels=attention_head_dim[-1], + resnet_groups=norm_num_groups, + resnet_time_scale_shift=resnet_time_scale_shift, + ) + elif mid_block_type is None: + self.mid_block = None + else: + raise ValueError(f"unknown mid_block_type : {mid_block_type}") + + # count how many layers upsample the images + self.num_upsamplers = 0 + + + # ################################################# + # TODO: add input_hint_block and zero_convs modules + # ################################################# + + + # # up + # reversed_block_out_channels = list(reversed(block_out_channels)) + # reversed_attention_head_dim = list(reversed(attention_head_dim)) + # only_cross_attention = list(reversed(only_cross_attention)) + + # output_channel = reversed_block_out_channels[0] + # for i, up_block_type in 
enumerate(up_block_types): + # is_final_block = i == len(block_out_channels) - 1 + + # prev_output_channel = output_channel + # output_channel = reversed_block_out_channels[i] + # input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)] + + # # add upsample block for all BUT final layer + # if not is_final_block: + # add_upsample = True + # self.num_upsamplers += 1 + # else: + # add_upsample = False + + # up_block = get_up_block( + # up_block_type, + # num_layers=layers_per_block + 1, + # in_channels=input_channel, + # out_channels=output_channel, + # prev_output_channel=prev_output_channel, + # temb_channels=time_embed_dim, + # add_upsample=add_upsample, + # resnet_eps=norm_eps, + # resnet_act_fn=act_fn, + # resnet_groups=norm_num_groups, + # cross_attention_dim=cross_attention_dim, + # attn_num_head_channels=reversed_attention_head_dim[i], + # dual_cross_attention=dual_cross_attention, + # use_linear_projection=use_linear_projection, + # only_cross_attention=only_cross_attention[i], + # upcast_attention=upcast_attention, + # resnet_time_scale_shift=resnet_time_scale_shift, + # ) + # self.up_blocks.append(up_block) + # prev_output_channel = output_channel + + # out + # if norm_num_groups is not None: + # self.conv_norm_out = nn.GroupNorm( + # num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=norm_eps + # ) + # self.conv_act = nn.SiLU() + # else: + # self.conv_norm_out = None + # self.conv_act = None + + # conv_out_padding = (conv_out_kernel - 1) // 2 + # self.conv_out = nn.Conv2d( + # block_out_channels[0], out_channels, kernel_size=conv_out_kernel, padding=conv_out_padding + # ) diff --git a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py index 3c4fa4095e6f..0e379288be92 100644 --- a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +++ b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py @@ -255,7 +255,7 @@ def create_unet_diffusers_config(original_config, image_size: int, control_net=F in_channels=unet_params.in_channels, # out_channels=unet_params.out_channels, down_block_types=tuple(down_block_types), - up_block_types=tuple(up_block_types), + # up_block_types=tuple(up_block_types), block_out_channels=tuple(block_out_channels), layers_per_block=unet_params.num_res_blocks, cross_attention_dim=unet_params.context_dim, @@ -263,9 +263,10 @@ def create_unet_diffusers_config(original_config, image_size: int, control_net=F use_linear_projection=use_linear_projection, ) - # ControlNet donesn't have out_channels + # ControlNet donesn't have output channels if not control_net: config["out_channels"] = unet_params.out_channels + config["up_block_types"] = tuple(up_block_types) return config @@ -1411,9 +1412,11 @@ def load_pipeline_from_control_net_ckpt( model_type = original_config.model.params.cond_stage_config.target.split(".")[-1] logger.debug(f"no `model_type` given, `model_type` inferred as: {model_type}") - if model_type == "FrozenOpenCLIPEmbedder": - text_model = convert_open_clip_checkpoint(checkpoint) - tokenizer = CLIPTokenizer.from_pretrained("stabilityai/stable-diffusion-2", subfolder="tokenizer") + if model_type == "FrozenCLIPEmbedder": + text_model = convert_ldm_clip_checkpoint(checkpoint) + tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14") + safety_checker = StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker") + feature_extractor = 
AutoFeatureExtractor.from_pretrained("CompVis/stable-diffusion-safety-checker") pipe = StableDiffusionControlNetPipeline( vae=vae, text_encoder=text_model, @@ -1421,11 +1424,10 @@ def load_pipeline_from_control_net_ckpt( unet=unet, controlnet=controlnet, scheduler=scheduler, - safety_checker=None, - feature_extractor=None, - requires_safety_checker=False, + safety_checker=safety_checker, + feature_extractor=feature_extractor, ) else: - raise NotImplementedError() + raise NotImplementedError("Currently supported only for FrozenCLIPEmbedder.") return pipe diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py index 0d9d3d832fca..a04fc5c5c63f 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py @@ -1,5 +1,54 @@ -from . import StableDiffusionPipeline +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. -class StableDiffusionControlNetPipeline(StableDiffusionPipeline): - pass +from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer + +from ...models import AutoencoderKL, ControlledUNet2DConditionModel, ControlNetModel +from ...schedulers import KarrasDiffusionSchedulers +from ...utils import logging +from ..pipeline_utils import DiffusionPipeline +from .safety_checker import StableDiffusionSafetyChecker + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +class StableDiffusionControlNetPipeline(DiffusionPipeline): + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: ControlledUNet2DConditionModel, + controlnet: ControlNetModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPFeatureExtractor, + requires_safety_checker: bool = True, + ): + super().__init__() + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + controlnet=controlnet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.register_to_config(requires_safety_checker=requires_safety_checker) From 1f4b7062a7f3c3084cf7b2ce3ad76f703ce8896f Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Tue, 14 Feb 2023 02:54:56 +0900 Subject: [PATCH 004/122] cleanup of commented out --- .../models/unet_2d_condition_controlnet.py | 59 ------------- .../stable_diffusion/convert_from_ckpt.py | 85 ------------------- 2 files changed, 144 deletions(-) diff --git a/src/diffusers/models/unet_2d_condition_controlnet.py b/src/diffusers/models/unet_2d_condition_controlnet.py index 6487a84cece9..f74e77719622 100644 --- a/src/diffusers/models/unet_2d_condition_controlnet.py +++ 
b/src/diffusers/models/unet_2d_condition_controlnet.py @@ -194,65 +194,6 @@ def __init__( # count how many layers upsample the images self.num_upsamplers = 0 - # ################################################# # TODO: add input_hint_block and zero_convs modules # ################################################# - - - # # up - # reversed_block_out_channels = list(reversed(block_out_channels)) - # reversed_attention_head_dim = list(reversed(attention_head_dim)) - # only_cross_attention = list(reversed(only_cross_attention)) - - # output_channel = reversed_block_out_channels[0] - # for i, up_block_type in enumerate(up_block_types): - # is_final_block = i == len(block_out_channels) - 1 - - # prev_output_channel = output_channel - # output_channel = reversed_block_out_channels[i] - # input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)] - - # # add upsample block for all BUT final layer - # if not is_final_block: - # add_upsample = True - # self.num_upsamplers += 1 - # else: - # add_upsample = False - - # up_block = get_up_block( - # up_block_type, - # num_layers=layers_per_block + 1, - # in_channels=input_channel, - # out_channels=output_channel, - # prev_output_channel=prev_output_channel, - # temb_channels=time_embed_dim, - # add_upsample=add_upsample, - # resnet_eps=norm_eps, - # resnet_act_fn=act_fn, - # resnet_groups=norm_num_groups, - # cross_attention_dim=cross_attention_dim, - # attn_num_head_channels=reversed_attention_head_dim[i], - # dual_cross_attention=dual_cross_attention, - # use_linear_projection=use_linear_projection, - # only_cross_attention=only_cross_attention[i], - # upcast_attention=upcast_attention, - # resnet_time_scale_shift=resnet_time_scale_shift, - # ) - # self.up_blocks.append(up_block) - # prev_output_channel = output_channel - - # out - # if norm_num_groups is not None: - # self.conv_norm_out = nn.GroupNorm( - # num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=norm_eps - # ) - # self.conv_act = nn.SiLU() - # else: - # self.conv_norm_out = None - # self.conv_act = None - - # conv_out_padding = (conv_out_kernel - 1) // 2 - # self.conv_out = nn.Conv2d( - # block_out_channels[0], out_channels, kernel_size=conv_out_kernel, padding=conv_out_padding - # ) diff --git a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py index 0e379288be92..148a1d542430 100644 --- a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +++ b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py @@ -498,23 +498,6 @@ def convert_controlnet_checkpoint(checkpoint, config): keys = list(checkpoint.keys()) unet_key = "control_model." - # at least a 100 parameters have to start with `model_ema` in order for the checkpoint to be EMA - # if sum(k.startswith("model_ema") for k in keys) > 100 and extract_ema: - # print(f"Checkpoint {path} has both EMA and non-EMA weights.") - # print( - # "In this conversion only the EMA weights are extracted. If you want to instead extract the non-EMA" - # " weights (useful to continue fine-tuning), please make sure to remove the `--extract_ema` flag." - # ) - # for key in keys: - # if key.startswith("control_model"): - # flat_ema_key = "model_ema." + "".join(key.split(".")[1:]) - # unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key) - # else: - # if sum(k.startswith("model_ema") for k in keys) > 100: - # print( - # "In this conversion only the non-EMA weights are extracted. 
If you want to instead extract the EMA" - # " weights (usually better for inference), please make sure to add the `--extract_ema` flag." - # ) for key in keys: if key.startswith(unet_key): @@ -530,11 +513,6 @@ def convert_controlnet_checkpoint(checkpoint, config): new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"] new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"] - # new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"] - # new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"] - # new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"] - # new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"] - # Retrieves the keys for the input blocks only num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer}) input_blocks = { @@ -549,13 +527,6 @@ def convert_controlnet_checkpoint(checkpoint, config): for layer_id in range(num_middle_blocks) } - # Retrieves the keys for the output blocks only - # num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer}) - # output_blocks = { - # layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key] - # for layer_id in range(num_output_blocks) - # } - for i in range(1, num_input_blocks): block_id = (i - 1) // (config["layers_per_block"] + 1) layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1) @@ -606,62 +577,6 @@ def convert_controlnet_checkpoint(checkpoint, config): # TODO: add to load input_hint_block and zero_convs # ################################################# - # for i in range(num_output_blocks): - # block_id = i // (config["layers_per_block"] + 1) - # layer_in_block_id = i % (config["layers_per_block"] + 1) - # output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]] - # output_block_list = {} - - # for layer in output_block_layers: - # layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1) - # if layer_id in output_block_list: - # output_block_list[layer_id].append(layer_name) - # else: - # output_block_list[layer_id] = [layer_name] - - # if len(output_block_list) > 1: - # resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key] - # attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key] - - # resnet_0_paths = renew_resnet_paths(resnets) - # paths = renew_resnet_paths(resnets) - - # meta_path = {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"} - # assign_to_checkpoint( - # paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config - # ) - - # output_block_list = {k: sorted(v) for k, v in output_block_list.items()} - # if ["conv.bias", "conv.weight"] in output_block_list.values(): - # index = list(output_block_list.values()).index(["conv.bias", "conv.weight"]) - # new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[ - # f"output_blocks.{i}.{index}.conv.weight" - # ] - # new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[ - # f"output_blocks.{i}.{index}.conv.bias" - # ] - - # # Clear attentions as they have been attributed above. 
- # if len(attentions) == 2: - # attentions = [] - - # if len(attentions): - # paths = renew_attention_paths(attentions) - # meta_path = { - # "old": f"output_blocks.{i}.1", - # "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}", - # } - # assign_to_checkpoint( - # paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config - # ) - # else: - # resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1) - # for path in resnet_0_paths: - # old_path = ".".join(["output_blocks", str(i), path["old"]]) - # new_path = ".".join(["up_blocks", str(block_id), "resnets", str(layer_in_block_id), path["new"]]) - - # new_checkpoint[new_path] = unet_state_dict[old_path] - return new_checkpoint From 25eb4e7bdae26efc064b6cb084cb1cb538a678ff Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Wed, 15 Feb 2023 01:18:21 +0900 Subject: [PATCH 005/122] split create_controlnet_diffusers_config() from create_unet_diffusers_config() - add config: hint_channels --- .../models/unet_2d_condition_controlnet.py | 1 + .../stable_diffusion/convert_from_ckpt.py | 65 +++++++++++++++---- 2 files changed, 53 insertions(+), 13 deletions(-) diff --git a/src/diffusers/models/unet_2d_condition_controlnet.py b/src/diffusers/models/unet_2d_condition_controlnet.py index f74e77719622..d665ec22abc9 100644 --- a/src/diffusers/models/unet_2d_condition_controlnet.py +++ b/src/diffusers/models/unet_2d_condition_controlnet.py @@ -72,6 +72,7 @@ def __init__( timestep_post_act: Optional[str] = None, time_cond_proj_dim: Optional[int] = None, conv_in_kernel: int = 3, + hint_channels: int = 3, # conv_out_kernel: int = 3, ): super().__init__() diff --git a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py index 148a1d542430..03bdee0e9004 100644 --- a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +++ b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py @@ -212,15 +212,11 @@ def conv_attn_to_linear(checkpoint): checkpoint[key] = checkpoint[key][:, :, 0] -def create_unet_diffusers_config(original_config, image_size: int, control_net=False): +def create_unet_diffusers_config(original_config, image_size: int): """ Creates a config for the diffusers based on the config of the LDM model. """ - if not control_net: - unet_params = original_config.model.params.unet_config.params - else: - unet_params = original_config.model.params.control_stage_config.params - + unet_params = original_config.model.params.unet_config.params vae_params = original_config.model.params.first_stage_config.params.ddconfig block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult] @@ -253,20 +249,63 @@ def create_unet_diffusers_config(original_config, image_size: int, control_net=F config = dict( sample_size=image_size // vae_scale_factor, in_channels=unet_params.in_channels, - # out_channels=unet_params.out_channels, + out_channels=unet_params.out_channels, down_block_types=tuple(down_block_types), - # up_block_types=tuple(up_block_types), + up_block_types=tuple(up_block_types), block_out_channels=tuple(block_out_channels), layers_per_block=unet_params.num_res_blocks, cross_attention_dim=unet_params.context_dim, attention_head_dim=head_dim, use_linear_projection=use_linear_projection, ) + return config + + +def create_controlnet_diffusers_config(original_config, image_size: int): + """ + Creates a config for the diffusers based on the config of the LDM model. 
+ """ + controlnet_params = original_config.model.params.control_stage_config.params + vae_params = original_config.model.params.first_stage_config.params.ddconfig - # ControlNet donesn't have output channels - if not control_net: - config["out_channels"] = unet_params.out_channels - config["up_block_types"] = tuple(up_block_types) + block_out_channels = [controlnet_params.model_channels * mult for mult in controlnet_params.channel_mult] + + down_block_types = [] + resolution = 1 + for i in range(len(block_out_channels)): + block_type = "CrossAttnDownBlock2D" if resolution in controlnet_params.attention_resolutions else "DownBlock2D" + down_block_types.append(block_type) + if i != len(block_out_channels) - 1: + resolution *= 2 + + up_block_types = [] + for i in range(len(block_out_channels)): + block_type = "CrossAttnUpBlock2D" if resolution in controlnet_params.attention_resolutions else "UpBlock2D" + up_block_types.append(block_type) + resolution //= 2 + + vae_scale_factor = 2 ** (len(vae_params.ch_mult) - 1) + + head_dim = controlnet_params.num_heads if "num_heads" in controlnet_params else None + use_linear_projection = ( + controlnet_params.use_linear_in_transformer if "use_linear_in_transformer" in controlnet_params else False + ) + if use_linear_projection: + # stable diffusion 2-base-512 and 2-768 + if head_dim is None: + head_dim = [5, 10, 20, 20] + + config = dict( + sample_size=image_size // vae_scale_factor, + in_channels=controlnet_params.in_channels, + down_block_types=tuple(down_block_types), + block_out_channels=tuple(block_out_channels), + layers_per_block=controlnet_params.num_res_blocks, + cross_attention_dim=controlnet_params.context_dim, + attention_head_dim=head_dim, + use_linear_projection=use_linear_projection, + hint_channels=controlnet_params.hint_channels, + ) return config @@ -1307,7 +1346,7 @@ def load_pipeline_from_control_net_ckpt( unet.load_state_dict(converted_unet_checkpoint) # Convert the ControlNetModel model. 
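    # For the standard SD 1.5 ControlNet YAML (an assumption about the file passed via
    # --original_config_file; the values are read from its `control_stage_config.params`),
    # the new create_controlnet_diffusers_config helper above would return roughly:
    #
    #     {'sample_size': 64, 'in_channels': 4, 'block_out_channels': (320, 640, 1280, 1280),
    #      'layers_per_block': 2, 'cross_attention_dim': 768, 'attention_head_dim': 8,
    #      'use_linear_projection': False, 'hint_channels': 3,
    #      'down_block_types': ('CrossAttnDownBlock2D', 'CrossAttnDownBlock2D',
    #                           'CrossAttnDownBlock2D', 'DownBlock2D')}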
- ctrlnet_config = create_unet_diffusers_config(original_config, image_size=image_size, control_net=True) + ctrlnet_config = create_controlnet_diffusers_config(original_config, image_size=image_size) ctrlnet_config["upcast_attention"] = upcast_attention controlnet = ControlNetModel(**ctrlnet_config) From a7cb5a2c46f705e86995b34f3e29ed24e7a2c57a Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Wed, 15 Feb 2023 03:17:21 +0900 Subject: [PATCH 006/122] Add input_hint_block, input_zero_conv and middle_block_out - this makes missing key error on loading model --- .../models/unet_2d_condition_controlnet.py | 40 +++++++++++++++---- 1 file changed, 33 insertions(+), 7 deletions(-) diff --git a/src/diffusers/models/unet_2d_condition_controlnet.py b/src/diffusers/models/unet_2d_condition_controlnet.py index d665ec22abc9..2885007073b5 100644 --- a/src/diffusers/models/unet_2d_condition_controlnet.py +++ b/src/diffusers/models/unet_2d_condition_controlnet.py @@ -28,6 +28,17 @@ ) +def set_zero_parameters(module): + for p in module.parameters(): + p.detach().zero_() + return module + + +# ControlNet: Zero Convolution +def zero_conv(channels): + return set_zero_parameters(nn.Conv2d(channels, channels, 1, padding=0)) + + class ControlledUNet2DConditionModel(UNet2DConditionModel): pass @@ -40,8 +51,6 @@ def __init__( self, sample_size: Optional[int] = None, in_channels: int = 4, - # out_channels: int = 4, - # center_input_sample: bool = False, flip_sin_to_cos: bool = True, freq_shift: int = 0, down_block_types: Tuple[str] = ( @@ -51,7 +60,6 @@ def __init__( "DownBlock2D", ), mid_block_type: Optional[str] = "UNetMidBlock2DCrossAttn", - # up_block_types: Tuple[str] = ("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"), only_cross_attention: Union[bool, Tuple[bool]] = False, block_out_channels: Tuple[int] = (320, 640, 1280, 1280), layers_per_block: int = 2, @@ -73,12 +81,31 @@ def __init__( time_cond_proj_dim: Optional[int] = None, conv_in_kernel: int = 3, hint_channels: int = 3, - # conv_out_kernel: int = 3, ): super().__init__() self.sample_size = sample_size + # ControlNet specific blocks: Layer configurations are from reference implementation. 
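+        # The hint block encodes the conditioning image (given in pixel space) and
+        # downsamples it 8x via three stride-2 convolutions so its resolution matches
+        # the latent sample; the final convolution is zero-initialized, so the hint
+        # contributes nothing before training starts.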
+ self.input_hint_block = nn.Sequential( + nn.Conv2d(hint_channels, 16, 3, padding=1), + nn.SiLU(), + nn.Conv2d(16, 16, 3, padding=1), + nn.SiLU(), + nn.Conv2d(16, 32, 3, padding=1, stride=2), + nn.SiLU(), + nn.Conv2d(32, 32, 3, padding=1), + nn.SiLU(), + nn.Conv2d(32, 96, 3, padding=1, stride=2), + nn.SiLU(), + nn.Conv2d(96, 96, 3, padding=1), + nn.SiLU(), + nn.Conv2d(96, 256, 3, padding=1, stride=2), + nn.SiLU(), + set_zero_parameters(nn.Conv2d(256, block_out_channels[0], 3, padding=1)), + ) + self.input_zero_conv = zero_conv(block_out_channels[0]) + # input conv_in_padding = (conv_in_kernel - 1) // 2 self.conv_in = nn.Conv2d( @@ -195,6 +222,5 @@ def __init__( # count how many layers upsample the images self.num_upsamplers = 0 - # ################################################# - # TODO: add input_hint_block and zero_convs modules - # ################################################# + # ControlNet specific block + self.middle_block_out = zero_conv(block_out_channels[-1]) From 148b46d391dd6189dda1213231e91fd542c2ff5c Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Wed, 15 Feb 2023 03:35:39 +0900 Subject: [PATCH 007/122] add unet_2d_blocks_controlnet.py - copied from unet_2d_blocks.py as impl CrossAttnDownBlock2D,DownBlock2D - this makes missing key error on loading model --- .../models/unet_2d_blocks_controlnet.py | 315 ++++++++++++++++++ .../models/unet_2d_condition_controlnet.py | 19 +- 2 files changed, 321 insertions(+), 13 deletions(-) create mode 100644 src/diffusers/models/unet_2d_blocks_controlnet.py diff --git a/src/diffusers/models/unet_2d_blocks_controlnet.py b/src/diffusers/models/unet_2d_blocks_controlnet.py new file mode 100644 index 000000000000..bc83dca1813d --- /dev/null +++ b/src/diffusers/models/unet_2d_blocks_controlnet.py @@ -0,0 +1,315 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
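+
+# This module mirrors the down blocks from unet_2d_blocks.py and attaches ControlNet
+# "zero convolutions": 1x1 convolutions whose weights and biases are initialized to
+# zero, so the trainable control branch adds nothing to the frozen UNet at the start
+# of training and only gradually learns to inject the conditioning signal.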
+ +import torch +import torch.nn as nn + +from .unet_2d_blocks import ( + Downsample2D, + DualTransformer2DModel, + ResnetBlock2D, + Transformer2DModel, +) + + +def set_zero_parameters(module): + for p in module.parameters(): + p.detach().zero_() + return module + + +# ControlNet: Zero Convolution +def zero_conv(channels): + return set_zero_parameters(nn.Conv2d(channels, channels, 1, padding=0)) + + +def get_down_block_with_zero_conv( + down_block_type, + num_layers, + in_channels, + out_channels, + temb_channels, + add_downsample, + resnet_eps, + resnet_act_fn, + attn_num_head_channels, + resnet_groups=None, + cross_attention_dim=None, + downsample_padding=None, + dual_cross_attention=False, + use_linear_projection=False, + only_cross_attention=False, + upcast_attention=False, + resnet_time_scale_shift="default", +): + down_block_type = down_block_type[7:] if down_block_type.startswith("UNetRes") else down_block_type + if down_block_type == "DownBlock2D": + return DownBlock2DWithZeroConv( + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + add_downsample=add_downsample, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + resnet_groups=resnet_groups, + downsample_padding=downsample_padding, + resnet_time_scale_shift=resnet_time_scale_shift, + ) + elif down_block_type == "CrossAttnDownBlock2D": + if cross_attention_dim is None: + raise ValueError("cross_attention_dim must be specified for CrossAttnDownBlock2D") + return CrossAttnDownBlock2DWithZeroConv( + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + add_downsample=add_downsample, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + resnet_groups=resnet_groups, + downsample_padding=downsample_padding, + cross_attention_dim=cross_attention_dim, + attn_num_head_channels=attn_num_head_channels, + dual_cross_attention=dual_cross_attention, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention, + upcast_attention=upcast_attention, + resnet_time_scale_shift=resnet_time_scale_shift, + ) + raise NotImplementedError(f"{down_block_type} does not exist.") + + +class CrossAttnDownBlock2DWithZeroConv(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + attn_num_head_channels=1, + cross_attention_dim=1280, + output_scale_factor=1.0, + downsample_padding=1, + add_downsample=True, + dual_cross_attention=False, + use_linear_projection=False, + only_cross_attention=False, + upcast_attention=False, + ): + super().__init__() + resnets = [] + attentions = [] + zero_convs = [] + + self.has_cross_attention = True + self.attn_num_head_channels = attn_num_head_channels + + for i in range(num_layers): + in_channels = in_channels if i == 0 else out_channels + resnets.append( + ResnetBlock2D( + in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ) + zero_convs.append(zero_conv(out_channels)) + if not dual_cross_attention: + attentions.append( + Transformer2DModel( + attn_num_head_channels, + 
out_channels // attn_num_head_channels, + in_channels=out_channels, + num_layers=1, + cross_attention_dim=cross_attention_dim, + norm_num_groups=resnet_groups, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention, + upcast_attention=upcast_attention, + ) + ) + else: + attentions.append( + DualTransformer2DModel( + attn_num_head_channels, + out_channels // attn_num_head_channels, + in_channels=out_channels, + num_layers=1, + cross_attention_dim=cross_attention_dim, + norm_num_groups=resnet_groups, + ) + ) + self.attentions = nn.ModuleList(attentions) + self.resnets = nn.ModuleList(resnets) + self.zero_convs = nn.ModuleList(zero_convs) + + if add_downsample: + self.downsamplers = nn.ModuleList( + [ + Downsample2D( + out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op" + ) + ] + ) + else: + self.downsamplers = None + + self.gradient_checkpointing = False + + def forward( + self, hidden_states, temb=None, encoder_hidden_states=None, attention_mask=None, cross_attention_kwargs=None + ): + # TODO(Patrick, William) - attention mask is not used + output_states = () + + ######################### + # TODO: support zero_conv + ######################### + + for resnet, attn in zip(self.resnets, self.attentions): + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module, return_dict=None): + def custom_forward(*inputs): + if return_dict is not None: + return module(*inputs, return_dict=return_dict) + else: + return module(*inputs) + + return custom_forward + + hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb) + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(attn, return_dict=False), + hidden_states, + encoder_hidden_states, + cross_attention_kwargs, + )[0] + else: + hidden_states = resnet(hidden_states, temb) + hidden_states = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + ).sample + + output_states += (hidden_states,) + + if self.downsamplers is not None: + for downsampler in self.downsamplers: + hidden_states = downsampler(hidden_states) + + output_states += (hidden_states,) + + return hidden_states, output_states + + +class DownBlock2DWithZeroConv(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + output_scale_factor=1.0, + add_downsample=True, + downsample_padding=1, + ): + super().__init__() + resnets = [] + zero_convs = [] + + for i in range(num_layers): + in_channels = in_channels if i == 0 else out_channels + resnets.append( + ResnetBlock2D( + in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ) + zero_convs.append(zero_conv(out_channels)) + + self.resnets = nn.ModuleList(resnets) + self.zero_convs = nn.ModuleList(zero_convs) + + if add_downsample: + self.downsamplers = nn.ModuleList( + [ + Downsample2D( + out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op" + ) + ] + ) + else: + 
self.downsamplers = None + + self.gradient_checkpointing = False + + def forward(self, hidden_states, temb=None): + output_states = () + + ######################### + # TODO: support zero_conv + ######################### + + for resnet in self.resnets: + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb) + else: + hidden_states = resnet(hidden_states, temb) + + output_states += (hidden_states,) + + if self.downsamplers is not None: + for downsampler in self.downsamplers: + hidden_states = downsampler(hidden_states) + + output_states += (hidden_states,) + + return hidden_states, output_states diff --git a/src/diffusers/models/unet_2d_condition_controlnet.py b/src/diffusers/models/unet_2d_condition_controlnet.py index 2885007073b5..d02d7b8566fb 100644 --- a/src/diffusers/models/unet_2d_condition_controlnet.py +++ b/src/diffusers/models/unet_2d_condition_controlnet.py @@ -24,19 +24,12 @@ from .unet_2d_blocks import ( UNetMidBlock2DCrossAttn, UNetMidBlock2DSimpleCrossAttn, - get_down_block, ) - - -def set_zero_parameters(module): - for p in module.parameters(): - p.detach().zero_() - return module - - -# ControlNet: Zero Convolution -def zero_conv(channels): - return set_zero_parameters(nn.Conv2d(channels, channels, 1, padding=0)) +from .unet_2d_blocks_controlnet import ( + get_down_block_with_zero_conv, + set_zero_parameters, + zero_conv, +) class ControlledUNet2DConditionModel(UNet2DConditionModel): @@ -165,7 +158,7 @@ def __init__( output_channel = block_out_channels[i] is_final_block = i == len(block_out_channels) - 1 - down_block = get_down_block( + down_block = get_down_block_with_zero_conv( down_block_type, num_layers=layers_per_block, in_channels=input_channel, From 584edfd50b6bf36a3ae0eb58f8fb506f77b82ca2 Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Wed, 15 Feb 2023 23:24:42 +0900 Subject: [PATCH 008/122] Add loading for input_hint_block, zero_convs and middle_block_out - this makes no error message on model loading --- .../models/unet_2d_blocks_controlnet.py | 6 +++-- .../stable_diffusion/convert_from_ckpt.py | 27 ++++++++++++++++--- 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/src/diffusers/models/unet_2d_blocks_controlnet.py b/src/diffusers/models/unet_2d_blocks_controlnet.py index bc83dca1813d..a06f0e4dfda6 100644 --- a/src/diffusers/models/unet_2d_blocks_controlnet.py +++ b/src/diffusers/models/unet_2d_blocks_controlnet.py @@ -166,7 +166,6 @@ def __init__( ) self.attentions = nn.ModuleList(attentions) self.resnets = nn.ModuleList(resnets) - self.zero_convs = nn.ModuleList(zero_convs) if add_downsample: self.downsamplers = nn.ModuleList( @@ -176,9 +175,11 @@ def __init__( ) ] ) + zero_convs.append(zero_conv(out_channels)) else: self.downsamplers = None + self.zero_convs = nn.ModuleList(zero_convs) self.gradient_checkpointing = False def forward( @@ -269,7 +270,6 @@ def __init__( zero_convs.append(zero_conv(out_channels)) self.resnets = nn.ModuleList(resnets) - self.zero_convs = nn.ModuleList(zero_convs) if add_downsample: self.downsamplers = nn.ModuleList( @@ -279,9 +279,11 @@ def __init__( ) ] ) + zero_convs.append(zero_conv(out_channels)) else: self.downsamplers = None + self.zero_convs = nn.ModuleList(zero_convs) self.gradient_checkpointing = False def forward(self, hidden_states, temb=None): diff --git 
a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py index 03bdee0e9004..82a64ffa9694 100644 --- a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +++ b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py @@ -612,9 +612,30 @@ def convert_controlnet_checkpoint(checkpoint, config): attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config ) - # ################################################# - # TODO: add to load input_hint_block and zero_convs - # ################################################# + # ControlNet Specific Weight & Biases + + # input_hint_block + for i in range(8): + key = f"input_hint_block.{i*2}." + new_checkpoint[key + "weight"] = unet_state_dict.pop(key + "weight") + new_checkpoint[key + "bias"] = unet_state_dict.pop(key + "bias") + + # zero_convs + new_checkpoint["input_zero_conv.weight"] = unet_state_dict.pop("zero_convs.0.0.weight") + new_checkpoint["input_zero_conv.bias"] = unet_state_dict.pop("zero_convs.0.0.bias") + for i in range(1, num_input_blocks): + block_id = (i - 1) // (config["layers_per_block"] + 1) + layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1) + key_dst = f"down_blocks.{block_id}.zero_convs.{layer_in_block_id}." + key_src = f"zero_convs.{i}.0." + new_checkpoint[key_dst + "weight"] = unet_state_dict.pop(key_src + "weight") + new_checkpoint[key_dst + "bias"] = unet_state_dict.pop(key_src + "bias") + + # middle block out + key_dst = "middle_block_out." + key_src = "middle_block_out.0." + new_checkpoint[key_dst + "weight"] = unet_state_dict.pop(key_src + "weight") + new_checkpoint[key_dst + "bias"] = unet_state_dict.pop(key_src + "bias") return new_checkpoint From 0f9781c1828d0b6058f23bda082f8733778a1e81 Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Thu, 16 Feb 2023 00:00:53 +0900 Subject: [PATCH 009/122] Copy from UNet2DConditionalModel except __init__ --- .../models/unet_2d_condition_controlnet.py | 289 +++++++++++++++++- 1 file changed, 288 insertions(+), 1 deletion(-) diff --git a/src/diffusers/models/unet_2d_condition_controlnet.py b/src/diffusers/models/unet_2d_condition_controlnet.py index d02d7b8566fb..78ac88a4122f 100644 --- a/src/diffusers/models/unet_2d_condition_controlnet.py +++ b/src/diffusers/models/unet_2d_condition_controlnet.py @@ -12,24 +12,35 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Union +import torch import torch.nn as nn from ..configuration_utils import ConfigMixin, register_to_config from ..loaders import UNet2DConditionLoadersMixin +from ..utils import logging from . 
import UNet2DConditionModel +from .cross_attention import AttnProcessor from .embeddings import GaussianFourierProjection, TimestepEmbedding, Timesteps from .modeling_utils import ModelMixin from .unet_2d_blocks import ( + CrossAttnDownBlock2D, + CrossAttnUpBlock2D, + DownBlock2D, UNetMidBlock2DCrossAttn, UNetMidBlock2DSimpleCrossAttn, + UpBlock2D, ) from .unet_2d_blocks_controlnet import ( get_down_block_with_zero_conv, set_zero_parameters, zero_conv, ) +from .unet_2d_condition import UNet2DConditionOutput + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name class ControlledUNet2DConditionModel(UNet2DConditionModel): @@ -217,3 +228,279 @@ def __init__( # ControlNet specific block self.middle_block_out = zero_conv(block_out_channels[-1]) + + @property + def attn_processors(self) -> Dict[str, AttnProcessor]: + r""" + Returns: + `dict` of attention processors: A dictionary containing all attention processors used in the model with + indexed by its weight name. + """ + # set recursively + processors = {} + + def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttnProcessor]): + if hasattr(module, "set_processor"): + processors[f"{name}.processor"] = module.processor + + for sub_name, child in module.named_children(): + fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) + + return processors + + for name, module in self.named_children(): + fn_recursive_add_processors(name, module, processors) + + return processors + + def set_attn_processor(self, processor: Union[AttnProcessor, Dict[str, AttnProcessor]]): + r""" + Parameters: + `processor (`dict` of `AttnProcessor` or `AttnProcessor`): + The instantiated processor class or a dictionary of processor classes that will be set as the processor + of **all** `CrossAttention` layers. + In case `processor` is a dict, the key needs to define the path to the corresponding cross attention processor. This is strongly recommended when setting trainablae attention processors.: + + """ + count = len(self.attn_processors.keys()) + + if isinstance(processor, dict) and len(processor) != count: + raise ValueError( + f"A dict of processors was passed, but the number of processors {len(processor)} does not match the" + f" number of attention layers: {count}. Please make sure to pass {count} processor classes." + ) + + def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor): + if hasattr(module, "set_processor"): + if not isinstance(processor, dict): + module.set_processor(processor) + else: + module.set_processor(processor.pop(f"{name}.processor")) + + for sub_name, child in module.named_children(): + fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) + + for name, module in self.named_children(): + fn_recursive_attn_processor(name, module, processor) + + def set_attention_slice(self, slice_size): + r""" + Enable sliced attention computation. + + When this option is enabled, the attention module will split the input tensor in slices, to compute attention + in several steps. This is useful to save some memory in exchange for a small speed decrease. + + Args: + slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`): + When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If + `"max"`, maxium amount of memory will be saved by running only one slice at a time. If a number is + provided, uses as many slices as `attention_head_dim // slice_size`. 
In this case, `attention_head_dim` + must be a multiple of `slice_size`. + """ + sliceable_head_dims = [] + + def fn_recursive_retrieve_slicable_dims(module: torch.nn.Module): + if hasattr(module, "set_attention_slice"): + sliceable_head_dims.append(module.sliceable_head_dim) + + for child in module.children(): + fn_recursive_retrieve_slicable_dims(child) + + # retrieve number of attention layers + for module in self.children(): + fn_recursive_retrieve_slicable_dims(module) + + num_slicable_layers = len(sliceable_head_dims) + + if slice_size == "auto": + # half the attention head size is usually a good trade-off between + # speed and memory + slice_size = [dim // 2 for dim in sliceable_head_dims] + elif slice_size == "max": + # make smallest slice possible + slice_size = num_slicable_layers * [1] + + slice_size = num_slicable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size + + if len(slice_size) != len(sliceable_head_dims): + raise ValueError( + f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different" + f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}." + ) + + for i in range(len(slice_size)): + size = slice_size[i] + dim = sliceable_head_dims[i] + if size is not None and size > dim: + raise ValueError(f"size {size} has to be smaller or equal to {dim}.") + + # Recursively walk through all the children. + # Any children which exposes the set_attention_slice method + # gets the message + def fn_recursive_set_attention_slice(module: torch.nn.Module, slice_size: List[int]): + if hasattr(module, "set_attention_slice"): + module.set_attention_slice(slice_size.pop()) + + for child in module.children(): + fn_recursive_set_attention_slice(child, slice_size) + + reversed_slice_size = list(reversed(slice_size)) + for module in self.children(): + fn_recursive_set_attention_slice(module, reversed_slice_size) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, (CrossAttnDownBlock2D, DownBlock2D, CrossAttnUpBlock2D, UpBlock2D)): + module.gradient_checkpointing = value + + def forward( + self, + sample: torch.FloatTensor, + timestep: Union[torch.Tensor, float, int], + encoder_hidden_states: torch.Tensor, + class_labels: Optional[torch.Tensor] = None, + timestep_cond: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + return_dict: bool = True, + ) -> Union[UNet2DConditionOutput, Tuple]: + r""" + Args: + sample (`torch.FloatTensor`): (batch, channel, height, width) noisy inputs tensor + timestep (`torch.FloatTensor` or `float` or `int`): (batch) timesteps + encoder_hidden_states (`torch.FloatTensor`): (batch, sequence_length, feature_dim) encoder hidden states + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttnProcessor` as defined under + `self.processor` in + [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). + + Returns: + [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`: + [`~models.unet_2d_condition.UNet2DConditionOutput`] if `return_dict` is True, otherwise a `tuple`. When + returning a tuple, the first element is the sample tensor. 
+ """ + # By default samples have to be AT least a multiple of the overall upsampling factor. + # The overall upsampling factor is equal to 2 ** (# num of upsampling layears). + # However, the upsampling interpolation output size can be forced to fit any upsampling size + # on the fly if necessary. + default_overall_up_factor = 2**self.num_upsamplers + + # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor` + forward_upsample_size = False + upsample_size = None + + if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]): + logger.info("Forward upsample size to force interpolation output size.") + forward_upsample_size = True + + # prepare attention_mask + if attention_mask is not None: + attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0 + attention_mask = attention_mask.unsqueeze(1) + + # 0. center input if necessary + if self.config.center_input_sample: + sample = 2 * sample - 1.0 + + # 1. time + timesteps = timestep + if not torch.is_tensor(timesteps): + # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can + # This would be a good case for the `match` statement (Python 3.10+) + is_mps = sample.device.type == "mps" + if isinstance(timestep, float): + dtype = torch.float32 if is_mps else torch.float64 + else: + dtype = torch.int32 if is_mps else torch.int64 + timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device) + elif len(timesteps.shape) == 0: + timesteps = timesteps[None].to(sample.device) + + # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + timesteps = timesteps.expand(sample.shape[0]) + + t_emb = self.time_proj(timesteps) + + # timesteps does not contain any weights and will always return f32 tensors + # but time_embedding might actually be running in fp16. so we need to cast here. + # there might be better ways to encapsulate this. + t_emb = t_emb.to(dtype=self.dtype) + + emb = self.time_embedding(t_emb, timestep_cond) + + if self.class_embedding is not None: + if class_labels is None: + raise ValueError("class_labels should be provided when num_class_embeds > 0") + + if self.config.class_embed_type == "timestep": + class_labels = self.time_proj(class_labels) + + class_emb = self.class_embedding(class_labels).to(dtype=self.dtype) + emb = emb + class_emb + + # 2. pre-process + sample = self.conv_in(sample) + + # 3. down + down_block_res_samples = (sample,) + for downsample_block in self.down_blocks: + if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention: + sample, res_samples = downsample_block( + hidden_states=sample, + temb=emb, + encoder_hidden_states=encoder_hidden_states, + attention_mask=attention_mask, + cross_attention_kwargs=cross_attention_kwargs, + ) + else: + sample, res_samples = downsample_block(hidden_states=sample, temb=emb) + + down_block_res_samples += res_samples + + # 4. mid + if self.mid_block is not None: + sample = self.mid_block( + sample, + emb, + encoder_hidden_states=encoder_hidden_states, + attention_mask=attention_mask, + cross_attention_kwargs=cross_attention_kwargs, + ) + + # 5. 
up + for i, upsample_block in enumerate(self.up_blocks): + is_final_block = i == len(self.up_blocks) - 1 + + res_samples = down_block_res_samples[-len(upsample_block.resnets) :] + down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)] + + # if we have not reached the final block and need to forward the + # upsample size, we do it here + if not is_final_block and forward_upsample_size: + upsample_size = down_block_res_samples[-1].shape[2:] + + if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention: + sample = upsample_block( + hidden_states=sample, + temb=emb, + res_hidden_states_tuple=res_samples, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + upsample_size=upsample_size, + attention_mask=attention_mask, + ) + else: + sample = upsample_block( + hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples, upsample_size=upsample_size + ) + # 6. post-process + if self.conv_norm_out: + sample = self.conv_norm_out(sample) + sample = self.conv_act(sample) + sample = self.conv_out(sample) + + if not return_dict: + return (sample,) + + return UNet2DConditionOutput(sample=sample) From 0327e730237a0f074e9c7241c735b6f8a5aa7806 Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Thu, 16 Feb 2023 00:42:40 +0900 Subject: [PATCH 010/122] Add ultra primitive test for ControlNetModel inference --- .../models/unet_2d_condition_controlnet.py | 33 +----------- ...est_models_unet_2d_condition_controlnet.py | 53 +++++++++++++++++++ 2 files changed, 54 insertions(+), 32 deletions(-) create mode 100644 tests/models/test_models_unet_2d_condition_controlnet.py diff --git a/src/diffusers/models/unet_2d_condition_controlnet.py b/src/diffusers/models/unet_2d_condition_controlnet.py index 78ac88a4122f..6513a6d64bea 100644 --- a/src/diffusers/models/unet_2d_condition_controlnet.py +++ b/src/diffusers/models/unet_2d_condition_controlnet.py @@ -55,6 +55,7 @@ def __init__( self, sample_size: Optional[int] = None, in_channels: int = 4, + center_input_sample: bool = False, flip_sin_to_cos: bool = True, freq_shift: int = 0, down_block_types: Tuple[str] = ( @@ -468,38 +469,6 @@ def forward( cross_attention_kwargs=cross_attention_kwargs, ) - # 5. up - for i, upsample_block in enumerate(self.up_blocks): - is_final_block = i == len(self.up_blocks) - 1 - - res_samples = down_block_res_samples[-len(upsample_block.resnets) :] - down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)] - - # if we have not reached the final block and need to forward the - # upsample size, we do it here - if not is_final_block and forward_upsample_size: - upsample_size = down_block_res_samples[-1].shape[2:] - - if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention: - sample = upsample_block( - hidden_states=sample, - temb=emb, - res_hidden_states_tuple=res_samples, - encoder_hidden_states=encoder_hidden_states, - cross_attention_kwargs=cross_attention_kwargs, - upsample_size=upsample_size, - attention_mask=attention_mask, - ) - else: - sample = upsample_block( - hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples, upsample_size=upsample_size - ) - # 6. 
post-process - if self.conv_norm_out: - sample = self.conv_norm_out(sample) - sample = self.conv_act(sample) - sample = self.conv_out(sample) - if not return_dict: return (sample,) diff --git a/tests/models/test_models_unet_2d_condition_controlnet.py b/tests/models/test_models_unet_2d_condition_controlnet.py new file mode 100644 index 000000000000..3029ab88d19b --- /dev/null +++ b/tests/models/test_models_unet_2d_condition_controlnet.py @@ -0,0 +1,53 @@ +import torch +import pytest + +from diffusers import ControlNetModel, UNet2DConditionModel + + +# config from ControlNet_SD1.5 +unet_config = { + "sample_size": 64, + "in_channels": 4, + "out_channels": 4, + "down_block_types": ("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D"), + "up_block_types": ("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"), + "block_out_channels": (320, 640, 1280, 1280), + "layers_per_block": 2, + "cross_attention_dim": 768, + "attention_head_dim": 8, + "use_linear_projection": False, + "upcast_attention": False, +} + +ctrlnet_config = { + "sample_size": 64, + "in_channels": 4, + "down_block_types": ("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D"), + "block_out_channels": (320, 640, 1280, 1280), + "layers_per_block": 2, + "cross_attention_dim": 768, + "attention_head_dim": 8, + "use_linear_projection": False, + "hint_channels": 3, + "upcast_attention": False, +} + +################################################################################ +# Scaffold for WIP +# ############################################################################## + +@pytest.mark.skip +def test_unet_inference_without_exception(): + sample = torch.randn((1, 4, 64, 64)).cuda() + timestep = 0 + encoder_hidden_states = torch.randn((1, 77, 768)).cuda() + model = UNet2DConditionModel(**unet_config).cuda() + print(model(sample=sample, timestep=timestep, encoder_hidden_states=encoder_hidden_states)) + + +def test_inference_without_exception(): + sample = torch.randn((1, 4, 64, 64)).cuda() + timestep = 0 + encoder_hidden_states = torch.randn((1, 77, 768)).cuda() + model = ControlNetModel(**ctrlnet_config).cuda() + print(model(sample=sample, timestep=timestep, encoder_hidden_states=encoder_hidden_states)) From e5cabdf795c4832c2be56ec5bdd6881dca039c42 Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Thu, 16 Feb 2023 01:31:08 +0900 Subject: [PATCH 011/122] Support ControlNetModel inference - without exceptions --- .../models/unet_2d_blocks_controlnet.py | 22 ++++--- .../models/unet_2d_condition_controlnet.py | 58 +++++++++---------- ...est_models_unet_2d_condition_controlnet.py | 8 ++- 3 files changed, 43 insertions(+), 45 deletions(-) diff --git a/src/diffusers/models/unet_2d_blocks_controlnet.py b/src/diffusers/models/unet_2d_blocks_controlnet.py index a06f0e4dfda6..605c9939ff10 100644 --- a/src/diffusers/models/unet_2d_blocks_controlnet.py +++ b/src/diffusers/models/unet_2d_blocks_controlnet.py @@ -187,12 +187,9 @@ def forward( ): # TODO(Patrick, William) - attention mask is not used output_states = () + zero_conved_states = () - ######################### - # TODO: support zero_conv - ######################### - - for resnet, attn in zip(self.resnets, self.attentions): + for resnet, attn, zero_conv in zip(self.resnets, self.attentions, self.zero_convs): if self.training and self.gradient_checkpointing: def create_custom_forward(module, return_dict=None): @@ -220,14 +217,16 @@ def custom_forward(*inputs): ).sample output_states += 
(hidden_states,) + zero_conved_states += (zero_conv(hidden_states),) if self.downsamplers is not None: for downsampler in self.downsamplers: hidden_states = downsampler(hidden_states) output_states += (hidden_states,) + zero_conved_states += (self.zero_convs[-1](hidden_states),) - return hidden_states, output_states + return hidden_states, output_states, zero_conved_states class DownBlock2DWithZeroConv(nn.Module): @@ -288,12 +287,9 @@ def __init__( def forward(self, hidden_states, temb=None): output_states = () + zero_conved_states = () - ######################### - # TODO: support zero_conv - ######################### - - for resnet in self.resnets: + for resnet, zero_conv in zip(self.resnets, self.zero_convs): if self.training and self.gradient_checkpointing: def create_custom_forward(module): @@ -307,11 +303,13 @@ def custom_forward(*inputs): hidden_states = resnet(hidden_states, temb) output_states += (hidden_states,) + zero_conved_states += (zero_conv(hidden_states),) if self.downsamplers is not None: for downsampler in self.downsamplers: hidden_states = downsampler(hidden_states) output_states += (hidden_states,) + zero_conved_states += (self.zero_convs[-1](hidden_states),) - return hidden_states, output_states + return hidden_states, output_states, zero_conved_states diff --git a/src/diffusers/models/unet_2d_condition_controlnet.py b/src/diffusers/models/unet_2d_condition_controlnet.py index 6513a6d64bea..4aaf6aea304b 100644 --- a/src/diffusers/models/unet_2d_condition_controlnet.py +++ b/src/diffusers/models/unet_2d_condition_controlnet.py @@ -356,14 +356,16 @@ def _set_gradient_checkpointing(self, module, value=False): def forward( self, sample: torch.FloatTensor, + hint: torch.Tensor, timestep: Union[torch.Tensor, float, int], encoder_hidden_states: torch.Tensor, class_labels: Optional[torch.Tensor] = None, timestep_cond: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - return_dict: bool = True, ) -> Union[UNet2DConditionOutput, Tuple]: + # TODO: fix docstring + r""" Args: sample (`torch.FloatTensor`): (batch, channel, height, width) noisy inputs tensor @@ -381,19 +383,9 @@ def forward( [`~models.unet_2d_condition.UNet2DConditionOutput`] if `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is the sample tensor. """ - # By default samples have to be AT least a multiple of the overall upsampling factor. - # The overall upsampling factor is equal to 2 ** (# num of upsampling layears). - # However, the upsampling interpolation output size can be forced to fit any upsampling size - # on the fly if necessary. - default_overall_up_factor = 2**self.num_upsamplers - - # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor` - forward_upsample_size = False - upsample_size = None - if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]): - logger.info("Forward upsample size to force interpolation output size.") - forward_upsample_size = True + # prepare output + outputs = [] # prepare attention_mask if attention_mask is not None: @@ -440,14 +432,19 @@ def forward( class_emb = self.class_embedding(class_labels).to(dtype=self.dtype) emb = emb + class_emb - # 2. pre-process + # 2. input_hint (ControlNet specific) + guided_hint = self.input_hint_block(hint) + + # 3. pre-process sample = self.conv_in(sample) + sample += guided_hint + outputs.append(self.input_zero_conv(sample)) - # 3. down + # 4. 
down down_block_res_samples = (sample,) for downsample_block in self.down_blocks: if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention: - sample, res_samples = downsample_block( + sample, res_samples, zero_conved_samples = downsample_block( hidden_states=sample, temb=emb, encoder_hidden_states=encoder_hidden_states, @@ -455,21 +452,20 @@ def forward( cross_attention_kwargs=cross_attention_kwargs, ) else: - sample, res_samples = downsample_block(hidden_states=sample, temb=emb) + sample, res_samples, zero_conved_samples = downsample_block(hidden_states=sample, temb=emb) down_block_res_samples += res_samples + outputs += zero_conved_samples + + # 5. mid + assert self.mid_block is not None + sample = self.mid_block( + sample, + emb, + encoder_hidden_states=encoder_hidden_states, + attention_mask=attention_mask, + cross_attention_kwargs=cross_attention_kwargs, + ) + outputs.append(self.middle_block_out(sample)) - # 4. mid - if self.mid_block is not None: - sample = self.mid_block( - sample, - emb, - encoder_hidden_states=encoder_hidden_states, - attention_mask=attention_mask, - cross_attention_kwargs=cross_attention_kwargs, - ) - - if not return_dict: - return (sample,) - - return UNet2DConditionOutput(sample=sample) + return outputs diff --git a/tests/models/test_models_unet_2d_condition_controlnet.py b/tests/models/test_models_unet_2d_condition_controlnet.py index 3029ab88d19b..b2025eb659af 100644 --- a/tests/models/test_models_unet_2d_condition_controlnet.py +++ b/tests/models/test_models_unet_2d_condition_controlnet.py @@ -1,5 +1,5 @@ -import torch import pytest +import torch from diffusers import ControlNetModel, UNet2DConditionModel @@ -36,6 +36,7 @@ # Scaffold for WIP # ############################################################################## + @pytest.mark.skip def test_unet_inference_without_exception(): sample = torch.randn((1, 4, 64, 64)).cuda() @@ -47,7 +48,10 @@ def test_unet_inference_without_exception(): def test_inference_without_exception(): sample = torch.randn((1, 4, 64, 64)).cuda() + hint = torch.randn((1, 3, 512, 512)).cuda() timestep = 0 encoder_hidden_states = torch.randn((1, 77, 768)).cuda() model = ControlNetModel(**ctrlnet_config).cuda() - print(model(sample=sample, timestep=timestep, encoder_hidden_states=encoder_hidden_states)) + outputs = model(sample=sample, hint=hint, timestep=timestep, encoder_hidden_states=encoder_hidden_states) + assert len(outputs) == 12 + 1 # 12layer down and one middle + print(outputs) From 1fc01a35f753f4d45b240e81231067b0126d0423 Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Thu, 16 Feb 2023 01:43:40 +0900 Subject: [PATCH 012/122] copy forward() from UNet2DConditionModel --- .../models/unet_2d_condition_controlnet.py | 153 +++++++++++++++++- 1 file changed, 152 insertions(+), 1 deletion(-) diff --git a/src/diffusers/models/unet_2d_condition_controlnet.py b/src/diffusers/models/unet_2d_condition_controlnet.py index 4aaf6aea304b..8fb3ec6474da 100644 --- a/src/diffusers/models/unet_2d_condition_controlnet.py +++ b/src/diffusers/models/unet_2d_condition_controlnet.py @@ -44,7 +44,158 @@ class ControlledUNet2DConditionModel(UNet2DConditionModel): - pass + def forward( + self, + sample: torch.FloatTensor, + timestep: Union[torch.Tensor, float, int], + encoder_hidden_states: torch.Tensor, + class_labels: Optional[torch.Tensor] = None, + timestep_cond: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + 
return_dict: bool = True, + ) -> Union[UNet2DConditionOutput, Tuple]: + r""" + Args: + sample (`torch.FloatTensor`): (batch, channel, height, width) noisy inputs tensor + timestep (`torch.FloatTensor` or `float` or `int`): (batch) timesteps + encoder_hidden_states (`torch.FloatTensor`): (batch, sequence_length, feature_dim) encoder hidden states + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttnProcessor` as defined under + `self.processor` in + [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). + + Returns: + [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`: + [`~models.unet_2d_condition.UNet2DConditionOutput`] if `return_dict` is True, otherwise a `tuple`. When + returning a tuple, the first element is the sample tensor. + """ + # By default samples have to be AT least a multiple of the overall upsampling factor. + # The overall upsampling factor is equal to 2 ** (# num of upsampling layears). + # However, the upsampling interpolation output size can be forced to fit any upsampling size + # on the fly if necessary. + default_overall_up_factor = 2**self.num_upsamplers + + # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor` + forward_upsample_size = False + upsample_size = None + + if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]): + logger.info("Forward upsample size to force interpolation output size.") + forward_upsample_size = True + + # prepare attention_mask + if attention_mask is not None: + attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0 + attention_mask = attention_mask.unsqueeze(1) + + # 0. center input if necessary + if self.config.center_input_sample: + sample = 2 * sample - 1.0 + + # 1. time + timesteps = timestep + if not torch.is_tensor(timesteps): + # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can + # This would be a good case for the `match` statement (Python 3.10+) + is_mps = sample.device.type == "mps" + if isinstance(timestep, float): + dtype = torch.float32 if is_mps else torch.float64 + else: + dtype = torch.int32 if is_mps else torch.int64 + timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device) + elif len(timesteps.shape) == 0: + timesteps = timesteps[None].to(sample.device) + + # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + timesteps = timesteps.expand(sample.shape[0]) + + t_emb = self.time_proj(timesteps) + + # timesteps does not contain any weights and will always return f32 tensors + # but time_embedding might actually be running in fp16. so we need to cast here. + # there might be better ways to encapsulate this. + t_emb = t_emb.to(dtype=self.dtype) + + emb = self.time_embedding(t_emb, timestep_cond) + + if self.class_embedding is not None: + if class_labels is None: + raise ValueError("class_labels should be provided when num_class_embeds > 0") + + if self.config.class_embed_type == "timestep": + class_labels = self.time_proj(class_labels) + + class_emb = self.class_embedding(class_labels).to(dtype=self.dtype) + emb = emb + class_emb + + # 2. pre-process + sample = self.conv_in(sample) + + # 3. 
down + down_block_res_samples = (sample,) + for downsample_block in self.down_blocks: + if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention: + sample, res_samples = downsample_block( + hidden_states=sample, + temb=emb, + encoder_hidden_states=encoder_hidden_states, + attention_mask=attention_mask, + cross_attention_kwargs=cross_attention_kwargs, + ) + else: + sample, res_samples = downsample_block(hidden_states=sample, temb=emb) + + down_block_res_samples += res_samples + + # 4. mid + if self.mid_block is not None: + sample = self.mid_block( + sample, + emb, + encoder_hidden_states=encoder_hidden_states, + attention_mask=attention_mask, + cross_attention_kwargs=cross_attention_kwargs, + ) + + # 5. up + for i, upsample_block in enumerate(self.up_blocks): + is_final_block = i == len(self.up_blocks) - 1 + + res_samples = down_block_res_samples[-len(upsample_block.resnets) :] + down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)] + + # if we have not reached the final block and need to forward the + # upsample size, we do it here + if not is_final_block and forward_upsample_size: + upsample_size = down_block_res_samples[-1].shape[2:] + + if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention: + sample = upsample_block( + hidden_states=sample, + temb=emb, + res_hidden_states_tuple=res_samples, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + upsample_size=upsample_size, + attention_mask=attention_mask, + ) + else: + sample = upsample_block( + hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples, upsample_size=upsample_size + ) + # 6. post-process + if self.conv_norm_out: + sample = self.conv_norm_out(sample) + sample = self.conv_act(sample) + sample = self.conv_out(sample) + + if not return_dict: + return (sample,) + + return UNet2DConditionOutput(sample=sample) class ControlNetModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin): From cb7bb9a0f244fa1547c7ae4d9291bfecb8f70d5d Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Thu, 16 Feb 2023 02:17:16 +0900 Subject: [PATCH 013/122] Impl ControlledUNet2DConditionModel inference - test_controlled_unet_inference passed --- .../models/unet_2d_condition_controlnet.py | 18 ++++++++- ...est_models_unet_2d_condition_controlnet.py | 39 +++++++++++++++---- 2 files changed, 48 insertions(+), 9 deletions(-) diff --git a/src/diffusers/models/unet_2d_condition_controlnet.py b/src/diffusers/models/unet_2d_condition_controlnet.py index 8fb3ec6474da..01bfdd8cb378 100644 --- a/src/diffusers/models/unet_2d_condition_controlnet.py +++ b/src/diffusers/models/unet_2d_condition_controlnet.py @@ -47,6 +47,7 @@ class ControlledUNet2DConditionModel(UNet2DConditionModel): def forward( self, sample: torch.FloatTensor, + control: List[torch.Tensor], timestep: Union[torch.Tensor, float, int], encoder_hidden_states: torch.Tensor, class_labels: Optional[torch.Tensor] = None, @@ -55,6 +56,8 @@ def forward( cross_attention_kwargs: Optional[Dict[str, Any]] = None, return_dict: bool = True, ) -> Union[UNet2DConditionOutput, Tuple]: + # TODO: Fix Docstrings + r""" Args: sample (`torch.FloatTensor`): (batch, channel, height, width) noisy inputs tensor @@ -160,13 +163,21 @@ def forward( cross_attention_kwargs=cross_attention_kwargs, ) - # 5. up + # 5. apply middle_block_out output + sample += control.pop() + + # 6. 
up for i, upsample_block in enumerate(self.up_blocks): is_final_block = i == len(self.up_blocks) - 1 res_samples = down_block_res_samples[-len(upsample_block.resnets) :] down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)] + # apply controlnet downblock output + control_samples = control[-len(upsample_block.resnets) :] + control = control[: -len(upsample_block.resnets)] + res_samples = [r + c for r, c in zip(res_samples, control_samples)] + # if we have not reached the final block and need to forward the # upsample size, we do it here if not is_final_block and forward_upsample_size: @@ -186,7 +197,10 @@ def forward( sample = upsample_block( hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples, upsample_size=upsample_size ) - # 6. post-process + + assert len(control) == 0, f"must consume all control array ({len(control)})" + + # 7. post-process if self.conv_norm_out: sample = self.conv_norm_out(sample) sample = self.conv_act(sample) diff --git a/tests/models/test_models_unet_2d_condition_controlnet.py b/tests/models/test_models_unet_2d_condition_controlnet.py index b2025eb659af..d3ec06ccc6b3 100644 --- a/tests/models/test_models_unet_2d_condition_controlnet.py +++ b/tests/models/test_models_unet_2d_condition_controlnet.py @@ -1,7 +1,7 @@ import pytest import torch -from diffusers import ControlNetModel, UNet2DConditionModel +from diffusers import ControlledUNet2DConditionModel, ControlNetModel, UNet2DConditionModel # config from ControlNet_SD1.5 @@ -33,8 +33,8 @@ } ################################################################################ -# Scaffold for WIP -# ############################################################################## +# WIP +################################################################################ @pytest.mark.skip @@ -43,15 +43,40 @@ def test_unet_inference_without_exception(): timestep = 0 encoder_hidden_states = torch.randn((1, 77, 768)).cuda() model = UNet2DConditionModel(**unet_config).cuda() - print(model(sample=sample, timestep=timestep, encoder_hidden_states=encoder_hidden_states)) + model.eval() + with torch.no_grad(): + out = model(sample=sample, timestep=timestep, encoder_hidden_states=encoder_hidden_states) + assert out.sample.shape == (1, 4, 64, 64) + print(out.sample) -def test_inference_without_exception(): +def controlnet_inference(): sample = torch.randn((1, 4, 64, 64)).cuda() hint = torch.randn((1, 3, 512, 512)).cuda() timestep = 0 encoder_hidden_states = torch.randn((1, 77, 768)).cuda() model = ControlNetModel(**ctrlnet_config).cuda() - outputs = model(sample=sample, hint=hint, timestep=timestep, encoder_hidden_states=encoder_hidden_states) - assert len(outputs) == 12 + 1 # 12layer down and one middle + model.eval() + with torch.no_grad(): + outputs = model(sample=sample, hint=hint, timestep=timestep, encoder_hidden_states=encoder_hidden_states) + return outputs + + +@pytest.mark.skip +def test_controlnet_inference(): + outputs = controlnet_inference() + assert len(outputs) == 12 + 1 # 12 layer down and one middle print(outputs) + + +def test_controlled_unet_inference(): + sample = torch.randn((1, 4, 64, 64)).cuda() + control = controlnet_inference() + timestep = 0 + encoder_hidden_states = torch.randn((1, 77, 768)).cuda() + model = ControlledUNet2DConditionModel(**unet_config).cuda() + model.eval() + with torch.no_grad(): + out = model(sample=sample, control=control, timestep=timestep, encoder_hidden_states=encoder_hidden_states) + assert out.sample.shape == (1, 4, 64, 64) + print(out.sample) 
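
At this point in the series the two halves can be exercised end to end: ControlNetModel takes the noisy latents plus a pixel-space hint and returns 13 tensors (12 zero-conved down-path residuals and one mid-block output for the SD 1.5 layout), which ControlledUNet2DConditionModel adds to its own mid-block output and skip connections. The following is a minimal single-step sketch that mirrors the tests above; the config dictionaries are reduced copies of the ones in test_models_unet_2d_condition_controlnet.py, a CUDA device is assumed, and the names and signatures reflect this intermediate state of the branch (the next patch already starts refactoring them).

import torch

from diffusers import ControlledUNet2DConditionModel, ControlNetModel

# Reduced copies of the config dicts from the test file above (ControlNet SD 1.5 layout).
unet_config = dict(
    sample_size=64, in_channels=4, out_channels=4,
    down_block_types=("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D"),
    up_block_types=("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"),
    block_out_channels=(320, 640, 1280, 1280), layers_per_block=2,
    cross_attention_dim=768, attention_head_dim=8,
)
ctrlnet_config = dict(
    sample_size=64, in_channels=4, hint_channels=3,
    down_block_types=("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D"),
    block_out_channels=(320, 640, 1280, 1280), layers_per_block=2,
    cross_attention_dim=768, attention_head_dim=8,
)

controlnet = ControlNetModel(**ctrlnet_config).cuda().eval()
unet = ControlledUNet2DConditionModel(**unet_config).cuda().eval()

sample = torch.randn((1, 4, 64, 64)).cuda()                # noisy latents
hint = torch.randn((1, 3, 512, 512)).cuda()                # conditioning image in pixel space
encoder_hidden_states = torch.randn((1, 77, 768)).cuda()   # text encoder hidden states
timestep = 0

with torch.no_grad():
    # 12 down-path residuals + 1 mid-block output, each passed through a zero conv
    control = controlnet(
        sample=sample, hint=hint, timestep=timestep, encoder_hidden_states=encoder_hidden_states
    )
    # the controlled UNet pops the mid-block output and adds the remaining residuals
    # to its own skip connections during the up pass
    out = unet(
        sample=sample, control=control, timestep=timestep, encoder_hidden_states=encoder_hidden_states
    )

assert out.sample.shape == (1, 4, 64, 64)
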
From 87ed105cdbe42684f97d6a1317fd0429e399220e Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Thu, 16 Feb 2023 02:29:20 +0900 Subject: [PATCH 014/122] Frozen weight & biases for training --- .../models/unet_2d_condition_controlnet.py | 119 +++++++++--------- 1 file changed, 61 insertions(+), 58 deletions(-) diff --git a/src/diffusers/models/unet_2d_condition_controlnet.py b/src/diffusers/models/unet_2d_condition_controlnet.py index 01bfdd8cb378..99d990e7c281 100644 --- a/src/diffusers/models/unet_2d_condition_controlnet.py +++ b/src/diffusers/models/unet_2d_condition_controlnet.py @@ -57,6 +57,7 @@ def forward( return_dict: bool = True, ) -> Union[UNet2DConditionOutput, Tuple]: # TODO: Fix Docstrings + # TODO: add only_mid_control option r""" Args: @@ -98,70 +99,72 @@ def forward( if self.config.center_input_sample: sample = 2 * sample - 1.0 - # 1. time - timesteps = timestep - if not torch.is_tensor(timesteps): - # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can - # This would be a good case for the `match` statement (Python 3.10+) - is_mps = sample.device.type == "mps" - if isinstance(timestep, float): - dtype = torch.float32 if is_mps else torch.float64 - else: - dtype = torch.int32 if is_mps else torch.int64 - timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device) - elif len(timesteps.shape) == 0: - timesteps = timesteps[None].to(sample.device) - - # broadcast to batch dimension in a way that's compatible with ONNX/Core ML - timesteps = timesteps.expand(sample.shape[0]) - - t_emb = self.time_proj(timesteps) - - # timesteps does not contain any weights and will always return f32 tensors - # but time_embedding might actually be running in fp16. so we need to cast here. - # there might be better ways to encapsulate this. - t_emb = t_emb.to(dtype=self.dtype) - - emb = self.time_embedding(t_emb, timestep_cond) - - if self.class_embedding is not None: - if class_labels is None: - raise ValueError("class_labels should be provided when num_class_embeds > 0") - - if self.config.class_embed_type == "timestep": - class_labels = self.time_proj(class_labels) - - class_emb = self.class_embedding(class_labels).to(dtype=self.dtype) - emb = emb + class_emb + # Frozen weight & biases from 1 to 4 for training + with torch.no_grad(): + # 1. time + timesteps = timestep + if not torch.is_tensor(timesteps): + # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can + # This would be a good case for the `match` statement (Python 3.10+) + is_mps = sample.device.type == "mps" + if isinstance(timestep, float): + dtype = torch.float32 if is_mps else torch.float64 + else: + dtype = torch.int32 if is_mps else torch.int64 + timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device) + elif len(timesteps.shape) == 0: + timesteps = timesteps[None].to(sample.device) + + # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + timesteps = timesteps.expand(sample.shape[0]) + + t_emb = self.time_proj(timesteps) + + # timesteps does not contain any weights and will always return f32 tensors + # but time_embedding might actually be running in fp16. so we need to cast here. + # there might be better ways to encapsulate this. 
+ t_emb = t_emb.to(dtype=self.dtype) + + emb = self.time_embedding(t_emb, timestep_cond) + + if self.class_embedding is not None: + if class_labels is None: + raise ValueError("class_labels should be provided when num_class_embeds > 0") + + if self.config.class_embed_type == "timestep": + class_labels = self.time_proj(class_labels) + + class_emb = self.class_embedding(class_labels).to(dtype=self.dtype) + emb = emb + class_emb + + # 2. pre-process + sample = self.conv_in(sample) + + # 3. down + down_block_res_samples = (sample,) + for downsample_block in self.down_blocks: + if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention: + sample, res_samples = downsample_block( + hidden_states=sample, + temb=emb, + encoder_hidden_states=encoder_hidden_states, + attention_mask=attention_mask, + cross_attention_kwargs=cross_attention_kwargs, + ) + else: + sample, res_samples = downsample_block(hidden_states=sample, temb=emb) - # 2. pre-process - sample = self.conv_in(sample) + down_block_res_samples += res_samples - # 3. down - down_block_res_samples = (sample,) - for downsample_block in self.down_blocks: - if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention: - sample, res_samples = downsample_block( - hidden_states=sample, - temb=emb, + # 4. mid + if self.mid_block is not None: + sample = self.mid_block( + sample, + emb, encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask, cross_attention_kwargs=cross_attention_kwargs, ) - else: - sample, res_samples = downsample_block(hidden_states=sample, temb=emb) - - down_block_res_samples += res_samples - - # 4. mid - if self.mid_block is not None: - sample = self.mid_block( - sample, - emb, - encoder_hidden_states=encoder_hidden_states, - attention_mask=attention_mask, - cross_attention_kwargs=cross_attention_kwargs, - ) # 5. 
apply middle_block_out output sample += control.pop() From efccecc8044df1742b955620672f97e93a7c04c3 Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Fri, 17 Feb 2023 00:57:32 +0900 Subject: [PATCH 015/122] Minimized version of ControlNet/ControlledUnet - test_modules_controllnet.py passed --- src/diffusers/models/__init__.py | 1 + src/diffusers/models/controlnet_blocks.py | 79 +++++++++++++++++++++ src/diffusers/models/unet_2d_condition.py | 39 ++++++++++- tests/models/test_models_controlnet.py | 84 +++++++++++++++++++++++ 4 files changed, 202 insertions(+), 1 deletion(-) create mode 100644 src/diffusers/models/controlnet_blocks.py create mode 100644 tests/models/test_models_controlnet.py diff --git a/src/diffusers/models/__init__.py b/src/diffusers/models/__init__.py index 29c0091c4a82..f1071c464eb4 100644 --- a/src/diffusers/models/__init__.py +++ b/src/diffusers/models/__init__.py @@ -17,6 +17,7 @@ if is_torch_available(): from .autoencoder_kl import AutoencoderKL + from .controlnet_blocks import ControlNetInputHintBlock, ControlNetZeroConvBlock from .dual_transformer_2d import DualTransformer2DModel from .modeling_utils import ModelMixin from .prior_transformer import PriorTransformer diff --git a/src/diffusers/models/controlnet_blocks.py b/src/diffusers/models/controlnet_blocks.py new file mode 100644 index 000000000000..7b7941b42675 --- /dev/null +++ b/src/diffusers/models/controlnet_blocks.py @@ -0,0 +1,79 @@ +from typing import List, Tuple + +import torch +import torch.nn as nn + + +def set_zero_parameters(module): + for p in module.parameters(): + p.detach().zero_() + return module + + +# ControlNet: Zero Convolution +def zero_conv(channels): + return set_zero_parameters(nn.Conv2d(channels, channels, 1, padding=0)) + + +class ControlNetInputHintBlock(nn.Module): + def __init__(self, hint_channels: int = 3, channels: int = 320): + super().__init__() + # Layer configurations are from reference implementation. 
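+        # The hint image (e.g. a 512x512 RGB map, as in the tests below) is downsampled 8x
+        # by the three stride-2 convolutions so it matches the latent resolution, while the
+        # channel count grows 3 -> 16 -> 32 -> 96 -> 256 -> `channels` (320 by default).
+        # The final projection is zero-initialized, so the hint contributes nothing at the
+        # start of training.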
+ self.input_hint_block = nn.Sequential( + nn.Conv2d(hint_channels, 16, 3, padding=1), + nn.SiLU(), + nn.Conv2d(16, 16, 3, padding=1), + nn.SiLU(), + nn.Conv2d(16, 32, 3, padding=1, stride=2), + nn.SiLU(), + nn.Conv2d(32, 32, 3, padding=1), + nn.SiLU(), + nn.Conv2d(32, 96, 3, padding=1, stride=2), + nn.SiLU(), + nn.Conv2d(96, 96, 3, padding=1), + nn.SiLU(), + nn.Conv2d(96, 256, 3, padding=1, stride=2), + nn.SiLU(), + set_zero_parameters(nn.Conv2d(256, channels, 3, padding=1)), + ) + + def forward(self, hint: torch.Tensor): + return self.input_hint_block(hint) + + +class ControlNetZeroConvBlock(nn.Module): + def __init__( + self, + block_out_channels: Tuple[int] = (320, 640, 1280, 1280), + down_block_types: Tuple[str] = ( + "CrossAttnDownBlock2D", + "CrossAttnDownBlock2D", + "CrossAttnDownBlock2D", + "DownBlock2D", + ), + layers_per_block: int = 2, + ): + super().__init__() + self.input_zero_conv = zero_conv(block_out_channels[0]) + zero_convs = [] + for i, down_block_type in enumerate(down_block_types): + output_channel = block_out_channels[i] + is_final_block = i == len(block_out_channels) - 1 + for _ in range(layers_per_block): + zero_convs.append(zero_conv(output_channel)) + if not is_final_block: + zero_convs.append(zero_conv(output_channel)) + self.zero_convs = nn.ModuleList(zero_convs) + self.mid_zero_conv = zero_conv(block_out_channels[-1]) + + def forward( + self, + down_block_res_samples: List[torch.Tensor], + mid_block_sample: torch.Tensor, + ) -> List[torch.Tensor]: + outputs = [] + outputs.append(self.input_zero_conv(down_block_res_samples[0])) + for res_sample, zero_conv in zip(down_block_res_samples[1:], self.zero_convs): + outputs.append(zero_conv(res_sample)) + outputs.append(self.mid_zero_conv(mid_block_sample)) + return outputs diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py index ba2c09b297b9..41b3bcf2fc62 100644 --- a/src/diffusers/models/unet_2d_condition.py +++ b/src/diffusers/models/unet_2d_condition.py @@ -21,6 +21,7 @@ from ..configuration_utils import ConfigMixin, register_to_config from ..loaders import UNet2DConditionLoadersMixin from ..utils import BaseOutput, logging +from .controlnet_blocks import ControlNetInputHintBlock, ControlNetZeroConvBlock from .cross_attention import AttnProcessor from .embeddings import GaussianFourierProjection, TimestepEmbedding, Timesteps from .modeling_utils import ModelMixin @@ -145,6 +146,7 @@ def __init__( time_cond_proj_dim: Optional[int] = None, conv_in_kernel: int = 3, conv_out_kernel: int = 3, + controlnet_hint_channels: Optional[int] = None, ): super().__init__() @@ -323,6 +325,16 @@ def __init__( block_out_channels[0], out_channels, kernel_size=conv_out_kernel, padding=conv_out_padding ) + if controlnet_hint_channels is not None: + self.controlnet_input_hint_block = ControlNetInputHintBlock( + hint_channels=controlnet_hint_channels, channels=block_out_channels[0] + ) + self.controlnet_zero_conv_block = ControlNetZeroConvBlock( + block_out_channels=block_out_channels, + down_block_types=down_block_types, + layers_per_block=layers_per_block, + ) + @property def attn_processors(self) -> Dict[str, AttnProcessor]: r""" @@ -455,8 +467,10 @@ def forward( timestep_cond: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, + controlnet_hint: Optional[torch.Tensor] = None, + control: Optional[List[torch.Tensor]] = None, return_dict: bool = True, - ) -> Union[UNet2DConditionOutput, Tuple]: + ) -> 
Union[UNet2DConditionOutput, Tuple, List[torch.Tensor]]: r""" Args: sample (`torch.FloatTensor`): (batch, channel, height, width) noisy inputs tensor @@ -535,6 +549,8 @@ def forward( # 2. pre-process sample = self.conv_in(sample) + if controlnet_hint is not None: + sample += self.controlnet_input_hint_block(controlnet_hint) # 3. down down_block_res_samples = (sample,) @@ -562,6 +578,16 @@ def forward( cross_attention_kwargs=cross_attention_kwargs, ) + if controlnet_hint is not None: + # ControlNet: zero convs + return self.controlnet_zero_conv_block( + down_block_res_samples=down_block_res_samples, mid_block_sample=sample + ) + + if control is not None: + # ControlledUnet: apply mid_zero_conv output + sample += control.pop() + # 5. up for i, upsample_block in enumerate(self.up_blocks): is_final_block = i == len(self.up_blocks) - 1 @@ -569,6 +595,12 @@ def forward( res_samples = down_block_res_samples[-len(upsample_block.resnets) :] down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)] + if control is not None: + # ControlledUnet: apply ControlNet downblock zero_convs output + control_samples = control[-len(upsample_block.resnets) :] + control = control[: -len(upsample_block.resnets)] + res_samples = [r + c for r, c in zip(res_samples, control_samples)] + # if we have not reached the final block and need to forward the # upsample size, we do it here if not is_final_block and forward_upsample_size: @@ -588,6 +620,11 @@ def forward( sample = upsample_block( hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples, upsample_size=upsample_size ) + + # TODO: remove this block + if control is not None: + assert len(control) == 0, f"must consume all control array ({len(control)})" + # 6. post-process if self.conv_norm_out: sample = self.conv_norm_out(sample) diff --git a/tests/models/test_models_controlnet.py b/tests/models/test_models_controlnet.py new file mode 100644 index 000000000000..2da25fcb379a --- /dev/null +++ b/tests/models/test_models_controlnet.py @@ -0,0 +1,84 @@ +import torch + +from diffusers import UNet2DConditionModel + + +################################################################################ +# PoC version +################################################################################ + + +# config from ControlNet_SD1.5 +unet_config = { + "sample_size": 64, + "in_channels": 4, + "out_channels": 4, + "down_block_types": ("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D"), + "up_block_types": ("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"), + "block_out_channels": (320, 640, 1280, 1280), + "layers_per_block": 2, + "cross_attention_dim": 768, + "attention_head_dim": 8, + "use_linear_projection": False, + "upcast_attention": False, +} + +ctrlnet_config = { + "sample_size": 64, + "in_channels": 4, + "down_block_types": ("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D"), + "block_out_channels": (320, 640, 1280, 1280), + "layers_per_block": 2, + "cross_attention_dim": 768, + "attention_head_dim": 8, + "use_linear_projection": False, + "controlnet_hint_channels": 3, + "upcast_attention": False, +} + + +# @pytest.mark.skip +def test_unet_inference(): + sample = torch.randn((1, 4, 64, 64)).cuda() + timestep = 0 + encoder_hidden_states = torch.randn((1, 77, 768)).cuda() + model = UNet2DConditionModel(**unet_config).cuda() + model.eval() + with torch.no_grad(): + out = model(sample=sample, timestep=timestep, 
encoder_hidden_states=encoder_hidden_states) + assert out.sample.shape == (1, 4, 64, 64) + print(out.sample) + + +def controlnet_inference(): + sample = torch.randn((1, 4, 64, 64)).cuda() + hint = torch.randn((1, 3, 512, 512)).cuda() + timestep = 0 + encoder_hidden_states = torch.randn((1, 77, 768)).cuda() + model = UNet2DConditionModel(**ctrlnet_config).cuda() + model.eval() + with torch.no_grad(): + outputs = model( + sample=sample, controlnet_hint=hint, timestep=timestep, encoder_hidden_states=encoder_hidden_states + ) + return outputs + + +# @pytest.mark.skip +def test_controlnet_inference(): + outputs = controlnet_inference() + assert len(outputs) == 12 + 1 # 12 layer down and one middle + print(outputs) + + +def test_controlled_unet_inference(): + sample = torch.randn((1, 4, 64, 64)).cuda() + control = controlnet_inference() + timestep = 0 + encoder_hidden_states = torch.randn((1, 77, 768)).cuda() + model = UNet2DConditionModel(**unet_config).cuda() + model.eval() + with torch.no_grad(): + out = model(sample=sample, control=control, timestep=timestep, encoder_hidden_states=encoder_hidden_states) + assert out.sample.shape == (1, 4, 64, 64) + print(out.sample) From a838366de264a49ea22d9a46fee3eefbc90445a5 Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Fri, 17 Feb 2023 00:59:29 +0900 Subject: [PATCH 016/122] make style --- src/diffusers/models/unet_2d_condition.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py index 41b3bcf2fc62..b4917872f923 100644 --- a/src/diffusers/models/unet_2d_condition.py +++ b/src/diffusers/models/unet_2d_condition.py @@ -583,11 +583,11 @@ def forward( return self.controlnet_zero_conv_block( down_block_res_samples=down_block_res_samples, mid_block_sample=sample ) - + if control is not None: # ControlledUnet: apply mid_zero_conv output sample += control.pop() - + # 5. up for i, upsample_block in enumerate(self.up_blocks): is_final_block = i == len(self.up_blocks) - 1 @@ -624,7 +624,7 @@ def forward( # TODO: remove this block if control is not None: assert len(control) == 0, f"must consume all control array ({len(control)})" - + # 6. 
post-process if self.conv_norm_out: sample = self.conv_norm_out(sample) From a296de9c155c8f2e94628809833967329da2c9a0 Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Fri, 17 Feb 2023 01:15:34 +0900 Subject: [PATCH 017/122] Add support model loading for minimized ver --- src/diffusers/models/unet_2d_condition.py | 22 +++++++------- .../stable_diffusion/convert_from_ckpt.py | 29 +++++++++---------- 2 files changed, 25 insertions(+), 26 deletions(-) diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py index b4917872f923..09ba478ba6c9 100644 --- a/src/diffusers/models/unet_2d_condition.py +++ b/src/diffusers/models/unet_2d_condition.py @@ -268,6 +268,18 @@ def __init__( # count how many layers upsample the images self.num_upsamplers = 0 + if controlnet_hint_channels is not None: + # ControlNet: add input_hint_block, zero_conv_block + self.controlnet_input_hint_block = ControlNetInputHintBlock( + hint_channels=controlnet_hint_channels, channels=block_out_channels[0] + ) + self.controlnet_zero_conv_block = ControlNetZeroConvBlock( + block_out_channels=block_out_channels, + down_block_types=down_block_types, + layers_per_block=layers_per_block, + ) + return # Modules from the following lines are not defined in ControlNet + # up reversed_block_out_channels = list(reversed(block_out_channels)) reversed_attention_head_dim = list(reversed(attention_head_dim)) @@ -325,16 +337,6 @@ def __init__( block_out_channels[0], out_channels, kernel_size=conv_out_kernel, padding=conv_out_padding ) - if controlnet_hint_channels is not None: - self.controlnet_input_hint_block = ControlNetInputHintBlock( - hint_channels=controlnet_hint_channels, channels=block_out_channels[0] - ) - self.controlnet_zero_conv_block = ControlNetZeroConvBlock( - block_out_channels=block_out_channels, - down_block_types=down_block_types, - layers_per_block=layers_per_block, - ) - @property def attn_processors(self) -> Dict[str, AttnProcessor]: r""" diff --git a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py index 82a64ffa9694..8eb1316be83d 100644 --- a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +++ b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py @@ -25,8 +25,6 @@ from diffusers import ( AutoencoderKL, - ControlledUNet2DConditionModel, - ControlNetModel, DDIMScheduler, DPMSolverMultistepScheduler, EulerAncestralDiscreteScheduler, @@ -304,7 +302,7 @@ def create_controlnet_diffusers_config(original_config, image_size: int): cross_attention_dim=controlnet_params.context_dim, attention_head_dim=head_dim, use_linear_projection=use_linear_projection, - hint_channels=controlnet_params.hint_channels, + controlnet_hint_channels=controlnet_params.hint_channels, ) return config @@ -612,27 +610,26 @@ def convert_controlnet_checkpoint(checkpoint, config): attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config ) - # ControlNet Specific Weight & Biases + # ControlNet Specific Weights & Biases # input_hint_block for i in range(8): - key = f"input_hint_block.{i*2}." - new_checkpoint[key + "weight"] = unet_state_dict.pop(key + "weight") - new_checkpoint[key + "bias"] = unet_state_dict.pop(key + "bias") + key_dst = f"controlnet_input_hint_block.input_hint_block.{i*2}." + key_src = f"input_hint_block.{i*2}." 
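+        # e.g. the source key `input_hint_block.0.weight` is stored as
+        # `controlnet_input_hint_block.input_hint_block.0.weight` in the diffusers module;
+        # only the even Sequential indices 0, 2, ..., 14 are convolutions with parameters.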
+ new_checkpoint[key_dst + "weight"] = unet_state_dict.pop(key_src + "weight") + new_checkpoint[key_dst + "bias"] = unet_state_dict.pop(key_src + "bias") # zero_convs - new_checkpoint["input_zero_conv.weight"] = unet_state_dict.pop("zero_convs.0.0.weight") - new_checkpoint["input_zero_conv.bias"] = unet_state_dict.pop("zero_convs.0.0.bias") + new_checkpoint["controlnet_zero_conv_block.input_zero_conv.weight"] = unet_state_dict.pop("zero_convs.0.0.weight") + new_checkpoint["controlnet_zero_conv_block.input_zero_conv.bias"] = unet_state_dict.pop("zero_convs.0.0.bias") for i in range(1, num_input_blocks): - block_id = (i - 1) // (config["layers_per_block"] + 1) - layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1) - key_dst = f"down_blocks.{block_id}.zero_convs.{layer_in_block_id}." + key_dst = f"controlnet_zero_conv_block.zero_convs.{i-1}." key_src = f"zero_convs.{i}.0." new_checkpoint[key_dst + "weight"] = unet_state_dict.pop(key_src + "weight") new_checkpoint[key_dst + "bias"] = unet_state_dict.pop(key_src + "bias") - # middle block out - key_dst = "middle_block_out." + # mid_zero_conv + key_dst = "controlnet_zero_conv_block.mid_zero_conv." key_src = "middle_block_out.0." new_checkpoint[key_dst + "weight"] = unet_state_dict.pop(key_src + "weight") new_checkpoint[key_dst + "bias"] = unet_state_dict.pop(key_src + "bias") @@ -1358,7 +1355,7 @@ def load_pipeline_from_control_net_ckpt( # Convert the ControlledUNet2DConditionModel model. unet_config = create_unet_diffusers_config(original_config, image_size=image_size) unet_config["upcast_attention"] = upcast_attention - unet = ControlledUNet2DConditionModel(**unet_config) + unet = UNet2DConditionModel(**unet_config) converted_unet_checkpoint = convert_ldm_unet_checkpoint( checkpoint, unet_config, path=checkpoint_path, extract_ema=extract_ema @@ -1369,7 +1366,7 @@ def load_pipeline_from_control_net_ckpt( # Convert the ControlNetModel model. 
ctrlnet_config = create_controlnet_diffusers_config(original_config, image_size=image_size) ctrlnet_config["upcast_attention"] = upcast_attention - controlnet = ControlNetModel(**ctrlnet_config) + controlnet = UNet2DConditionModel(**ctrlnet_config) converted_ctrl_checkpoint = convert_controlnet_checkpoint(checkpoint, unet_config) From bd51c6df6ab521b255dc6ba3d7f54ff919348fa1 Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Fri, 17 Feb 2023 01:22:03 +0900 Subject: [PATCH 018/122] Remove all previous version files --- src/diffusers/__init__.py | 2 - src/diffusers/models/__init__.py | 1 - .../models/unet_2d_blocks_controlnet.py | 315 --------- .../models/unet_2d_condition_controlnet.py | 639 ------------------ .../pipeline_stable_diffusion_controlnet.py | 6 +- ...est_models_unet_2d_condition_controlnet.py | 82 --- 6 files changed, 3 insertions(+), 1042 deletions(-) delete mode 100644 src/diffusers/models/unet_2d_blocks_controlnet.py delete mode 100644 src/diffusers/models/unet_2d_condition_controlnet.py delete mode 100644 tests/models/test_models_unet_2d_condition_controlnet.py diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 5dca2a5d0032..cdb45bb74ccf 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -34,8 +34,6 @@ else: from .models import ( AutoencoderKL, - ControlledUNet2DConditionModel, - ControlNetModel, ModelMixin, PriorTransformer, Transformer2DModel, diff --git a/src/diffusers/models/__init__.py b/src/diffusers/models/__init__.py index f1071c464eb4..9917f4b24f51 100644 --- a/src/diffusers/models/__init__.py +++ b/src/diffusers/models/__init__.py @@ -25,7 +25,6 @@ from .unet_1d import UNet1DModel from .unet_2d import UNet2DModel from .unet_2d_condition import UNet2DConditionModel - from .unet_2d_condition_controlnet import ControlledUNet2DConditionModel, ControlNetModel from .vq_model import VQModel if is_flax_available(): diff --git a/src/diffusers/models/unet_2d_blocks_controlnet.py b/src/diffusers/models/unet_2d_blocks_controlnet.py deleted file mode 100644 index 605c9939ff10..000000000000 --- a/src/diffusers/models/unet_2d_blocks_controlnet.py +++ /dev/null @@ -1,315 +0,0 @@ -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import torch -import torch.nn as nn - -from .unet_2d_blocks import ( - Downsample2D, - DualTransformer2DModel, - ResnetBlock2D, - Transformer2DModel, -) - - -def set_zero_parameters(module): - for p in module.parameters(): - p.detach().zero_() - return module - - -# ControlNet: Zero Convolution -def zero_conv(channels): - return set_zero_parameters(nn.Conv2d(channels, channels, 1, padding=0)) - - -def get_down_block_with_zero_conv( - down_block_type, - num_layers, - in_channels, - out_channels, - temb_channels, - add_downsample, - resnet_eps, - resnet_act_fn, - attn_num_head_channels, - resnet_groups=None, - cross_attention_dim=None, - downsample_padding=None, - dual_cross_attention=False, - use_linear_projection=False, - only_cross_attention=False, - upcast_attention=False, - resnet_time_scale_shift="default", -): - down_block_type = down_block_type[7:] if down_block_type.startswith("UNetRes") else down_block_type - if down_block_type == "DownBlock2D": - return DownBlock2DWithZeroConv( - num_layers=num_layers, - in_channels=in_channels, - out_channels=out_channels, - temb_channels=temb_channels, - add_downsample=add_downsample, - resnet_eps=resnet_eps, - resnet_act_fn=resnet_act_fn, - resnet_groups=resnet_groups, - downsample_padding=downsample_padding, - resnet_time_scale_shift=resnet_time_scale_shift, - ) - elif down_block_type == "CrossAttnDownBlock2D": - if cross_attention_dim is None: - raise ValueError("cross_attention_dim must be specified for CrossAttnDownBlock2D") - return CrossAttnDownBlock2DWithZeroConv( - num_layers=num_layers, - in_channels=in_channels, - out_channels=out_channels, - temb_channels=temb_channels, - add_downsample=add_downsample, - resnet_eps=resnet_eps, - resnet_act_fn=resnet_act_fn, - resnet_groups=resnet_groups, - downsample_padding=downsample_padding, - cross_attention_dim=cross_attention_dim, - attn_num_head_channels=attn_num_head_channels, - dual_cross_attention=dual_cross_attention, - use_linear_projection=use_linear_projection, - only_cross_attention=only_cross_attention, - upcast_attention=upcast_attention, - resnet_time_scale_shift=resnet_time_scale_shift, - ) - raise NotImplementedError(f"{down_block_type} does not exist.") - - -class CrossAttnDownBlock2DWithZeroConv(nn.Module): - def __init__( - self, - in_channels: int, - out_channels: int, - temb_channels: int, - dropout: float = 0.0, - num_layers: int = 1, - resnet_eps: float = 1e-6, - resnet_time_scale_shift: str = "default", - resnet_act_fn: str = "swish", - resnet_groups: int = 32, - resnet_pre_norm: bool = True, - attn_num_head_channels=1, - cross_attention_dim=1280, - output_scale_factor=1.0, - downsample_padding=1, - add_downsample=True, - dual_cross_attention=False, - use_linear_projection=False, - only_cross_attention=False, - upcast_attention=False, - ): - super().__init__() - resnets = [] - attentions = [] - zero_convs = [] - - self.has_cross_attention = True - self.attn_num_head_channels = attn_num_head_channels - - for i in range(num_layers): - in_channels = in_channels if i == 0 else out_channels - resnets.append( - ResnetBlock2D( - in_channels=in_channels, - out_channels=out_channels, - temb_channels=temb_channels, - eps=resnet_eps, - groups=resnet_groups, - dropout=dropout, - time_embedding_norm=resnet_time_scale_shift, - non_linearity=resnet_act_fn, - output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, - ) - ) - zero_convs.append(zero_conv(out_channels)) - if not dual_cross_attention: - attentions.append( - Transformer2DModel( - attn_num_head_channels, - 
out_channels // attn_num_head_channels, - in_channels=out_channels, - num_layers=1, - cross_attention_dim=cross_attention_dim, - norm_num_groups=resnet_groups, - use_linear_projection=use_linear_projection, - only_cross_attention=only_cross_attention, - upcast_attention=upcast_attention, - ) - ) - else: - attentions.append( - DualTransformer2DModel( - attn_num_head_channels, - out_channels // attn_num_head_channels, - in_channels=out_channels, - num_layers=1, - cross_attention_dim=cross_attention_dim, - norm_num_groups=resnet_groups, - ) - ) - self.attentions = nn.ModuleList(attentions) - self.resnets = nn.ModuleList(resnets) - - if add_downsample: - self.downsamplers = nn.ModuleList( - [ - Downsample2D( - out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op" - ) - ] - ) - zero_convs.append(zero_conv(out_channels)) - else: - self.downsamplers = None - - self.zero_convs = nn.ModuleList(zero_convs) - self.gradient_checkpointing = False - - def forward( - self, hidden_states, temb=None, encoder_hidden_states=None, attention_mask=None, cross_attention_kwargs=None - ): - # TODO(Patrick, William) - attention mask is not used - output_states = () - zero_conved_states = () - - for resnet, attn, zero_conv in zip(self.resnets, self.attentions, self.zero_convs): - if self.training and self.gradient_checkpointing: - - def create_custom_forward(module, return_dict=None): - def custom_forward(*inputs): - if return_dict is not None: - return module(*inputs, return_dict=return_dict) - else: - return module(*inputs) - - return custom_forward - - hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb) - hidden_states = torch.utils.checkpoint.checkpoint( - create_custom_forward(attn, return_dict=False), - hidden_states, - encoder_hidden_states, - cross_attention_kwargs, - )[0] - else: - hidden_states = resnet(hidden_states, temb) - hidden_states = attn( - hidden_states, - encoder_hidden_states=encoder_hidden_states, - cross_attention_kwargs=cross_attention_kwargs, - ).sample - - output_states += (hidden_states,) - zero_conved_states += (zero_conv(hidden_states),) - - if self.downsamplers is not None: - for downsampler in self.downsamplers: - hidden_states = downsampler(hidden_states) - - output_states += (hidden_states,) - zero_conved_states += (self.zero_convs[-1](hidden_states),) - - return hidden_states, output_states, zero_conved_states - - -class DownBlock2DWithZeroConv(nn.Module): - def __init__( - self, - in_channels: int, - out_channels: int, - temb_channels: int, - dropout: float = 0.0, - num_layers: int = 1, - resnet_eps: float = 1e-6, - resnet_time_scale_shift: str = "default", - resnet_act_fn: str = "swish", - resnet_groups: int = 32, - resnet_pre_norm: bool = True, - output_scale_factor=1.0, - add_downsample=True, - downsample_padding=1, - ): - super().__init__() - resnets = [] - zero_convs = [] - - for i in range(num_layers): - in_channels = in_channels if i == 0 else out_channels - resnets.append( - ResnetBlock2D( - in_channels=in_channels, - out_channels=out_channels, - temb_channels=temb_channels, - eps=resnet_eps, - groups=resnet_groups, - dropout=dropout, - time_embedding_norm=resnet_time_scale_shift, - non_linearity=resnet_act_fn, - output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, - ) - ) - zero_convs.append(zero_conv(out_channels)) - - self.resnets = nn.ModuleList(resnets) - - if add_downsample: - self.downsamplers = nn.ModuleList( - [ - Downsample2D( - out_channels, use_conv=True, 
out_channels=out_channels, padding=downsample_padding, name="op" - ) - ] - ) - zero_convs.append(zero_conv(out_channels)) - else: - self.downsamplers = None - - self.zero_convs = nn.ModuleList(zero_convs) - self.gradient_checkpointing = False - - def forward(self, hidden_states, temb=None): - output_states = () - zero_conved_states = () - - for resnet, zero_conv in zip(self.resnets, self.zero_convs): - if self.training and self.gradient_checkpointing: - - def create_custom_forward(module): - def custom_forward(*inputs): - return module(*inputs) - - return custom_forward - - hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb) - else: - hidden_states = resnet(hidden_states, temb) - - output_states += (hidden_states,) - zero_conved_states += (zero_conv(hidden_states),) - - if self.downsamplers is not None: - for downsampler in self.downsamplers: - hidden_states = downsampler(hidden_states) - - output_states += (hidden_states,) - zero_conved_states += (self.zero_convs[-1](hidden_states),) - - return hidden_states, output_states, zero_conved_states diff --git a/src/diffusers/models/unet_2d_condition_controlnet.py b/src/diffusers/models/unet_2d_condition_controlnet.py deleted file mode 100644 index 99d990e7c281..000000000000 --- a/src/diffusers/models/unet_2d_condition_controlnet.py +++ /dev/null @@ -1,639 +0,0 @@ -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Any, Dict, List, Optional, Tuple, Union - -import torch -import torch.nn as nn - -from ..configuration_utils import ConfigMixin, register_to_config -from ..loaders import UNet2DConditionLoadersMixin -from ..utils import logging -from . 
import UNet2DConditionModel -from .cross_attention import AttnProcessor -from .embeddings import GaussianFourierProjection, TimestepEmbedding, Timesteps -from .modeling_utils import ModelMixin -from .unet_2d_blocks import ( - CrossAttnDownBlock2D, - CrossAttnUpBlock2D, - DownBlock2D, - UNetMidBlock2DCrossAttn, - UNetMidBlock2DSimpleCrossAttn, - UpBlock2D, -) -from .unet_2d_blocks_controlnet import ( - get_down_block_with_zero_conv, - set_zero_parameters, - zero_conv, -) -from .unet_2d_condition import UNet2DConditionOutput - - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - - -class ControlledUNet2DConditionModel(UNet2DConditionModel): - def forward( - self, - sample: torch.FloatTensor, - control: List[torch.Tensor], - timestep: Union[torch.Tensor, float, int], - encoder_hidden_states: torch.Tensor, - class_labels: Optional[torch.Tensor] = None, - timestep_cond: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - return_dict: bool = True, - ) -> Union[UNet2DConditionOutput, Tuple]: - # TODO: Fix Docstrings - # TODO: add only_mid_control option - - r""" - Args: - sample (`torch.FloatTensor`): (batch, channel, height, width) noisy inputs tensor - timestep (`torch.FloatTensor` or `float` or `int`): (batch) timesteps - encoder_hidden_states (`torch.FloatTensor`): (batch, sequence_length, feature_dim) encoder hidden states - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple. - cross_attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the `AttnProcessor` as defined under - `self.processor` in - [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). - - Returns: - [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`: - [`~models.unet_2d_condition.UNet2DConditionOutput`] if `return_dict` is True, otherwise a `tuple`. When - returning a tuple, the first element is the sample tensor. - """ - # By default samples have to be AT least a multiple of the overall upsampling factor. - # The overall upsampling factor is equal to 2 ** (# num of upsampling layears). - # However, the upsampling interpolation output size can be forced to fit any upsampling size - # on the fly if necessary. - default_overall_up_factor = 2**self.num_upsamplers - - # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor` - forward_upsample_size = False - upsample_size = None - - if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]): - logger.info("Forward upsample size to force interpolation output size.") - forward_upsample_size = True - - # prepare attention_mask - if attention_mask is not None: - attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0 - attention_mask = attention_mask.unsqueeze(1) - - # 0. center input if necessary - if self.config.center_input_sample: - sample = 2 * sample - 1.0 - - # Frozen weight & biases from 1 to 4 for training - with torch.no_grad(): - # 1. time - timesteps = timestep - if not torch.is_tensor(timesteps): - # TODO: this requires sync between CPU and GPU. 
So try to pass timesteps as tensors if you can - # This would be a good case for the `match` statement (Python 3.10+) - is_mps = sample.device.type == "mps" - if isinstance(timestep, float): - dtype = torch.float32 if is_mps else torch.float64 - else: - dtype = torch.int32 if is_mps else torch.int64 - timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device) - elif len(timesteps.shape) == 0: - timesteps = timesteps[None].to(sample.device) - - # broadcast to batch dimension in a way that's compatible with ONNX/Core ML - timesteps = timesteps.expand(sample.shape[0]) - - t_emb = self.time_proj(timesteps) - - # timesteps does not contain any weights and will always return f32 tensors - # but time_embedding might actually be running in fp16. so we need to cast here. - # there might be better ways to encapsulate this. - t_emb = t_emb.to(dtype=self.dtype) - - emb = self.time_embedding(t_emb, timestep_cond) - - if self.class_embedding is not None: - if class_labels is None: - raise ValueError("class_labels should be provided when num_class_embeds > 0") - - if self.config.class_embed_type == "timestep": - class_labels = self.time_proj(class_labels) - - class_emb = self.class_embedding(class_labels).to(dtype=self.dtype) - emb = emb + class_emb - - # 2. pre-process - sample = self.conv_in(sample) - - # 3. down - down_block_res_samples = (sample,) - for downsample_block in self.down_blocks: - if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention: - sample, res_samples = downsample_block( - hidden_states=sample, - temb=emb, - encoder_hidden_states=encoder_hidden_states, - attention_mask=attention_mask, - cross_attention_kwargs=cross_attention_kwargs, - ) - else: - sample, res_samples = downsample_block(hidden_states=sample, temb=emb) - - down_block_res_samples += res_samples - - # 4. mid - if self.mid_block is not None: - sample = self.mid_block( - sample, - emb, - encoder_hidden_states=encoder_hidden_states, - attention_mask=attention_mask, - cross_attention_kwargs=cross_attention_kwargs, - ) - - # 5. apply middle_block_out output - sample += control.pop() - - # 6. up - for i, upsample_block in enumerate(self.up_blocks): - is_final_block = i == len(self.up_blocks) - 1 - - res_samples = down_block_res_samples[-len(upsample_block.resnets) :] - down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)] - - # apply controlnet downblock output - control_samples = control[-len(upsample_block.resnets) :] - control = control[: -len(upsample_block.resnets)] - res_samples = [r + c for r, c in zip(res_samples, control_samples)] - - # if we have not reached the final block and need to forward the - # upsample size, we do it here - if not is_final_block and forward_upsample_size: - upsample_size = down_block_res_samples[-1].shape[2:] - - if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention: - sample = upsample_block( - hidden_states=sample, - temb=emb, - res_hidden_states_tuple=res_samples, - encoder_hidden_states=encoder_hidden_states, - cross_attention_kwargs=cross_attention_kwargs, - upsample_size=upsample_size, - attention_mask=attention_mask, - ) - else: - sample = upsample_block( - hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples, upsample_size=upsample_size - ) - - assert len(control) == 0, f"must consume all control array ({len(control)})" - - # 7. 
post-process - if self.conv_norm_out: - sample = self.conv_norm_out(sample) - sample = self.conv_act(sample) - sample = self.conv_out(sample) - - if not return_dict: - return (sample,) - - return UNet2DConditionOutput(sample=sample) - - -class ControlNetModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin): - _supports_gradient_checkpointing = True - - @register_to_config - def __init__( - self, - sample_size: Optional[int] = None, - in_channels: int = 4, - center_input_sample: bool = False, - flip_sin_to_cos: bool = True, - freq_shift: int = 0, - down_block_types: Tuple[str] = ( - "CrossAttnDownBlock2D", - "CrossAttnDownBlock2D", - "CrossAttnDownBlock2D", - "DownBlock2D", - ), - mid_block_type: Optional[str] = "UNetMidBlock2DCrossAttn", - only_cross_attention: Union[bool, Tuple[bool]] = False, - block_out_channels: Tuple[int] = (320, 640, 1280, 1280), - layers_per_block: int = 2, - downsample_padding: int = 1, - mid_block_scale_factor: float = 1, - act_fn: str = "silu", - norm_num_groups: Optional[int] = 32, - norm_eps: float = 1e-5, - cross_attention_dim: int = 1280, - attention_head_dim: Union[int, Tuple[int]] = 8, - dual_cross_attention: bool = False, - use_linear_projection: bool = False, - class_embed_type: Optional[str] = None, - num_class_embeds: Optional[int] = None, - upcast_attention: bool = False, - resnet_time_scale_shift: str = "default", - time_embedding_type: str = "positional", # fourier, positional - timestep_post_act: Optional[str] = None, - time_cond_proj_dim: Optional[int] = None, - conv_in_kernel: int = 3, - hint_channels: int = 3, - ): - super().__init__() - - self.sample_size = sample_size - - # ControlNet specific blocks: Layer configurations are from reference implementation. - self.input_hint_block = nn.Sequential( - nn.Conv2d(hint_channels, 16, 3, padding=1), - nn.SiLU(), - nn.Conv2d(16, 16, 3, padding=1), - nn.SiLU(), - nn.Conv2d(16, 32, 3, padding=1, stride=2), - nn.SiLU(), - nn.Conv2d(32, 32, 3, padding=1), - nn.SiLU(), - nn.Conv2d(32, 96, 3, padding=1, stride=2), - nn.SiLU(), - nn.Conv2d(96, 96, 3, padding=1), - nn.SiLU(), - nn.Conv2d(96, 256, 3, padding=1, stride=2), - nn.SiLU(), - set_zero_parameters(nn.Conv2d(256, block_out_channels[0], 3, padding=1)), - ) - self.input_zero_conv = zero_conv(block_out_channels[0]) - - # input - conv_in_padding = (conv_in_kernel - 1) // 2 - self.conv_in = nn.Conv2d( - in_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding - ) - - # time - if time_embedding_type == "fourier": - time_embed_dim = block_out_channels[0] * 2 - if time_embed_dim % 2 != 0: - raise ValueError(f"`time_embed_dim` should be divisible by 2, but is {time_embed_dim}.") - self.time_proj = GaussianFourierProjection( - time_embed_dim // 2, set_W_to_weight=False, log=False, flip_sin_to_cos=flip_sin_to_cos - ) - timestep_input_dim = time_embed_dim - elif time_embedding_type == "positional": - time_embed_dim = block_out_channels[0] * 4 - - self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift) - timestep_input_dim = block_out_channels[0] - else: - raise ValueError( - f"{time_embedding_type} does not exist. Pleaes make sure to use one of `fourier` or `positional`." 
- ) - - self.time_embedding = TimestepEmbedding( - timestep_input_dim, - time_embed_dim, - act_fn=act_fn, - post_act_fn=timestep_post_act, - cond_proj_dim=time_cond_proj_dim, - ) - - # class embedding - if class_embed_type is None and num_class_embeds is not None: - self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim) - elif class_embed_type == "timestep": - self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim) - elif class_embed_type == "identity": - self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim) - else: - self.class_embedding = None - - self.down_blocks = nn.ModuleList([]) - self.up_blocks = nn.ModuleList([]) - - if isinstance(only_cross_attention, bool): - only_cross_attention = [only_cross_attention] * len(down_block_types) - - if isinstance(attention_head_dim, int): - attention_head_dim = (attention_head_dim,) * len(down_block_types) - - # down - output_channel = block_out_channels[0] - for i, down_block_type in enumerate(down_block_types): - input_channel = output_channel - output_channel = block_out_channels[i] - is_final_block = i == len(block_out_channels) - 1 - - down_block = get_down_block_with_zero_conv( - down_block_type, - num_layers=layers_per_block, - in_channels=input_channel, - out_channels=output_channel, - temb_channels=time_embed_dim, - add_downsample=not is_final_block, - resnet_eps=norm_eps, - resnet_act_fn=act_fn, - resnet_groups=norm_num_groups, - cross_attention_dim=cross_attention_dim, - attn_num_head_channels=attention_head_dim[i], - downsample_padding=downsample_padding, - dual_cross_attention=dual_cross_attention, - use_linear_projection=use_linear_projection, - only_cross_attention=only_cross_attention[i], - upcast_attention=upcast_attention, - resnet_time_scale_shift=resnet_time_scale_shift, - ) - self.down_blocks.append(down_block) - - # mid - if mid_block_type == "UNetMidBlock2DCrossAttn": - self.mid_block = UNetMidBlock2DCrossAttn( - in_channels=block_out_channels[-1], - temb_channels=time_embed_dim, - resnet_eps=norm_eps, - resnet_act_fn=act_fn, - output_scale_factor=mid_block_scale_factor, - resnet_time_scale_shift=resnet_time_scale_shift, - cross_attention_dim=cross_attention_dim, - attn_num_head_channels=attention_head_dim[-1], - resnet_groups=norm_num_groups, - dual_cross_attention=dual_cross_attention, - use_linear_projection=use_linear_projection, - upcast_attention=upcast_attention, - ) - elif mid_block_type == "UNetMidBlock2DSimpleCrossAttn": - self.mid_block = UNetMidBlock2DSimpleCrossAttn( - in_channels=block_out_channels[-1], - temb_channels=time_embed_dim, - resnet_eps=norm_eps, - resnet_act_fn=act_fn, - output_scale_factor=mid_block_scale_factor, - cross_attention_dim=cross_attention_dim, - attn_num_head_channels=attention_head_dim[-1], - resnet_groups=norm_num_groups, - resnet_time_scale_shift=resnet_time_scale_shift, - ) - elif mid_block_type is None: - self.mid_block = None - else: - raise ValueError(f"unknown mid_block_type : {mid_block_type}") - - # count how many layers upsample the images - self.num_upsamplers = 0 - - # ControlNet specific block - self.middle_block_out = zero_conv(block_out_channels[-1]) - - @property - def attn_processors(self) -> Dict[str, AttnProcessor]: - r""" - Returns: - `dict` of attention processors: A dictionary containing all attention processors used in the model with - indexed by its weight name. 
- """ - # set recursively - processors = {} - - def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttnProcessor]): - if hasattr(module, "set_processor"): - processors[f"{name}.processor"] = module.processor - - for sub_name, child in module.named_children(): - fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) - - return processors - - for name, module in self.named_children(): - fn_recursive_add_processors(name, module, processors) - - return processors - - def set_attn_processor(self, processor: Union[AttnProcessor, Dict[str, AttnProcessor]]): - r""" - Parameters: - `processor (`dict` of `AttnProcessor` or `AttnProcessor`): - The instantiated processor class or a dictionary of processor classes that will be set as the processor - of **all** `CrossAttention` layers. - In case `processor` is a dict, the key needs to define the path to the corresponding cross attention processor. This is strongly recommended when setting trainablae attention processors.: - - """ - count = len(self.attn_processors.keys()) - - if isinstance(processor, dict) and len(processor) != count: - raise ValueError( - f"A dict of processors was passed, but the number of processors {len(processor)} does not match the" - f" number of attention layers: {count}. Please make sure to pass {count} processor classes." - ) - - def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor): - if hasattr(module, "set_processor"): - if not isinstance(processor, dict): - module.set_processor(processor) - else: - module.set_processor(processor.pop(f"{name}.processor")) - - for sub_name, child in module.named_children(): - fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) - - for name, module in self.named_children(): - fn_recursive_attn_processor(name, module, processor) - - def set_attention_slice(self, slice_size): - r""" - Enable sliced attention computation. - - When this option is enabled, the attention module will split the input tensor in slices, to compute attention - in several steps. This is useful to save some memory in exchange for a small speed decrease. - - Args: - slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`): - When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If - `"max"`, maxium amount of memory will be saved by running only one slice at a time. If a number is - provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim` - must be a multiple of `slice_size`. 
- """ - sliceable_head_dims = [] - - def fn_recursive_retrieve_slicable_dims(module: torch.nn.Module): - if hasattr(module, "set_attention_slice"): - sliceable_head_dims.append(module.sliceable_head_dim) - - for child in module.children(): - fn_recursive_retrieve_slicable_dims(child) - - # retrieve number of attention layers - for module in self.children(): - fn_recursive_retrieve_slicable_dims(module) - - num_slicable_layers = len(sliceable_head_dims) - - if slice_size == "auto": - # half the attention head size is usually a good trade-off between - # speed and memory - slice_size = [dim // 2 for dim in sliceable_head_dims] - elif slice_size == "max": - # make smallest slice possible - slice_size = num_slicable_layers * [1] - - slice_size = num_slicable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size - - if len(slice_size) != len(sliceable_head_dims): - raise ValueError( - f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different" - f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}." - ) - - for i in range(len(slice_size)): - size = slice_size[i] - dim = sliceable_head_dims[i] - if size is not None and size > dim: - raise ValueError(f"size {size} has to be smaller or equal to {dim}.") - - # Recursively walk through all the children. - # Any children which exposes the set_attention_slice method - # gets the message - def fn_recursive_set_attention_slice(module: torch.nn.Module, slice_size: List[int]): - if hasattr(module, "set_attention_slice"): - module.set_attention_slice(slice_size.pop()) - - for child in module.children(): - fn_recursive_set_attention_slice(child, slice_size) - - reversed_slice_size = list(reversed(slice_size)) - for module in self.children(): - fn_recursive_set_attention_slice(module, reversed_slice_size) - - def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, (CrossAttnDownBlock2D, DownBlock2D, CrossAttnUpBlock2D, UpBlock2D)): - module.gradient_checkpointing = value - - def forward( - self, - sample: torch.FloatTensor, - hint: torch.Tensor, - timestep: Union[torch.Tensor, float, int], - encoder_hidden_states: torch.Tensor, - class_labels: Optional[torch.Tensor] = None, - timestep_cond: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - ) -> Union[UNet2DConditionOutput, Tuple]: - # TODO: fix docstring - - r""" - Args: - sample (`torch.FloatTensor`): (batch, channel, height, width) noisy inputs tensor - timestep (`torch.FloatTensor` or `float` or `int`): (batch) timesteps - encoder_hidden_states (`torch.FloatTensor`): (batch, sequence_length, feature_dim) encoder hidden states - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple. - cross_attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the `AttnProcessor` as defined under - `self.processor` in - [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). - - Returns: - [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`: - [`~models.unet_2d_condition.UNet2DConditionOutput`] if `return_dict` is True, otherwise a `tuple`. When - returning a tuple, the first element is the sample tensor. 
- """ - - # prepare output - outputs = [] - - # prepare attention_mask - if attention_mask is not None: - attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0 - attention_mask = attention_mask.unsqueeze(1) - - # 0. center input if necessary - if self.config.center_input_sample: - sample = 2 * sample - 1.0 - - # 1. time - timesteps = timestep - if not torch.is_tensor(timesteps): - # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can - # This would be a good case for the `match` statement (Python 3.10+) - is_mps = sample.device.type == "mps" - if isinstance(timestep, float): - dtype = torch.float32 if is_mps else torch.float64 - else: - dtype = torch.int32 if is_mps else torch.int64 - timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device) - elif len(timesteps.shape) == 0: - timesteps = timesteps[None].to(sample.device) - - # broadcast to batch dimension in a way that's compatible with ONNX/Core ML - timesteps = timesteps.expand(sample.shape[0]) - - t_emb = self.time_proj(timesteps) - - # timesteps does not contain any weights and will always return f32 tensors - # but time_embedding might actually be running in fp16. so we need to cast here. - # there might be better ways to encapsulate this. - t_emb = t_emb.to(dtype=self.dtype) - - emb = self.time_embedding(t_emb, timestep_cond) - - if self.class_embedding is not None: - if class_labels is None: - raise ValueError("class_labels should be provided when num_class_embeds > 0") - - if self.config.class_embed_type == "timestep": - class_labels = self.time_proj(class_labels) - - class_emb = self.class_embedding(class_labels).to(dtype=self.dtype) - emb = emb + class_emb - - # 2. input_hint (ControlNet specific) - guided_hint = self.input_hint_block(hint) - - # 3. pre-process - sample = self.conv_in(sample) - sample += guided_hint - outputs.append(self.input_zero_conv(sample)) - - # 4. down - down_block_res_samples = (sample,) - for downsample_block in self.down_blocks: - if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention: - sample, res_samples, zero_conved_samples = downsample_block( - hidden_states=sample, - temb=emb, - encoder_hidden_states=encoder_hidden_states, - attention_mask=attention_mask, - cross_attention_kwargs=cross_attention_kwargs, - ) - else: - sample, res_samples, zero_conved_samples = downsample_block(hidden_states=sample, temb=emb) - - down_block_res_samples += res_samples - outputs += zero_conved_samples - - # 5. 
mid - assert self.mid_block is not None - sample = self.mid_block( - sample, - emb, - encoder_hidden_states=encoder_hidden_states, - attention_mask=attention_mask, - cross_attention_kwargs=cross_attention_kwargs, - ) - outputs.append(self.middle_block_out(sample)) - - return outputs diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py index a04fc5c5c63f..9a1f6264139b 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py @@ -15,7 +15,7 @@ from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer -from ...models import AutoencoderKL, ControlledUNet2DConditionModel, ControlNetModel +from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers from ...utils import logging from ..pipeline_utils import DiffusionPipeline @@ -31,8 +31,8 @@ def __init__( vae: AutoencoderKL, text_encoder: CLIPTextModel, tokenizer: CLIPTokenizer, - unet: ControlledUNet2DConditionModel, - controlnet: ControlNetModel, + unet: UNet2DConditionModel, + controlnet: UNet2DConditionModel, scheduler: KarrasDiffusionSchedulers, safety_checker: StableDiffusionSafetyChecker, feature_extractor: CLIPFeatureExtractor, diff --git a/tests/models/test_models_unet_2d_condition_controlnet.py b/tests/models/test_models_unet_2d_condition_controlnet.py deleted file mode 100644 index d3ec06ccc6b3..000000000000 --- a/tests/models/test_models_unet_2d_condition_controlnet.py +++ /dev/null @@ -1,82 +0,0 @@ -import pytest -import torch - -from diffusers import ControlledUNet2DConditionModel, ControlNetModel, UNet2DConditionModel - - -# config from ControlNet_SD1.5 -unet_config = { - "sample_size": 64, - "in_channels": 4, - "out_channels": 4, - "down_block_types": ("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D"), - "up_block_types": ("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"), - "block_out_channels": (320, 640, 1280, 1280), - "layers_per_block": 2, - "cross_attention_dim": 768, - "attention_head_dim": 8, - "use_linear_projection": False, - "upcast_attention": False, -} - -ctrlnet_config = { - "sample_size": 64, - "in_channels": 4, - "down_block_types": ("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D"), - "block_out_channels": (320, 640, 1280, 1280), - "layers_per_block": 2, - "cross_attention_dim": 768, - "attention_head_dim": 8, - "use_linear_projection": False, - "hint_channels": 3, - "upcast_attention": False, -} - -################################################################################ -# WIP -################################################################################ - - -@pytest.mark.skip -def test_unet_inference_without_exception(): - sample = torch.randn((1, 4, 64, 64)).cuda() - timestep = 0 - encoder_hidden_states = torch.randn((1, 77, 768)).cuda() - model = UNet2DConditionModel(**unet_config).cuda() - model.eval() - with torch.no_grad(): - out = model(sample=sample, timestep=timestep, encoder_hidden_states=encoder_hidden_states) - assert out.sample.shape == (1, 4, 64, 64) - print(out.sample) - - -def controlnet_inference(): - sample = torch.randn((1, 4, 64, 64)).cuda() - hint = torch.randn((1, 3, 512, 512)).cuda() - timestep = 0 - encoder_hidden_states = torch.randn((1, 77, 768)).cuda() - model = 
ControlNetModel(**ctrlnet_config).cuda() - model.eval() - with torch.no_grad(): - outputs = model(sample=sample, hint=hint, timestep=timestep, encoder_hidden_states=encoder_hidden_states) - return outputs - - -@pytest.mark.skip -def test_controlnet_inference(): - outputs = controlnet_inference() - assert len(outputs) == 12 + 1 # 12 layer down and one middle - print(outputs) - - -def test_controlled_unet_inference(): - sample = torch.randn((1, 4, 64, 64)).cuda() - control = controlnet_inference() - timestep = 0 - encoder_hidden_states = torch.randn((1, 77, 768)).cuda() - model = ControlledUNet2DConditionModel(**unet_config).cuda() - model.eval() - with torch.no_grad(): - out = model(sample=sample, control=control, timestep=timestep, encoder_hidden_states=encoder_hidden_states) - assert out.sample.shape == (1, 4, 64, 64) - print(out.sample) From 76569253302bcdccc8ebb08ff5a69c041be8f040 Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Fri, 17 Feb 2023 02:33:38 +0900 Subject: [PATCH 019/122] from_pretrained and inference test passed --- tests/models/test_models_controlnet.py | 71 ++++++++++++++++++-------- 1 file changed, 50 insertions(+), 21 deletions(-) diff --git a/tests/models/test_models_controlnet.py b/tests/models/test_models_controlnet.py index 2da25fcb379a..8b5a0acd7adf 100644 --- a/tests/models/test_models_controlnet.py +++ b/tests/models/test_models_controlnet.py @@ -35,27 +35,21 @@ "controlnet_hint_channels": 3, "upcast_attention": False, } +model_id_sd15_canny = "takuma104/control_sd15_canny" # currntry this is private model -# @pytest.mark.skip -def test_unet_inference(): - sample = torch.randn((1, 4, 64, 64)).cuda() - timestep = 0 - encoder_hidden_states = torch.randn((1, 77, 768)).cuda() - model = UNet2DConditionModel(**unet_config).cuda() - model.eval() - with torch.no_grad(): - out = model(sample=sample, timestep=timestep, encoder_hidden_states=encoder_hidden_states) - assert out.sample.shape == (1, 4, 64, 64) - print(out.sample) +## utils ####################################################################### -def controlnet_inference(): +def controlnet_inference(model_id=None): sample = torch.randn((1, 4, 64, 64)).cuda() hint = torch.randn((1, 3, 512, 512)).cuda() timestep = 0 encoder_hidden_states = torch.randn((1, 77, 768)).cuda() - model = UNet2DConditionModel(**ctrlnet_config).cuda() + if model_id is None: + model = UNet2DConditionModel(**ctrlnet_config).cuda() + else: + model = UNet2DConditionModel.from_pretrained(model_id, subfolder="controlnet").cuda() model.eval() with torch.no_grad(): outputs = model( @@ -64,21 +58,56 @@ def controlnet_inference(): return outputs -# @pytest.mark.skip -def test_controlnet_inference(): - outputs = controlnet_inference() - assert len(outputs) == 12 + 1 # 12 layer down and one middle - print(outputs) +def controlled_unet_inference(control, model_id=None): + sample = torch.randn((1, 4, 64, 64)).cuda() + timestep = 0 + encoder_hidden_states = torch.randn((1, 77, 768)).cuda() + if model_id is None: + model = UNet2DConditionModel(**unet_config).cuda() + else: + model = UNet2DConditionModel.from_pretrained(model_id_sd15_canny, subfolder="unet").cuda() + model.eval() + with torch.no_grad(): + out = model(sample=sample, control=control, timestep=timestep, encoder_hidden_states=encoder_hidden_states) + return out -def test_controlled_unet_inference(): +## tests ####################################################################### + + +def test_unet_inference(): sample = torch.randn((1, 4, 64, 64)).cuda() - control = controlnet_inference() 
timestep = 0 encoder_hidden_states = torch.randn((1, 77, 768)).cuda() model = UNet2DConditionModel(**unet_config).cuda() model.eval() with torch.no_grad(): - out = model(sample=sample, control=control, timestep=timestep, encoder_hidden_states=encoder_hidden_states) + out = model(sample=sample, timestep=timestep, encoder_hidden_states=encoder_hidden_states) + assert out.sample.shape == (1, 4, 64, 64) + print(out.sample) + + +def test_controlnet_inference(): + outputs = controlnet_inference() + assert len(outputs) == 12 + 1 # 12 layer down and one middle + print(outputs) + + +def test_controlled_unet_inference(): + control = controlnet_inference() + out = controlled_unet_inference(control=control) + assert out.sample.shape == (1, 4, 64, 64) + print(out.sample) + + +def test_controlnet_from_pretrained_and_inference(): + outputs = controlnet_inference(model_id=model_id_sd15_canny) + assert len(outputs) == 12 + 1 # 12 layer down and one middle + print(outputs) + + +def test_controlled_unet_from_pretrained_and_inference(): + control = controlnet_inference() + out = controlled_unet_inference(control=control, model_id=model_id_sd15_canny) assert out.sample.shape == (1, 4, 64, 64) print(out.sample) From 839e009e17f6c6d9149f25461c1a270b40cc72d0 Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Fri, 17 Feb 2023 03:07:58 +0900 Subject: [PATCH 020/122] copied from pipeline_stable_diffusion.py except `__init__()` --- .../pipeline_stable_diffusion_controlnet.py | 496 +++++++++++++++++- .../test_stable_diffusion_control_net.py | 26 + 2 files changed, 521 insertions(+), 1 deletion(-) create mode 100644 tests/pipelines/stable_diffusion/test_stable_diffusion_control_net.py diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py index 9a1f6264139b..1449ba36717f 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py @@ -13,12 +13,17 @@ # limitations under the License. +import inspect +from typing import Any, Callable, Dict, List, Optional, Union + +import torch from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers -from ...utils import logging +from ...utils import is_accelerate_available, logging, randn_tensor from ..pipeline_utils import DiffusionPipeline +from . import StableDiffusionPipelineOutput from .safety_checker import StableDiffusionSafetyChecker @@ -52,3 +57,492 @@ def __init__( ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.register_to_config(requires_safety_checker=requires_safety_checker) + + def enable_vae_slicing(self): + r""" + Enable sliced VAE decoding. + + When this option is enabled, the VAE will split the input tensor in slices to compute decoding in several + steps. This is useful to save some memory and allow larger batch sizes. + """ + self.vae.enable_slicing() + + def disable_vae_slicing(self): + r""" + Disable sliced VAE decoding. If `enable_vae_slicing` was previously invoked, this method will go back to + computing decoding in one step. + """ + self.vae.disable_slicing() + + def enable_sequential_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, significantly reducing memory usage. 
When called, unet, + text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a + `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called. + """ + if is_accelerate_available(): + from accelerate import cpu_offload + else: + raise ImportError("Please install accelerate via `pip install accelerate`") + + device = torch.device(f"cuda:{gpu_id}") + + for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]: + cpu_offload(cpu_offloaded_model, device) + + if self.safety_checker is not None: + cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True) + + @property + def _execution_device(self): + r""" + Returns the device on which the pipeline's models will be executed. After calling + `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module + hooks. + """ + if self.device != torch.device("meta") or not hasattr(self.unet, "_hf_hook"): + return self.device + for module in self.unet.modules(): + if ( + hasattr(module, "_hf_hook") + and hasattr(module._hf_hook, "execution_device") + and module._hf_hook.execution_device is not None + ): + return torch.device(module._hf_hook.execution_device) + return self.device + + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. + Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. 
+ """ + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + prompt_embeds = self.text_encoder( + text_input_ids.to(device), + attention_mask=attention_mask, + ) + prompt_embeds = prompt_embeds[0] + + prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." 
+ ) + else: + uncond_tokens = negative_prompt + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + return prompt_embeds + + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is not None: + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) + else: + has_nsfw_concept = None + return image, has_nsfw_concept + + def decode_latents(self, latents): + latents = 1 / self.vae.config.scaling_factor * latents + image = self.vae.decode(latents).sample + image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + return image + + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." 
+ ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + @torch.no_grad() + # @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: Optional[int] = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + ): + r""" + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. 
More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages the model to generate images that are closely linked to the text
+                `prompt`, usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead.
+                Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will be generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttnProcessor` as defined under
+                `self.processor` in
+                [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
+ + Examples: + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. + When returning a tuple, the first element is a list with the generated images, and the second element is a + list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" + (nsfw) content, according to the `safety_checker`. + """ + # 0. Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds + ) + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. Encode input prompt + prompt_embeds = self._encode_prompt( + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + ) + + # 4. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + # 5. Prepare latent variables + num_channels_latents = self.unet.in_channels + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 7. Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, + ).sample + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + callback(i, t, latents) + + if output_type == "latent": + image = latents + has_nsfw_concept = None + elif output_type == "pil": + # 8. 
Post-processing + image = self.decode_latents(latents) + + # 9. Run safety checker + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + + # 10. Convert to PIL + image = self.numpy_to_pil(image) + else: + # 8. Post-processing + image = self.decode_latents(latents) + + # 9. Run safety checker + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + + if not return_dict: + return (image, has_nsfw_concept) + + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_control_net.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_control_net.py new file mode 100644 index 000000000000..dd62e6046e48 --- /dev/null +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_control_net.py @@ -0,0 +1,26 @@ +import pytest +import torch + +from diffusers import StableDiffusionControlNetPipeline + + +################################################################################ +# PoC version +################################################################################ + +model_id_sd15_canny = "takuma104/control_sd15_canny" # currntry this is private model + + +@pytest.mark.skip +def test_from_pretrained(): + pipe = StableDiffusionControlNetPipeline.from_pretrained(model_id_sd15_canny) + print(pipe) + + +def test_from_pretrained_and_inference(): + pipe = StableDiffusionControlNetPipeline.from_pretrained(model_id_sd15_canny, torch_dtype=torch.bfloat16).to( + "cuda" + ) + image = pipe(prompt="an apple", num_inference_steps=15).images[0] + image.save("/tmp/an_apple_generated.png") + print(image.size) From cf16a43ba0e4c9857bf9759dde6253e6649a81f7 Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Sat, 18 Feb 2023 02:35:28 +0900 Subject: [PATCH 021/122] Impl pipeline, pixel match test (almost) passed. --- .../pipeline_stable_diffusion_controlnet.py | 28 ++++++++--- .../test_stable_diffusion_control_net.py | 48 ++++++++++++++++++- 2 files changed, 67 insertions(+), 9 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py index 1449ba36717f..26d80ca8a922 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py @@ -369,6 +369,7 @@ def __call__( callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, callback_steps: Optional[int] = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, + controlnet_hint: Optional[torch.FloatTensor] = None, ): r""" Function invoked when calling the pipeline for generation. 
@@ -501,13 +502,26 @@ def __call__( latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - # predict the noise residual - noise_pred = self.unet( - latent_model_input, - t, - encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, - ).sample + if controlnet_hint is not None: + # ControlNet predict the noise residual + control = self.controlnet( + latent_model_input, t, encoder_hidden_states=prompt_embeds, controlnet_hint=controlnet_hint + ) + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, + control=control, + ).sample + else: + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, + ).sample # perform guidance if do_classifier_free_guidance: diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_control_net.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_control_net.py index dd62e6046e48..37e0ade8d061 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_control_net.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_control_net.py @@ -1,14 +1,22 @@ +import einops +import numpy as np import pytest import torch from diffusers import StableDiffusionControlNetPipeline +from diffusers.utils import load_image ################################################################################ # PoC version ################################################################################ -model_id_sd15_canny = "takuma104/control_sd15_canny" # currntry this is private model +model_id_sd15_canny = "takuma104/control_sd15_canny" +test_prompt = "best quality, extremely detailed, illustration, looking at viewer" +test_negative_prompt = ( + "longbody, lowres, bad anatomy, bad hands, missing fingers, " + + "pubic hair,extra digit, fewer digits, cropped, worst quality, low quality" +) @pytest.mark.skip @@ -17,10 +25,46 @@ def test_from_pretrained(): print(pipe) -def test_from_pretrained_and_inference(): +@pytest.mark.skip +def test_from_pretrained_and_unet_inference(): pipe = StableDiffusionControlNetPipeline.from_pretrained(model_id_sd15_canny, torch_dtype=torch.bfloat16).to( "cuda" ) image = pipe(prompt="an apple", num_inference_steps=15).images[0] image.save("/tmp/an_apple_generated.png") print(image.size) + + +def test_pixel_match(): + pipe = StableDiffusionControlNetPipeline.from_pretrained(model_id_sd15_canny).to("cuda") + pipe.enable_attention_slicing(1) + + seed = 0 + canny_edged_image = load_image( + "https://huggingface.co/takuma104/controlnet_dev/resolve/main/vermeer_canny_edged.png" + ) + + # reference image generated by https://gist.github.com/takuma104/6cdb6d9aa27f67462f11554cccdf4b34 + output_ref_image = load_image( + f"https://huggingface.co/takuma104/controlnet_dev/resolve/main/vermeer_canny_edged_seed_{seed}.png" + ) + + # code from https://github.com/lllyasviel/ControlNet/blob/main/gradio_canny2image.py + num_samples = 1 + control = torch.from_numpy(np.array(canny_edged_image).copy()).float().cuda() / 255.0 + control = torch.stack([control for _ in range(num_samples)], dim=0) + control = einops.rearrange(control, "b h w c -> b c h w").clone() + + generator = torch.Generator(device="cuda").manual_seed(seed) + image = pipe( + prompt=test_prompt, + negative_prompt=test_negative_prompt, + 
guidance_scale=9.0, + num_inference_steps=20, + generator=generator, + controlnet_hint=control, + ).images[0] + image.save(f"/tmp/seed_{seed}.png") + + max_diff = np.abs(np.array(image).astype(np.int) - np.array(output_ref_image).astype(np.int)).max() + assert max_diff < 10 # must be max_diff == 0 but it appears that there is a tiny difference for some reason. From 9cc8b99399bb47de690803ebd8662f2f2a553c8b Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Sat, 18 Feb 2023 04:58:02 +0900 Subject: [PATCH 022/122] make style --- src/diffusers/__init__.py | 2 +- src/diffusers/pipelines/__init__.py | 2 +- src/diffusers/pipelines/stable_diffusion/__init__.py | 2 +- src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index bcfae36a2c20..e04bbc10bf6b 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -111,9 +111,9 @@ CycleDiffusionPipeline, LDMTextToImagePipeline, PaintByExamplePipeline, - StableDiffusionControlNetPipeline, SemanticStableDiffusionPipeline, StableDiffusionAttendAndExcitePipeline, + StableDiffusionControlNetPipeline, StableDiffusionDepth2ImgPipeline, StableDiffusionImageVariationPipeline, StableDiffusionImg2ImgPipeline, diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index 6753e71f2e10..f5b7026fdc85 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -47,8 +47,8 @@ from .semantic_stable_diffusion import SemanticStableDiffusionPipeline from .stable_diffusion import ( CycleDiffusionPipeline, - StableDiffusionControlNetPipeline, StableDiffusionAttendAndExcitePipeline, + StableDiffusionControlNetPipeline, StableDiffusionDepth2ImgPipeline, StableDiffusionImageVariationPipeline, StableDiffusionImg2ImgPipeline, diff --git a/src/diffusers/pipelines/stable_diffusion/__init__.py b/src/diffusers/pipelines/stable_diffusion/__init__.py index 01cc71e50ce9..c20394845e86 100644 --- a/src/diffusers/pipelines/stable_diffusion/__init__.py +++ b/src/diffusers/pipelines/stable_diffusion/__init__.py @@ -44,8 +44,8 @@ class StableDiffusionPipelineOutput(BaseOutput): else: from .pipeline_cycle_diffusion import CycleDiffusionPipeline from .pipeline_stable_diffusion import StableDiffusionPipeline - from .pipeline_stable_diffusion_controlnet import StableDiffusionControlNetPipeline from .pipeline_stable_diffusion_attend_and_excite import StableDiffusionAttendAndExcitePipeline + from .pipeline_stable_diffusion_controlnet import StableDiffusionControlNetPipeline from .pipeline_stable_diffusion_img2img import StableDiffusionImg2ImgPipeline from .pipeline_stable_diffusion_inpaint import StableDiffusionInpaintPipeline from .pipeline_stable_diffusion_inpaint_legacy import StableDiffusionInpaintPipelineLegacy diff --git a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py index d47466b5cb0a..2d436eae0724 100644 --- a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +++ b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py @@ -43,8 +43,8 @@ LDMTextToImagePipeline, LMSDiscreteScheduler, PNDMScheduler, - StableDiffusionControlNetPipeline, PriorTransformer, + StableDiffusionControlNetPipeline, StableDiffusionPipeline, StableUnCLIPImg2ImgPipeline, StableUnCLIPPipeline, From 7dbbe22ef90b36f2d1932aa9aa2a5381f8c414d9 Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Sat, 18 Feb 2023 05:07:16 +0900 
Subject: [PATCH 023/122] make fix-copies --- .../versatile_diffusion/modeling_text_unet.py | 40 ++++++++++++++++++- .../dummy_torch_and_transformers_objects.py | 15 +++++++ 2 files changed, 54 insertions(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py index 4adf9eed0e29..86f75ee864dd 100644 --- a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py +++ b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py @@ -234,6 +234,7 @@ def __init__( conv_in_kernel: int = 3, conv_out_kernel: int = 3, projection_class_embeddings_input_dim: Optional[int] = None, + controlnet_hint_channels: Optional[int] = None, ): super().__init__() @@ -393,6 +394,18 @@ def __init__( # count how many layers upsample the images self.num_upsamplers = 0 + if controlnet_hint_channels is not None: + # ControlNet: add input_hint_block, zero_conv_block + self.controlnet_input_hint_block = ControlNetInputHintBlock( + hint_channels=controlnet_hint_channels, channels=block_out_channels[0] + ) + self.controlnet_zero_conv_block = ControlNetZeroConvBlock( + block_out_channels=block_out_channels, + down_block_types=down_block_types, + layers_per_block=layers_per_block, + ) + return # Modules from the following lines are not defined in ControlNet + # up reversed_block_out_channels = list(reversed(block_out_channels)) reversed_attention_head_dim = list(reversed(attention_head_dim)) @@ -582,8 +595,10 @@ def forward( timestep_cond: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, + controlnet_hint: Optional[torch.Tensor] = None, + control: Optional[List[torch.Tensor]] = None, return_dict: bool = True, - ) -> Union[UNet2DConditionOutput, Tuple]: + ) -> Union[UNet2DConditionOutput, Tuple, List[torch.Tensor]]: r""" Args: sample (`torch.FloatTensor`): (batch, channel, height, width) noisy inputs tensor @@ -662,6 +677,8 @@ def forward( # 2. pre-process sample = self.conv_in(sample) + if controlnet_hint is not None: + sample += self.controlnet_input_hint_block(controlnet_hint) # 3. down down_block_res_samples = (sample,) @@ -689,6 +706,16 @@ def forward( cross_attention_kwargs=cross_attention_kwargs, ) + if controlnet_hint is not None: + # ControlNet: zero convs + return self.controlnet_zero_conv_block( + down_block_res_samples=down_block_res_samples, mid_block_sample=sample + ) + + if control is not None: + # ControlledUnet: apply mid_zero_conv output + sample += control.pop() + # 5. 
up for i, upsample_block in enumerate(self.up_blocks): is_final_block = i == len(self.up_blocks) - 1 @@ -696,6 +723,12 @@ def forward( res_samples = down_block_res_samples[-len(upsample_block.resnets) :] down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)] + if control is not None: + # ControlledUnet: apply ControlNet downblock zero_convs output + control_samples = control[-len(upsample_block.resnets) :] + control = control[: -len(upsample_block.resnets)] + res_samples = [r + c for r, c in zip(res_samples, control_samples)] + # if we have not reached the final block and need to forward the # upsample size, we do it here if not is_final_block and forward_upsample_size: @@ -715,6 +748,11 @@ def forward( sample = upsample_block( hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples, upsample_size=upsample_size ) + + # TODO: remove this block + if control is not None: + assert len(control) == 0, f"must consume all control array ({len(control)})" + # 6. post-process if self.conv_norm_out: sample = self.conv_norm_out(sample) diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py index 24731a00cabe..1b0f812ad16c 100644 --- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py +++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py @@ -107,6 +107,21 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "transformers"]) +class StableDiffusionControlNetPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + class StableDiffusionDepth2ImgPipeline(metaclass=DummyObject): _backends = ["torch", "transformers"] From a316d863ac8b1ebeb9dd13ca5c2179f247edb18f Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Sat, 18 Feb 2023 05:19:37 +0900 Subject: [PATCH 024/122] Fix to add import ControlNet blocks for `make fix-copies` --- .../pipelines/versatile_diffusion/modeling_text_unet.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py index 86f75ee864dd..210daaec7f2d 100644 --- a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py +++ b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py @@ -7,6 +7,7 @@ from ...configuration_utils import ConfigMixin, register_to_config from ...models import ModelMixin from ...models.attention import CrossAttention +from ...models.controlnet_blocks import ControlNetInputHintBlock, ControlNetZeroConvBlock from ...models.cross_attention import AttnProcessor, CrossAttnAddedKVProcessor from ...models.dual_transformer_2d import DualTransformer2DModel from ...models.embeddings import GaussianFourierProjection, TimestepEmbedding, Timesteps From b17fd20d9206a8adce00bbddf878cfd6c293457e Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Sat, 18 Feb 2023 19:54:32 +0900 Subject: [PATCH 025/122] Remove einops dependency --- .../test_stable_diffusion_control_net.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_control_net.py 
b/tests/pipelines/stable_diffusion/test_stable_diffusion_control_net.py index 37e0ade8d061..d8dc66c64667 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_control_net.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_control_net.py @@ -1,4 +1,3 @@ -import einops import numpy as np import pytest import torch @@ -49,11 +48,10 @@ def test_pixel_match(): f"https://huggingface.co/takuma104/controlnet_dev/resolve/main/vermeer_canny_edged_seed_{seed}.png" ) - # code from https://github.com/lllyasviel/ControlNet/blob/main/gradio_canny2image.py - num_samples = 1 + batch = 1 control = torch.from_numpy(np.array(canny_edged_image).copy()).float().cuda() / 255.0 - control = torch.stack([control for _ in range(num_samples)], dim=0) - control = einops.rearrange(control, "b h w c -> b c h w").clone() + control = control.repeat(batch, 1, 1, 1) + control = control.permute(0, 3, 1, 2) # b h w c -> b c h w generator = torch.Generator(device="cuda").manual_seed(seed) image = pipe( @@ -66,5 +64,5 @@ def test_pixel_match(): ).images[0] image.save(f"/tmp/seed_{seed}.png") - max_diff = np.abs(np.array(image).astype(np.int) - np.array(output_ref_image).astype(np.int)).max() + max_diff = np.abs(np.array(image).astype(np.int32) - np.array(output_ref_image).astype(np.int32)).max() assert max_diff < 10 # must be max_diff == 0 but it appears that there is a tiny difference for some reason. From 894bd84ad8c5e438fb28d956e3a60d10d13d0b02 Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Sat, 18 Feb 2023 21:32:07 +0900 Subject: [PATCH 026/122] Support np.ndarray, PIL.Image for controlnet_hint --- .../pipeline_stable_diffusion_controlnet.py | 72 ++++++++++++++++--- .../test_stable_diffusion_control_net.py | 29 ++++++++ 2 files changed, 93 insertions(+), 8 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py index 26d80ca8a922..dc337c965477 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py @@ -16,6 +16,8 @@ import inspect from typing import Any, Callable, Dict, List, Optional, Union +import numpy as np +import PIL.Image import torch from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer @@ -348,6 +350,57 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype latents = latents * self.scheduler.init_noise_sigma return latents + def controlnet_hint_conversion(self, controlnet_hint, height, width, num_images_per_prompt): + channels = 3 + if isinstance(controlnet_hint, torch.Tensor): + # torch.Tensor: acceptble shape are any of chw, bchw(b==1) or bchw(b==num_images_per_prompt) + shape_chw = (channels, height, width) + shape_bchw = (1, channels, height, width) + shape_nchw = (num_images_per_prompt, channels, height, width) + if controlnet_hint.shape in [shape_chw, shape_bchw, shape_nchw]: + controlnet_hint = controlnet_hint.to(dtype=self.controlnet.dtype, device=self.controlnet.device) + if controlnet_hint.shape != shape_nchw: + controlnet_hint = controlnet_hint.repeat(num_images_per_prompt, 1, 1, 1) + return controlnet_hint + else: + raise ValueError( + f"Acceptble shape of `controlnet_hint` are any of ({channels}, {height}, {width})," + + f" (1, {channels}, {height}, {width}) or ({num_images_per_prompt}, " + + f"{channels}, {height}, {width}) but is {controlnet_hint.shape}" + ) + 
elif isinstance(controlnet_hint, np.ndarray): + # np.ndarray: acceptable shape is any of hwc, bhwc(b==1) or bhwc(b==num_images_per_promot) + # hwc is opencv compatible image format + shape_hwc = (height, width, channels) + shape_bhwc = (1, height, width, channels) + shape_nhwc = (num_images_per_prompt, height, width, channels) + if controlnet_hint.shape in [shape_hwc, shape_bhwc, shape_nhwc]: + controlnet_hint = torch.from_numpy(controlnet_hint.copy()) + controlnet_hint = controlnet_hint.to(dtype=self.controlnet.dtype, device=self.controlnet.device) + controlnet_hint /= 255.0 + if controlnet_hint.shape != shape_nhwc: + controlnet_hint = controlnet_hint.repeat(num_images_per_prompt, 1, 1, 1) + controlnet_hint = controlnet_hint.permute(0, 3, 1, 2) # b h w c -> b c h w + return controlnet_hint + else: + raise ValueError( + f"Acceptble shape of `controlnet_hint` are any of ({height}, {width}, {channels})," + + f" (1, {height}, {width}, {channels}) or ({num_images_per_prompt}, " + + f"{channels}, {height}, {width}) but is {controlnet_hint.shape}" + ) + elif isinstance(controlnet_hint, PIL.Image.Image): + if controlnet_hint.size == (width, height): + # controlnet_hint = controlnet_hint.convert("RGB") # make sure 3 channel RGB format + return self.controlnet_hint_conversion(np.array(controlnet_hint), height, width, num_images_per_prompt) + else: + raise ValueError( + f"Acceptable image size of `controlnet_hint` is ({width}, {height}) but is {controlnet_hint.size}" + ) + else: + raise ValueError( + f"Acceptable type of `controlnet_hint` are any of torch.Tensor, np.ndarray, PIL.Image.Image but is {type(controlnet_hint)}" + ) + @torch.no_grad() # @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( @@ -369,7 +422,7 @@ def __call__( callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, callback_steps: Optional[int] = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - controlnet_hint: Optional[torch.FloatTensor] = None, + controlnet_hint: Optional[Union[torch.FloatTensor, np.ndarray, PIL.Image.Image]] = None, ): r""" Function invoked when calling the pipeline for generation. @@ -444,12 +497,15 @@ def __call__( height = height or self.unet.config.sample_size * self.vae_scale_factor width = width or self.unet.config.sample_size * self.vae_scale_factor - # 1. Check inputs. Raise error if not correct + # 1. Control Embedding check & conversion + controlnet_hint = self.controlnet_hint_conversion(controlnet_hint, height, width, num_images_per_prompt) + + # 2. Check inputs. Raise error if not correct self.check_inputs( prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds ) - # 2. Define call parameters + # 3. Define call parameters if prompt is not None and isinstance(prompt, str): batch_size = 1 elif prompt is not None and isinstance(prompt, list): @@ -463,7 +519,7 @@ def __call__( # corresponds to doing no classifier free guidance. do_classifier_free_guidance = guidance_scale > 1.0 - # 3. Encode input prompt + # 4. Encode input prompt prompt_embeds = self._encode_prompt( prompt, device, @@ -474,11 +530,11 @@ def __call__( negative_prompt_embeds=negative_prompt_embeds, ) - # 4. Prepare timesteps + # 5. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps, device=device) timesteps = self.scheduler.timesteps - # 5. Prepare latent variables + # 6. 
Prepare latent variables num_channels_latents = self.unet.in_channels latents = self.prepare_latents( batch_size * num_images_per_prompt, @@ -491,10 +547,10 @@ def __call__( latents, ) - # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - # 7. Denoising loop + # 8. Denoising loop num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_control_net.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_control_net.py index d8dc66c64667..f9e4e8e06a8f 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_control_net.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_control_net.py @@ -66,3 +66,32 @@ def test_pixel_match(): max_diff = np.abs(np.array(image).astype(np.int32) - np.array(output_ref_image).astype(np.int32)).max() assert max_diff < 10 # must be max_diff == 0 but it appears that there is a tiny difference for some reason. + + +def test_pixel_match_image_argument(): + pipe = StableDiffusionControlNetPipeline.from_pretrained(model_id_sd15_canny).to("cuda") + pipe.enable_attention_slicing(1) + + seed = 0 + canny_edged_image = load_image( + "https://huggingface.co/takuma104/controlnet_dev/resolve/main/vermeer_canny_edged.png" + ) + + # reference image generated by https://gist.github.com/takuma104/6cdb6d9aa27f67462f11554cccdf4b34 + output_ref_image = load_image( + f"https://huggingface.co/takuma104/controlnet_dev/resolve/main/vermeer_canny_edged_seed_{seed}.png" + ) + + generator = torch.Generator(device="cuda").manual_seed(seed) + image = pipe( + prompt=test_prompt, + negative_prompt=test_negative_prompt, + guidance_scale=9.0, + num_inference_steps=20, + generator=generator, + controlnet_hint=canny_edged_image, + ).images[0] + image.save(f"/tmp/seed_{seed}.png") + + max_diff = np.abs(np.array(image).astype(np.int32) - np.array(output_ref_image).astype(np.int32)).max() + assert max_diff < 10 # must be max_diff == 0 but it appears that there is a tiny difference for some reason. 
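
At this point in the series, `controlnet_hint` accepts a `torch.FloatTensor`, an OpenCV-style `np.ndarray`, or a `PIL.Image.Image`, and the pixel-match tests above exercise the tensor and PIL paths. The sketch below is an illustration of those conversion rules, not part of the patch series: it assumes the `takuma104/control_sd15_canny` checkpoint and the 512x512 Canny image already used by the tests, and shows how each accepted hint type could be passed to the PoC pipeline.

```py
# Usage sketch for the controlnet_hint conversion rules added above (assumes the
# takuma104/control_sd15_canny checkpoint and test image; not part of the patches).
import numpy as np
import torch

from diffusers import StableDiffusionControlNetPipeline
from diffusers.utils import load_image

pipe = StableDiffusionControlNetPipeline.from_pretrained("takuma104/control_sd15_canny").to("cuda")

# 512x512 Canny edge image used by the pixel-match tests
hint_pil = load_image("https://huggingface.co/takuma104/controlnet_dev/resolve/main/vermeer_canny_edged.png")

prompt = "best quality, extremely detailed"

# 1) PIL.Image.Image: converted to RGB and normalized internally
image = pipe(prompt=prompt, controlnet_hint=hint_pil, num_inference_steps=20).images[0]

# 2) np.ndarray: HWC uint8 (OpenCV-compatible layout), divided by 255 internally
hint_np = np.array(hint_pil)
image = pipe(prompt=prompt, controlnet_hint=hint_np, num_inference_steps=20).images[0]

# 3) torch.Tensor: CHW (or BCHW) float in [0, 1], passed through as-is
hint_pt = torch.from_numpy(np.array(hint_pil)).float().permute(2, 0, 1) / 255.0
image = pipe(prompt=prompt, controlnet_hint=hint_pt, num_inference_steps=20).images[0]
```

All three calls should produce equivalent guidance, since each input ends up as a `(batch, 3, height, width)` float tensor on the ControlNet's device before being consumed.
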
From 3d3a02f0d5d14213d87eab5dd4c3e84281883dd1 Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Sat, 18 Feb 2023 23:24:21 +0900 Subject: [PATCH 027/122] set default config file as lllyasviel's --- .../stable_diffusion/convert_from_ckpt.py | 25 ++++++++++--------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py index 2d436eae0724..5ce083b76521 100644 --- a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +++ b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py @@ -1452,21 +1452,22 @@ def load_pipeline_from_control_net_ckpt( original_config_file = os.path.join(tmpdir, "inference.yaml") if key_name in checkpoint and checkpoint[key_name].shape[-1] == 1024: - if not os.path.isfile("v2-inference-v.yaml"): - # model_type = "v2" - r = requests.get( - " https://raw.githubusercontent.com/Stability-AI/stablediffusion/main/configs/stable-diffusion/v2-inference-v.yaml" - ) - open(original_config_file, "wb").write(r.content) - - if global_step == 110000: - # v2.1 needs to upcast attention - upcast_attention = True + raise NotImplementedError("Currently only support SD1.x models.") + # if not os.path.isfile("v2-inference-v.yaml"): + # # model_type = "v2" + # r = requests.get( + # " https://raw.githubusercontent.com/Stability-AI/stablediffusion/main/configs/stable-diffusion/v2-inference-v.yaml" + # ) + # open(original_config_file, "wb").write(r.content) + + # if global_step == 110000: + # # v2.1 needs to upcast attention + # upcast_attention = True else: - if not os.path.isfile("v1-inference.yaml"): + if not os.path.isfile("cldm_v15.yaml"): # model_type = "v1" r = requests.get( - " https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml" + "https://raw.githubusercontent.com/lllyasviel/ControlNet/main/models/cldm_v15.yaml" ) open(original_config_file, "wb").write(r.content) From 38bf48d245bfa2e620b031ac7d79c0402ea8bcc6 Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Sun, 19 Feb 2023 01:31:00 +0900 Subject: [PATCH 028/122] Add support grayscale (hw) numpy array --- .../pipeline_stable_diffusion_controlnet.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py index dc337c965477..3c3ae12fdbaf 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py @@ -369,8 +369,10 @@ def controlnet_hint_conversion(self, controlnet_hint, height, width, num_images_ + f"{channels}, {height}, {width}) but is {controlnet_hint.shape}" ) elif isinstance(controlnet_hint, np.ndarray): - # np.ndarray: acceptable shape is any of hwc, bhwc(b==1) or bhwc(b==num_images_per_promot) + # np.ndarray: acceptable shape is any of hw, hwc, bhwc(b==1) or bhwc(b==num_images_per_promot) # hwc is opencv compatible image format + if controlnet_hint.shape == (height, width): + controlnet_hint = np.repeat(controlnet_hint[:, :, np.newaxis], channels, axis=2) # hw -> hwc(c==3) shape_hwc = (height, width, channels) shape_bhwc = (1, height, width, channels) shape_nhwc = (num_images_per_prompt, height, width, channels) @@ -384,13 +386,14 @@ def controlnet_hint_conversion(self, controlnet_hint, height, width, num_images_ return 
controlnet_hint else: raise ValueError( - f"Acceptble shape of `controlnet_hint` are any of ({height}, {width}, {channels})," - + f" (1, {height}, {width}, {channels}) or ({num_images_per_prompt}, " - + f"{channels}, {height}, {width}) but is {controlnet_hint.shape}" + f"Acceptble shape of `controlnet_hint` are any of ({width}, {channels}), " + + f"({height}, {width}, {channels}), " + + f"(1, {height}, {width}, {channels}) or " + + f"({num_images_per_prompt}, {channels}, {height}, {width}) but is {controlnet_hint.shape}" ) elif isinstance(controlnet_hint, PIL.Image.Image): if controlnet_hint.size == (width, height): - # controlnet_hint = controlnet_hint.convert("RGB") # make sure 3 channel RGB format + controlnet_hint = controlnet_hint.convert("RGB") # make sure 3 channel RGB format return self.controlnet_hint_conversion(np.array(controlnet_hint), height, width, num_images_per_prompt) else: raise ValueError( From cc597a1b310aac87a1444eaf520b028dc93eefeb Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Sun, 19 Feb 2023 03:13:30 +0900 Subject: [PATCH 029/122] Add and update docstrings --- src/diffusers/models/controlnet_blocks.py | 22 ++++++++ src/diffusers/models/unet_2d_condition.py | 23 +++++--- .../pipeline_stable_diffusion_controlnet.py | 55 ++++++++++++++++++- 3 files changed, 90 insertions(+), 10 deletions(-) diff --git a/src/diffusers/models/controlnet_blocks.py b/src/diffusers/models/controlnet_blocks.py index 7b7941b42675..3fbcea427a9d 100644 --- a/src/diffusers/models/controlnet_blocks.py +++ b/src/diffusers/models/controlnet_blocks.py @@ -1,3 +1,25 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +r""" +ControlNet Block Implementation + +Paper: Adding Conditional Control to Text-to-Image Diffusion Models. https://arxiv.org/abs/2302.05543 + +Reference implementation: https://github.com/lllyasviel/ControlNet +""" + from typing import List, Tuple import torch diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py index 2789f2454fed..61b309b1c9e6 100644 --- a/src/diffusers/models/unet_2d_condition.py +++ b/src/diffusers/models/unet_2d_condition.py @@ -1,4 +1,4 @@ -# Copyright 2022 The HuggingFace Team. All rights reserved. +# Copyright 2023 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -106,6 +106,9 @@ class conditioning with `class_embed_type` equal to `None`. conv_out_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_out` layer. projection_class_embeddings_input_dim (`int`, *optional*): The dimension of the `class_labels` input when using the "projection" `class_embed_type`. Required when using the "projection" `class_embed_type`. + controlnet_hint_channels (`int`, *optional*, default to `None`): + The number of channels in the `controlnet_hint`. If this value is not None, this unet model behaves as + ControlNet. 
""" _supports_gradient_checkpointing = True @@ -506,10 +509,10 @@ def forward( timestep_cond: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - controlnet_hint: Optional[torch.Tensor] = None, - control: Optional[List[torch.Tensor]] = None, + controlnet_hint: Optional[torch.FloatTensor] = None, + control: Optional[List[torch.FloatTensor]] = None, return_dict: bool = True, - ) -> Union[UNet2DConditionOutput, Tuple, List[torch.Tensor]]: + ) -> Union[UNet2DConditionOutput, Tuple, List[torch.FloatTensor]]: r""" Args: sample (`torch.FloatTensor`): (batch, channel, height, width) noisy inputs tensor @@ -521,11 +524,17 @@ def forward( A kwargs dictionary that if specified is passed along to the `AttnProcessor` as defined under `self.processor` in [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). - + controlnet_hint (`torch.FloatTensor`, *optional*, defaults to `None`): + ControlNet input embedding. If `controlnet_hint_channel` of `__init__()` is not None, it must be + specified as a tensors. + control (`List[torch.FloatTensor]`, *optional*, defaults to `None`): + If `control` is not None, this unet model behaves as ControlledUnet. ControlledUnet is controlled by + this list of tensors. Returns: - [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`: + [`~models.unet_2d_condition.UNet2DConditionOutput`] , `tuple` or [torch.FloatTensor]: [`~models.unet_2d_condition.UNet2DConditionOutput`] if `return_dict` is True, otherwise a `tuple`. When - returning a tuple, the first element is the sample tensor. + returning a tuple, the first element is the sample tensor. If `controlnet_hint` is not None, the ControlNet + result of the processing is output as a list of tensors. """ # By default samples have to be AT least a multiple of the overall upsampling factor. # The overall upsampling factor is equal to 2 ** (# num of upsampling layears). diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py index 3c3ae12fdbaf..7147abe513c1 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py @@ -1,4 +1,4 @@ -# Copyright 2022 The HuggingFace Team. All rights reserved. +# Copyright 2023 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -23,7 +23,7 @@ from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers -from ...utils import is_accelerate_available, logging, randn_tensor +from ...utils import is_accelerate_available, logging, randn_tensor, replace_example_docstring from ..pipeline_utils import DiffusionPipeline from . import StableDiffusionPipelineOutput from .safety_checker import StableDiffusionSafetyChecker @@ -31,8 +31,52 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers import StableDiffusionControlNetPipeline + >>> from diffusers.utils import load_image + + >>> # Canny edged image for control + >>> canny_edged_image = load_image( + ... "https://huggingface.co/takuma104/controlnet_dev/resolve/main/vermeer_canny_edged.png" + ... 
) + >>> pipe = StableDiffusionControlNetPipeline.from_pretrained("takuma104/control_sd15_canny").to("cuda") + >>> image = pipe(prompt="best quality, extremely detailed", controlnet_hint=canny_edged_image).images[0] + ``` +""" + class StableDiffusionControlNetPipeline(DiffusionPipeline): + r""" + Pipeline for text-to-image generation using Stable Diffusion with ControlNet guidance. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`CLIPTextModel`]): + Frozen text-encoder. Stable Diffusion uses the text portion of + [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically + the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. + tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. + controlnet ([`UNet2DConditionModel`]): + [ControlNet](https://arxiv.org/abs/2302.05543) architecture to generate guidance. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + safety_checker ([`StableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offensive or harmful. + Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. + feature_extractor ([`CLIPFeatureExtractor`]): + Model that extracts features from generated images to be used as inputs for the `safety_checker`. + """ + def __init__( self, vae: AutoencoderKL, @@ -405,7 +449,7 @@ def controlnet_hint_conversion(self, controlnet_hint, height, width, num_images_ ) @torch.no_grad() - # @replace_example_docstring(EXAMPLE_DOC_STRING) + @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( self, prompt: Union[str, List[str]] = None, @@ -486,6 +530,11 @@ def __call__( A kwargs dictionary that if specified is passed along to the `AttnProcessor` as defined under `self.processor` in [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). + controlnet_hint (`torch.FloatTensor`, `np.ndarray` or `PIL.Image.Image`, *optional*): + ControlNet input embedding. ControlNet generates guidances using this input embedding. If the type is + specified as `torch.FloatTensor`, it is passed to ControlNet as is. If the type is `np.ndarray`, it is + assumed to be an OpenCV compatible image format. PIL.Image.Image` can also be accepted as an image. The + size of all these types must correspond to the output image size. 
Examples: From 4bcc159bfb37e7b4623f2ab42d7c74645bedb188 Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Sun, 19 Feb 2023 03:33:15 +0900 Subject: [PATCH 030/122] add control_net.mdx --- .../stable_diffusion/control_net.mdx | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 docs/source/en/api/pipelines/stable_diffusion/control_net.mdx diff --git a/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx b/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx new file mode 100644 index 000000000000..f49f2a47d33a --- /dev/null +++ b/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx @@ -0,0 +1,36 @@ + + +# Text-to-Image Generation with ControlNet guidance + +## StableDiffusionControlNetPipeline + +ControlNet by [@lllyasviel](https://huggingface.co/lllyasviel) is a neural network structure to control diffusion models by adding extra conditions. + +It has integration with Stable Diffusion and 8 pre-trained models that conditions the models on different attributes +(such as edge detection, scribbles, depth maps, semantic segmentations and more) + +The original codebase can be found here: +- [lllyasviel/ControlNet](https://github.com/lllyasviel/ControlNet) + +Available Checkpoints are: +- TODO: fill here. + +[[autodoc]] StableDiffusionControlNetPipeline + - all + - __call__ + - enable_attention_slicing + - disable_attention_slicing + - enable_vae_slicing + - disable_vae_slicing + - enable_xformers_memory_efficient_attention + - disable_xformers_memory_efficient_attention \ No newline at end of file From 33841b635a67f162b1d08c57be4d28f4b584ec00 Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Sun, 19 Feb 2023 03:39:35 +0900 Subject: [PATCH 031/122] add control_net.mdx to toctree --- docs/source/en/_toctree.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index cfbdac08a3fb..8dba589ec56f 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -165,6 +165,8 @@ title: Self-Attention Guidance - local: api/pipelines/stable_diffusion/panorama title: MultiDiffusion Panorama + - local: api/pipelines/stable_diffusion/control_net + title: Text-to-Image with ControlNet guidance title: Stable Diffusion - local: api/pipelines/stable_diffusion_2 title: Stable Diffusion 2 From 9a37409663a53f775fa380db332d37d7ea75c915 Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Mon, 20 Feb 2023 00:32:41 +0900 Subject: [PATCH 032/122] Update copyright year --- scripts/convert_controlnet_to_diffusers.py | 2 +- src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/convert_controlnet_to_diffusers.py b/scripts/convert_controlnet_to_diffusers.py index b73c089cce28..3cf4ba6d28ad 100644 --- a/scripts/convert_controlnet_to_diffusers.py +++ b/scripts/convert_controlnet_to_diffusers.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. +# Copyright 2023 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
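The `controlnet_hint` docstring added in PATCH 029 enumerates three accepted input types. The sketch below shows one way to produce each of them; the blank edge map, the [0, 1] scaling on the tensor path, and the 512x512 size are illustrative assumptions, not values taken from these patches.

```py
import numpy as np
import PIL.Image
import torch

height, width = 512, 512  # must match the requested output size

# stand-in for a real Canny edge map (in practice e.g. the output of cv2.Canny)
edges = np.zeros((height, width), dtype=np.uint8)

# np.ndarray: treated as an OpenCV-compatible image, i.e. HWC with BGR channels;
# a plain HW map is also accepted and tiled to three channels by the helper
np_hint = np.repeat(edges[:, :, np.newaxis], 3, axis=2)

# PIL.Image.Image: must match the output size; the pipeline converts it to RGB
# itself (and, after the BGR fix later in this series, reorders the channels)
pil_hint = PIL.Image.fromarray(edges).convert("RGB")

# torch.FloatTensor: passed to the ControlNet as is, so it should already be
# CHW (or BCHW) and pre-scaled; [0, 1] scaling is assumed here
torch_hint = torch.from_numpy(np_hint.astype(np.float32) / 255.0).permute(2, 0, 1)
```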
diff --git a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py index 5ce083b76521..51f6530f5bae 100644 --- a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +++ b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. +# Copyright 2023 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 0a1bb4574a0b788540eea306d09b12aaab2208ae Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Mon, 20 Feb 2023 00:43:23 +0900 Subject: [PATCH 033/122] Fix to add PIL.Image RGB->BGR conversion - thanks @Mystfit --- .../pipeline_stable_diffusion_controlnet.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py index 7147abe513c1..e77348c3d016 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py @@ -414,7 +414,7 @@ def controlnet_hint_conversion(self, controlnet_hint, height, width, num_images_ ) elif isinstance(controlnet_hint, np.ndarray): # np.ndarray: acceptable shape is any of hw, hwc, bhwc(b==1) or bhwc(b==num_images_per_promot) - # hwc is opencv compatible image format + # hwc is opencv compatible image format. Color channel must be BGR Format. if controlnet_hint.shape == (height, width): controlnet_hint = np.repeat(controlnet_hint[:, :, np.newaxis], channels, axis=2) # hw -> hwc(c==3) shape_hwc = (height, width, channels) @@ -438,7 +438,9 @@ def controlnet_hint_conversion(self, controlnet_hint, height, width, num_images_ elif isinstance(controlnet_hint, PIL.Image.Image): if controlnet_hint.size == (width, height): controlnet_hint = controlnet_hint.convert("RGB") # make sure 3 channel RGB format - return self.controlnet_hint_conversion(np.array(controlnet_hint), height, width, num_images_per_prompt) + controlnet_hint = np.array(controlnet_hint) # to numpy + controlnet_hint = controlnet_hint[:, :, ::-1] # RGB -> BGR + return self.controlnet_hint_conversion(controlnet_hint, height, width, num_images_per_prompt) else: raise ValueError( f"Acceptable image size of `controlnet_hint` is ({width}, {height}) but is {controlnet_hint.size}" From 90d05e9cc3e57747dedbef82ab9ef2b320a1a6c3 Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Mon, 20 Feb 2023 03:22:47 +0900 Subject: [PATCH 034/122] make fix-copies --- .../versatile_diffusion/modeling_text_unet.py | 21 +++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py index 210daaec7f2d..7a363a591b27 100644 --- a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py +++ b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py @@ -187,6 +187,9 @@ class conditioning with `class_embed_type` equal to `None`. conv_out_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_out` layer. projection_class_embeddings_input_dim (`int`, *optional*): The dimension of the `class_labels` input when using the "projection" `class_embed_type`. Required when using the "projection" `class_embed_type`. 
+ controlnet_hint_channels (`int`, *optional*, default to `None`): + The number of channels in the `controlnet_hint`. If this value is not None, this unet model behaves as + ControlNet. """ _supports_gradient_checkpointing = True @@ -596,10 +599,10 @@ def forward( timestep_cond: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - controlnet_hint: Optional[torch.Tensor] = None, - control: Optional[List[torch.Tensor]] = None, + controlnet_hint: Optional[torch.FloatTensor] = None, + control: Optional[List[torch.FloatTensor]] = None, return_dict: bool = True, - ) -> Union[UNet2DConditionOutput, Tuple, List[torch.Tensor]]: + ) -> Union[UNet2DConditionOutput, Tuple, List[torch.FloatTensor]]: r""" Args: sample (`torch.FloatTensor`): (batch, channel, height, width) noisy inputs tensor @@ -611,11 +614,17 @@ def forward( A kwargs dictionary that if specified is passed along to the `AttnProcessor` as defined under `self.processor` in [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). - + controlnet_hint (`torch.FloatTensor`, *optional*, defaults to `None`): + ControlNet input embedding. If `controlnet_hint_channel` of `__init__()` is not None, it must be + specified as a tensors. + control (`List[torch.FloatTensor]`, *optional*, defaults to `None`): + If `control` is not None, this unet model behaves as ControlledUnet. ControlledUnet is controlled by + this list of tensors. Returns: - [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`: + [`~models.unet_2d_condition.UNet2DConditionOutput`] , `tuple` or [torch.FloatTensor]: [`~models.unet_2d_condition.UNet2DConditionOutput`] if `return_dict` is True, otherwise a `tuple`. When - returning a tuple, the first element is the sample tensor. + returning a tuple, the first element is the sample tensor. If `controlnet_hint` is not None, the ControlNet + result of the processing is output as a list of tensors. """ # By default samples have to be AT least a multiple of the overall upsampling factor. # The overall upsampling factor is equal to 2 ** (# num of upsampling layears). 
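The docstrings above describe a forward pass split across two `UNet2DConditionModel` instances: one constructed with `controlnet_hint_channels`, which returns a list of control tensors, and a plain one that consumes that list through `control=`. Below is a minimal sketch of that wiring, mirroring the tiny random configuration of the fast test added in the next patch; the hidden-state and timestep values are dummies, and it assumes the ControlNet-configured model tolerates the omitted up blocks, as PATCH 037 later makes explicit.

```py
import torch
from diffusers import UNet2DConditionModel

common = dict(
    block_out_channels=(32, 64),
    down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"),
    cross_attention_dim=32,
    attention_head_dim=8,
    in_channels=4,
    layers_per_block=2,
    sample_size=32,
)

# ControlNet side: selected via controlnet_hint_channels; up_block_types omitted
controlnet = UNet2DConditionModel(**common, controlnet_hint_channels=3)
# Controlled UNet side: an ordinary text-conditioned UNet
unet = UNet2DConditionModel(**common, up_block_types=("UpBlock2D", "CrossAttnUpBlock2D"))

latents = torch.randn(1, 4, 32, 32)
hint = torch.randn(1, 3, 32 * 8, 32 * 8)  # image-space hint, 8x the latent resolution
text = torch.randn(1, 77, 32)             # dummy encoder hidden states (cross_attention_dim=32)
t = torch.tensor([10])

with torch.no_grad():
    # the ControlNet-configured model returns the list of control tensors ...
    control = controlnet(latents, timestep=t, encoder_hidden_states=text, controlnet_hint=hint)
    # ... which the controlled UNet consumes alongside its own features
    sample = unet(latents, timestep=t, encoder_hidden_states=text, control=control).sample

assert sample.shape == latents.shape
```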
From 3ade8c0cc3eb6fe6384d97932bd02189405f70fb Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Tue, 21 Feb 2023 02:44:33 +0900 Subject: [PATCH 035/122] add basic fast test for controlnet --- tests/models/test_models_unet_2d_condition.py | 73 +++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/tests/models/test_models_unet_2d_condition.py b/tests/models/test_models_unet_2d_condition.py index 6ee8c2ffc002..adee01e9b878 100644 --- a/tests/models/test_models_unet_2d_condition.py +++ b/tests/models/test_models_unet_2d_condition.py @@ -83,6 +83,16 @@ def dummy_input(self): return {"sample": noise, "timestep": time_step, "encoder_hidden_states": encoder_hidden_states} + @property + def dummpy_input_for_controlnet(self): + hint_channels = 3 + input_dict = self.dummy_input + sample_shape = input_dict["sample"].shape + input_dict["controlnet_hint"] = floats_tensor( + (sample_shape[0], hint_channels, sample_shape[2] * 8, sample_shape[3] * 8) + ).to(torch_device) + return input_dict + @property def input_shape(self): return (4, 32, 32) @@ -106,6 +116,21 @@ def prepare_init_args_and_inputs_for_common(self): inputs_dict = self.dummy_input return init_dict, inputs_dict + def prepare_init_args_and_inputs_for_controlnet(self): + init_dict = { + "block_out_channels": (32, 64), + "down_block_types": ("CrossAttnDownBlock2D", "DownBlock2D"), + "up_block_types": ("UpBlock2D", "CrossAttnUpBlock2D"), # dummy, to avoid ValueError + "cross_attention_dim": 32, + "attention_head_dim": 8, + "in_channels": 4, + "layers_per_block": 2, + "sample_size": 32, + "controlnet_hint_channels": 3, + } + inputs_dict = self.dummpy_input_for_controlnet + return init_dict, inputs_dict + @unittest.skipIf( torch_device != "cuda" or not is_xformers_available(), reason="XFormers attention is only available with CUDA and `xformers` installed", @@ -441,6 +466,54 @@ def test_lora_xformers_on_off(self): assert (sample - on_sample).abs().max() < 1e-4 assert (sample - off_sample).abs().max() < 1e-4 + def test_model_controlnet_inference(self): + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_controlnet() + + model = self.model_class(**init_dict) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + output = model(**inputs_dict) + + conv_in_n = 1 + cross_attn_n = init_dict["layers_per_block"] + 1 # with down sampler + down_block_n = init_dict["layers_per_block"] # no down sampler + mid_block_n = 1 + assert len(output) == conv_in_n + cross_attn_n + down_block_n + mid_block_n + + cross_attn_ch = init_dict["block_out_channels"][0] + down_block_ch = init_dict["block_out_channels"][1] + inshape = self.input_shape + batch = inputs_dict["sample"].shape[0] + s1 = (batch, cross_attn_ch, inshape[1], inshape[2]) + s2 = (batch, cross_attn_ch, inshape[1] / 2, inshape[2] / 2) + s3 = (batch, down_block_ch, inshape[1] / 2, inshape[2] / 2) + expected_shape = [s1, s1, s1, s2, s3, s3, s3] + assert all([out.shape == shape for out, shape in zip(output, expected_shape)]) + + def test_model_controlnet_and_unet_inference(self): + controlnet_init_dict, controlnet_inputs_dict = self.prepare_init_args_and_inputs_for_controlnet() + unet_init_dict, unet_inputs_dict = self.prepare_init_args_and_inputs_for_common() + + controlnet_model = self.model_class(**controlnet_init_dict) + controlnet_model.to(torch_device) + controlnet_model.eval() + + unet_model = self.model_class(**unet_init_dict) + unet_model.to(torch_device) + unet_model.eval() + + with torch.no_grad(): + control = controlnet_model(**controlnet_inputs_dict) + 
unet_inputs_dict["control"] = control + output = unet_model(**unet_inputs_dict) + if isinstance(output, dict): + output = output.sample + + batch = unet_inputs_dict["sample"].shape[0] + assert output.shape == (batch,) + self.output_shape + @slow class UNet2DConditionModelIntegrationTests(unittest.TestCase): From d5965c767cc8d729dc8f6939fa9f6a2034199141 Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Tue, 21 Feb 2023 03:43:54 +0900 Subject: [PATCH 036/122] add slow test for controlnet/unet --- tests/models/test_models_unet_2d_condition.py | 57 ++++++++++++++++++- 1 file changed, 55 insertions(+), 2 deletions(-) diff --git a/tests/models/test_models_unet_2d_condition.py b/tests/models/test_models_unet_2d_condition.py index adee01e9b878..daae562773cc 100644 --- a/tests/models/test_models_unet_2d_condition.py +++ b/tests/models/test_models_unet_2d_condition.py @@ -26,6 +26,7 @@ floats_tensor, load_hf_numpy, logging, + randn_tensor, require_torch_gpu, slow, torch_all_close, @@ -531,12 +532,12 @@ def get_latents(self, seed=0, shape=(4, 4, 64, 64), fp16=False): image = torch.from_numpy(load_hf_numpy(self.get_file_format(seed, shape))).to(torch_device).to(dtype) return image - def get_unet_model(self, fp16=False, model_id="CompVis/stable-diffusion-v1-4"): + def get_unet_model(self, fp16=False, model_id="CompVis/stable-diffusion-v1-4", subfolder="unet"): revision = "fp16" if fp16 else None torch_dtype = torch.float16 if fp16 else torch.float32 model = UNet2DConditionModel.from_pretrained( - model_id, subfolder="unet", torch_dtype=torch_dtype, revision=revision + model_id, subfolder=subfolder, torch_dtype=torch_dtype, revision=revision ) model.to(torch_device).eval() @@ -820,3 +821,55 @@ def test_stabilityai_sd_v2_fp16(self, seed, timestep, expected_slice): expected_output_slice = torch.tensor(expected_slice) assert torch_all_close(output_slice, expected_output_slice, atol=5e-3) + + @parameterized.expand( + [ + # fmt: off + [83, 4, [-0.0343, -0.7764, -0.5049, -0.1671, -0.8076, -0.8975, -0.0917, 0.6797]], + [17, 0.55, [-0.1732, -0.2542, 0.5425, -0.4189, -0.7910, 0.7544, 0.3892, -0.3232]], + [8, 0.89, [-0.6738, 0.3801, 0.1443, 0.1410, 0.7944, -0.4167, 0.1897, -0.0763]], + [3, 1000, [0.7334, 0.4519, -0.0319, -0.6343, -0.4348, -0.5205, -0.2534, 0.7998]], + # fmt: on + ] + ) + @require_torch_gpu + def test_controlnet_sd15_canny_fp16(self, seed, timestep, expected_slice): + model_id = "takuma104/control_sd15_canny" + controlnet_model = self.get_unet_model(model_id=model_id, subfolder="controlnet").to(torch.float16) + unet_model = self.get_unet_model(model_id=model_id).to(torch.float16) + latents = self.get_latents(seed, shape=(4, 4, 96, 96), fp16=True) + + # for my poor memory environment + controlnet_model.set_attention_slice(1) # TODO: remove + unet_model.set_attention_slice(1) # TODO: remove + + generator = torch.manual_seed(seed) + controlnet_hint = randn_tensor( + (4, 3, 96 * 8, 96 * 8), generator=generator, device=torch.device(torch_device), dtype=torch.float16 + ) + + # encoder_hidden_states = self.get_encoder_hidden_states(seed, shape=(4, 77, 1024), fp16=True) + # TODO: investigate for not accepted (4, 77, 1024) + encoder_hidden_states = randn_tensor( + (4, 77, 768), generator=generator, device=torch.device(torch_device), dtype=torch.float16 + ) + + timestep = torch.tensor([timestep], dtype=torch.long, device=torch_device) + + with torch.no_grad(): + control = controlnet_model( + latents, + timestep=timestep, + encoder_hidden_states=encoder_hidden_states, + controlnet_hint=controlnet_hint, + ) + 
sample = unet_model( + latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states, control=control + ).sample + + assert sample.shape == latents.shape + + output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu() + expected_output_slice = torch.tensor(expected_slice) + + assert torch_all_close(output_slice, expected_output_slice, atol=5e-3) From 79c0ecb062557b0eba94b9262b39e78b4ec066c9 Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Tue, 21 Feb 2023 03:50:43 +0900 Subject: [PATCH 037/122] Ignore down/up_block len check on ControlNet --- src/diffusers/models/unet_2d_condition.py | 2 +- tests/models/test_models_unet_2d_condition.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py index 61b309b1c9e6..c3d936184584 100644 --- a/src/diffusers/models/unet_2d_condition.py +++ b/src/diffusers/models/unet_2d_condition.py @@ -159,7 +159,7 @@ def __init__( self.sample_size = sample_size # Check inputs - if len(down_block_types) != len(up_block_types): + if controlnet_hint_channels is None and len(down_block_types) != len(up_block_types): raise ValueError( f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}." ) diff --git a/tests/models/test_models_unet_2d_condition.py b/tests/models/test_models_unet_2d_condition.py index daae562773cc..95af3f81f7b8 100644 --- a/tests/models/test_models_unet_2d_condition.py +++ b/tests/models/test_models_unet_2d_condition.py @@ -121,7 +121,6 @@ def prepare_init_args_and_inputs_for_controlnet(self): init_dict = { "block_out_channels": (32, 64), "down_block_types": ("CrossAttnDownBlock2D", "DownBlock2D"), - "up_block_types": ("UpBlock2D", "CrossAttnUpBlock2D"), # dummy, to avoid ValueError "cross_attention_dim": 32, "attention_head_dim": 8, "in_channels": 4, From 04f9b8a8347dcdae7086ee6729e2babed7e05946 Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Tue, 21 Feb 2023 23:57:18 +0900 Subject: [PATCH 038/122] add a copy from test_stable_diffusion.py --- .../test_stable_diffusion_controlnet.py | 955 ++++++++++++++++++ 1 file changed, 955 insertions(+) create mode 100644 tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py new file mode 100644 index 000000000000..75e2a44f018f --- /dev/null +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py @@ -0,0 +1,955 @@ +# coding=utf-8 +# Copyright 2022 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import gc +import tempfile +import time +import unittest + +import numpy as np +import torch +from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer + +from diffusers import ( + AutoencoderKL, + DDIMScheduler, + DPMSolverMultistepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + StableDiffusionPipeline, + UNet2DConditionModel, + logging, +) +from diffusers.utils import load_numpy, nightly, slow, torch_device +from diffusers.utils.testing_utils import CaptureLogger, require_torch_gpu + +from ...models.test_models_unet_2d_condition import create_lora_layers +from ...test_pipelines_common import PipelineTesterMixin + + +torch.backends.cuda.matmul.allow_tf32 = False + + +class StableDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase): + pipeline_class = StableDiffusionPipeline + + def get_dummy_components(self): + torch.manual_seed(0) + unet = UNet2DConditionModel( + block_out_channels=(32, 64), + layers_per_block=2, + sample_size=32, + in_channels=4, + out_channels=4, + down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), + up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), + cross_attention_dim=32, + ) + scheduler = DDIMScheduler( + beta_start=0.00085, + beta_end=0.012, + beta_schedule="scaled_linear", + clip_sample=False, + set_alpha_to_one=False, + ) + torch.manual_seed(0) + vae = AutoencoderKL( + block_out_channels=[32, 64], + in_channels=3, + out_channels=3, + down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], + up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], + latent_channels=4, + ) + torch.manual_seed(0) + text_encoder_config = CLIPTextConfig( + bos_token_id=0, + eos_token_id=2, + hidden_size=32, + intermediate_size=37, + layer_norm_eps=1e-05, + num_attention_heads=4, + num_hidden_layers=5, + pad_token_id=1, + vocab_size=1000, + ) + text_encoder = CLIPTextModel(text_encoder_config) + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") + + components = { + "unet": unet, + "scheduler": scheduler, + "vae": vae, + "text_encoder": text_encoder, + "tokenizer": tokenizer, + "safety_checker": None, + "feature_extractor": None, + } + return components + + def get_dummy_inputs(self, device, seed=0): + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = torch.Generator(device=device).manual_seed(seed) + inputs = { + "prompt": "A painting of a squirrel eating a burger", + "generator": generator, + "num_inference_steps": 2, + "guidance_scale": 6.0, + "output_type": "numpy", + } + return inputs + + def test_stable_diffusion_ddim(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + + components = self.get_dummy_components() + sd_pipe = StableDiffusionPipeline(**components) + sd_pipe = sd_pipe.to(torch_device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(device) + output = sd_pipe(**inputs) + image = output.images + + image_slice = image[0, -3:, -3:, -1] + + assert image.shape == (1, 64, 64, 3) + expected_slice = np.array([0.5643, 0.6017, 0.4799, 0.5267, 0.5584, 0.4641, 0.5159, 0.4963, 0.4791]) + + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + + def test_stable_diffusion_lora(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + + components = self.get_dummy_components() + sd_pipe = StableDiffusionPipeline(**components) + sd_pipe = sd_pipe.to(torch_device) + 
sd_pipe.set_progress_bar_config(disable=None) + + # forward 1 + inputs = self.get_dummy_inputs(device) + output = sd_pipe(**inputs) + image = output.images + image_slice = image[0, -3:, -3:, -1] + + # set lora layers + lora_attn_procs = create_lora_layers(sd_pipe.unet) + sd_pipe.unet.set_attn_processor(lora_attn_procs) + sd_pipe = sd_pipe.to(torch_device) + + # forward 2 + inputs = self.get_dummy_inputs(device) + output = sd_pipe(**inputs, cross_attention_kwargs={"scale": 0.0}) + image = output.images + image_slice_1 = image[0, -3:, -3:, -1] + + # forward 3 + inputs = self.get_dummy_inputs(device) + output = sd_pipe(**inputs, cross_attention_kwargs={"scale": 0.5}) + image = output.images + image_slice_2 = image[0, -3:, -3:, -1] + + assert np.abs(image_slice - image_slice_1).max() < 1e-2 + assert np.abs(image_slice - image_slice_2).max() > 1e-2 + + def test_stable_diffusion_prompt_embeds(self): + components = self.get_dummy_components() + sd_pipe = StableDiffusionPipeline(**components) + sd_pipe = sd_pipe.to(torch_device) + sd_pipe = sd_pipe.to(torch_device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(torch_device) + inputs["prompt"] = 3 * [inputs["prompt"]] + + # forward + output = sd_pipe(**inputs) + image_slice_1 = output.images[0, -3:, -3:, -1] + + inputs = self.get_dummy_inputs(torch_device) + prompt = 3 * [inputs.pop("prompt")] + + text_inputs = sd_pipe.tokenizer( + prompt, + padding="max_length", + max_length=sd_pipe.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_inputs = text_inputs["input_ids"].to(torch_device) + + prompt_embeds = sd_pipe.text_encoder(text_inputs)[0] + + inputs["prompt_embeds"] = prompt_embeds + + # forward + output = sd_pipe(**inputs) + image_slice_2 = output.images[0, -3:, -3:, -1] + + assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4 + + def test_stable_diffusion_negative_prompt_embeds(self): + components = self.get_dummy_components() + sd_pipe = StableDiffusionPipeline(**components) + sd_pipe = sd_pipe.to(torch_device) + sd_pipe = sd_pipe.to(torch_device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(torch_device) + negative_prompt = 3 * ["this is a negative prompt"] + inputs["negative_prompt"] = negative_prompt + inputs["prompt"] = 3 * [inputs["prompt"]] + + # forward + output = sd_pipe(**inputs) + image_slice_1 = output.images[0, -3:, -3:, -1] + + inputs = self.get_dummy_inputs(torch_device) + prompt = 3 * [inputs.pop("prompt")] + + embeds = [] + for p in [prompt, negative_prompt]: + text_inputs = sd_pipe.tokenizer( + p, + padding="max_length", + max_length=sd_pipe.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_inputs = text_inputs["input_ids"].to(torch_device) + + embeds.append(sd_pipe.text_encoder(text_inputs)[0]) + + inputs["prompt_embeds"], inputs["negative_prompt_embeds"] = embeds + + # forward + output = sd_pipe(**inputs) + image_slice_2 = output.images[0, -3:, -3:, -1] + + assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4 + + def test_stable_diffusion_ddim_factor_8(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + + components = self.get_dummy_components() + sd_pipe = StableDiffusionPipeline(**components) + sd_pipe = sd_pipe.to(device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(device) + output = sd_pipe(**inputs, height=136, width=136) + image = output.images + + image_slice = 
image[0, -3:, -3:, -1] + + assert image.shape == (1, 136, 136, 3) + expected_slice = np.array([0.5524, 0.5626, 0.6069, 0.4727, 0.386, 0.3995, 0.4613, 0.4328, 0.4269]) + + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + + def test_stable_diffusion_pndm(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + components = self.get_dummy_components() + sd_pipe = StableDiffusionPipeline(**components) + sd_pipe.scheduler = PNDMScheduler(skip_prk_steps=True) + sd_pipe = sd_pipe.to(device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(device) + output = sd_pipe(**inputs) + image = output.images + image_slice = image[0, -3:, -3:, -1] + + assert image.shape == (1, 64, 64, 3) + expected_slice = np.array([0.5094, 0.5674, 0.4667, 0.5125, 0.5696, 0.4674, 0.5277, 0.4964, 0.4945]) + + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + + def test_stable_diffusion_no_safety_checker(self): + pipe = StableDiffusionPipeline.from_pretrained( + "hf-internal-testing/tiny-stable-diffusion-lms-pipe", safety_checker=None + ) + assert isinstance(pipe, StableDiffusionPipeline) + assert isinstance(pipe.scheduler, LMSDiscreteScheduler) + assert pipe.safety_checker is None + + image = pipe("example prompt", num_inference_steps=2).images[0] + assert image is not None + + # check that there's no error when saving a pipeline with one of the models being None + with tempfile.TemporaryDirectory() as tmpdirname: + pipe.save_pretrained(tmpdirname) + pipe = StableDiffusionPipeline.from_pretrained(tmpdirname) + + # sanity check that the pipeline still works + assert pipe.safety_checker is None + image = pipe("example prompt", num_inference_steps=2).images[0] + assert image is not None + + def test_stable_diffusion_k_lms(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + + components = self.get_dummy_components() + sd_pipe = StableDiffusionPipeline(**components) + sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config) + sd_pipe = sd_pipe.to(device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(device) + output = sd_pipe(**inputs) + image = output.images + image_slice = image[0, -3:, -3:, -1] + + assert image.shape == (1, 64, 64, 3) + expected_slice = np.array( + [ + 0.47082293033599854, + 0.5371589064598083, + 0.4562119245529175, + 0.5220914483070374, + 0.5733777284622192, + 0.4795039892196655, + 0.5465868711471558, + 0.5074326395988464, + 0.5042197108268738, + ] + ) + + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + + def test_stable_diffusion_k_euler_ancestral(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + + components = self.get_dummy_components() + sd_pipe = StableDiffusionPipeline(**components) + sd_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(sd_pipe.scheduler.config) + sd_pipe = sd_pipe.to(device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(device) + output = sd_pipe(**inputs) + image = output.images + image_slice = image[0, -3:, -3:, -1] + + assert image.shape == (1, 64, 64, 3) + expected_slice = np.array( + [ + 0.4707113206386566, + 0.5372191071510315, + 0.4563021957874298, + 0.5220003724098206, + 0.5734264850616455, + 0.4794946610927582, + 0.5463782548904419, + 0.5074145197868347, + 0.504422664642334, + ] + ) + + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + + def 
test_stable_diffusion_k_euler(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + + components = self.get_dummy_components() + sd_pipe = StableDiffusionPipeline(**components) + sd_pipe.scheduler = EulerDiscreteScheduler.from_config(sd_pipe.scheduler.config) + sd_pipe = sd_pipe.to(device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(device) + output = sd_pipe(**inputs) + image = output.images + image_slice = image[0, -3:, -3:, -1] + + assert image.shape == (1, 64, 64, 3) + expected_slice = np.array( + [ + 0.47082313895225525, + 0.5371587872505188, + 0.4562119245529175, + 0.5220913887023926, + 0.5733776688575745, + 0.47950395941734314, + 0.546586811542511, + 0.5074326992034912, + 0.5042197108268738, + ] + ) + + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + + def test_stable_diffusion_vae_slicing(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + components = self.get_dummy_components() + components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config) + sd_pipe = StableDiffusionPipeline(**components) + sd_pipe = sd_pipe.to(device) + sd_pipe.set_progress_bar_config(disable=None) + + image_count = 4 + + inputs = self.get_dummy_inputs(device) + inputs["prompt"] = [inputs["prompt"]] * image_count + output_1 = sd_pipe(**inputs) + + # make sure sliced vae decode yields the same result + sd_pipe.enable_vae_slicing() + inputs = self.get_dummy_inputs(device) + inputs["prompt"] = [inputs["prompt"]] * image_count + output_2 = sd_pipe(**inputs) + + # there is a small discrepancy at image borders vs. full batch decode + assert np.abs(output_2.images.flatten() - output_1.images.flatten()).max() < 3e-3 + + def test_stable_diffusion_negative_prompt(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + components = self.get_dummy_components() + components["scheduler"] = PNDMScheduler(skip_prk_steps=True) + sd_pipe = StableDiffusionPipeline(**components) + sd_pipe = sd_pipe.to(device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(device) + negative_prompt = "french fries" + output = sd_pipe(**inputs, negative_prompt=negative_prompt) + + image = output.images + image_slice = image[0, -3:, -3:, -1] + + assert image.shape == (1, 64, 64, 3) + expected_slice = np.array( + [ + 0.5108221173286438, + 0.5688379406929016, + 0.4685141146183014, + 0.5098261833190918, + 0.5657756328582764, + 0.4631010890007019, + 0.5226285457611084, + 0.49129390716552734, + 0.4899061322212219, + ] + ) + + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + + def test_stable_diffusion_num_images_per_prompt(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + components = self.get_dummy_components() + components["scheduler"] = PNDMScheduler(skip_prk_steps=True) + sd_pipe = StableDiffusionPipeline(**components) + sd_pipe = sd_pipe.to(device) + sd_pipe.set_progress_bar_config(disable=None) + + prompt = "A painting of a squirrel eating a burger" + + # test num_images_per_prompt=1 (default) + images = sd_pipe(prompt, num_inference_steps=2, output_type="np").images + + assert images.shape == (1, 64, 64, 3) + + # test num_images_per_prompt=1 (default) for batch of prompts + batch_size = 2 + images = sd_pipe([prompt] * batch_size, num_inference_steps=2, output_type="np").images + + assert images.shape == (batch_size, 64, 64, 3) + + # test num_images_per_prompt for 
single prompt + num_images_per_prompt = 2 + images = sd_pipe( + prompt, num_inference_steps=2, output_type="np", num_images_per_prompt=num_images_per_prompt + ).images + + assert images.shape == (num_images_per_prompt, 64, 64, 3) + + # test num_images_per_prompt for batch of prompts + batch_size = 2 + images = sd_pipe( + [prompt] * batch_size, num_inference_steps=2, output_type="np", num_images_per_prompt=num_images_per_prompt + ).images + + assert images.shape == (batch_size * num_images_per_prompt, 64, 64, 3) + + def test_stable_diffusion_long_prompt(self): + components = self.get_dummy_components() + components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config) + sd_pipe = StableDiffusionPipeline(**components) + sd_pipe = sd_pipe.to(torch_device) + sd_pipe.set_progress_bar_config(disable=None) + + do_classifier_free_guidance = True + negative_prompt = None + num_images_per_prompt = 1 + logger = logging.get_logger("diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion") + + prompt = 25 * "@" + with CaptureLogger(logger) as cap_logger_3: + text_embeddings_3 = sd_pipe._encode_prompt( + prompt, torch_device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt + ) + + prompt = 100 * "@" + with CaptureLogger(logger) as cap_logger: + text_embeddings = sd_pipe._encode_prompt( + prompt, torch_device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt + ) + + negative_prompt = "Hello" + with CaptureLogger(logger) as cap_logger_2: + text_embeddings_2 = sd_pipe._encode_prompt( + prompt, torch_device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt + ) + + assert text_embeddings_3.shape == text_embeddings_2.shape == text_embeddings.shape + assert text_embeddings.shape[1] == 77 + + assert cap_logger.out == cap_logger_2.out + # 100 - 77 + 1 (BOS token) + 1 (EOS token) = 25 + assert cap_logger.out.count("@") == 25 + assert cap_logger_3.out == "" + + def test_stable_diffusion_height_width_opt(self): + components = self.get_dummy_components() + components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config) + sd_pipe = StableDiffusionPipeline(**components) + sd_pipe = sd_pipe.to(torch_device) + sd_pipe.set_progress_bar_config(disable=None) + + prompt = "hey" + + output = sd_pipe(prompt, num_inference_steps=1, output_type="np") + image_shape = output.images[0].shape[:2] + assert image_shape == (64, 64) + + output = sd_pipe(prompt, num_inference_steps=1, height=96, width=96, output_type="np") + image_shape = output.images[0].shape[:2] + assert image_shape == (96, 96) + + config = dict(sd_pipe.unet.config) + config["sample_size"] = 96 + sd_pipe.unet = UNet2DConditionModel.from_config(config).to(torch_device) + output = sd_pipe(prompt, num_inference_steps=1, output_type="np") + image_shape = output.images[0].shape[:2] + assert image_shape == (192, 192) + + +@slow +@require_torch_gpu +class StableDiffusionPipelineSlowTests(unittest.TestCase): + def tearDown(self): + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): + generator = torch.Generator(device=generator_device).manual_seed(seed) + latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64)) + latents = torch.from_numpy(latents).to(device=device, dtype=dtype) + inputs = { + "prompt": "a photograph of an astronaut riding a horse", + "latents": latents, + "generator": generator, + "num_inference_steps": 3, + "guidance_scale": 7.5, + 
"output_type": "numpy", + } + return inputs + + def test_stable_diffusion_1_1_pndm(self): + sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-1") + sd_pipe = sd_pipe.to(torch_device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_inputs(torch_device) + image = sd_pipe(**inputs).images + image_slice = image[0, -3:, -3:, -1].flatten() + + assert image.shape == (1, 512, 512, 3) + expected_slice = np.array([0.43625, 0.43554, 0.36670, 0.40660, 0.39703, 0.38658, 0.43936, 0.43557, 0.40592]) + assert np.abs(image_slice - expected_slice).max() < 1e-4 + + def test_stable_diffusion_1_4_pndm(self): + sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4") + sd_pipe = sd_pipe.to(torch_device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_inputs(torch_device) + image = sd_pipe(**inputs).images + image_slice = image[0, -3:, -3:, -1].flatten() + + assert image.shape == (1, 512, 512, 3) + expected_slice = np.array([0.57400, 0.47841, 0.31625, 0.63583, 0.58306, 0.55056, 0.50825, 0.56306, 0.55748]) + assert np.abs(image_slice - expected_slice).max() < 1e-4 + + def test_stable_diffusion_ddim(self): + sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None) + sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config) + sd_pipe = sd_pipe.to(torch_device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_inputs(torch_device) + image = sd_pipe(**inputs).images + image_slice = image[0, -3:, -3:, -1].flatten() + + assert image.shape == (1, 512, 512, 3) + expected_slice = np.array([0.38019, 0.28647, 0.27321, 0.40377, 0.38290, 0.35446, 0.39218, 0.38165, 0.42239]) + assert np.abs(image_slice - expected_slice).max() < 1e-4 + + def test_stable_diffusion_lms(self): + sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None) + sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config) + sd_pipe = sd_pipe.to(torch_device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_inputs(torch_device) + image = sd_pipe(**inputs).images + image_slice = image[0, -3:, -3:, -1].flatten() + + assert image.shape == (1, 512, 512, 3) + expected_slice = np.array([0.10542, 0.09620, 0.07332, 0.09015, 0.09382, 0.07597, 0.08496, 0.07806, 0.06455]) + assert np.abs(image_slice - expected_slice).max() < 1e-4 + + def test_stable_diffusion_dpm(self): + sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None) + sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config) + sd_pipe = sd_pipe.to(torch_device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_inputs(torch_device) + image = sd_pipe(**inputs).images + image_slice = image[0, -3:, -3:, -1].flatten() + + assert image.shape == (1, 512, 512, 3) + expected_slice = np.array([0.03503, 0.03494, 0.01087, 0.03128, 0.02552, 0.00803, 0.00742, 0.00372, 0.00000]) + assert np.abs(image_slice - expected_slice).max() < 1e-4 + + def test_stable_diffusion_attention_slicing(self): + torch.cuda.reset_peak_memory_stats() + pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + + # enable attention slicing + pipe.enable_attention_slicing() + inputs = self.get_inputs(torch_device, dtype=torch.float16) + image_sliced = 
pipe(**inputs).images + + mem_bytes = torch.cuda.max_memory_allocated() + torch.cuda.reset_peak_memory_stats() + # make sure that less than 3.75 GB is allocated + assert mem_bytes < 3.75 * 10**9 + + # disable slicing + pipe.disable_attention_slicing() + inputs = self.get_inputs(torch_device, dtype=torch.float16) + image = pipe(**inputs).images + + # make sure that more than 3.75 GB is allocated + mem_bytes = torch.cuda.max_memory_allocated() + assert mem_bytes > 3.75 * 10**9 + assert np.abs(image_sliced - image).max() < 1e-3 + + def test_stable_diffusion_vae_slicing(self): + torch.cuda.reset_peak_memory_stats() + pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + pipe.enable_attention_slicing() + + # enable vae slicing + pipe.enable_vae_slicing() + inputs = self.get_inputs(torch_device, dtype=torch.float16) + inputs["prompt"] = [inputs["prompt"]] * 4 + inputs["latents"] = torch.cat([inputs["latents"]] * 4) + image_sliced = pipe(**inputs).images + + mem_bytes = torch.cuda.max_memory_allocated() + torch.cuda.reset_peak_memory_stats() + # make sure that less than 4 GB is allocated + assert mem_bytes < 4e9 + + # disable vae slicing + pipe.disable_vae_slicing() + inputs = self.get_inputs(torch_device, dtype=torch.float16) + inputs["prompt"] = [inputs["prompt"]] * 4 + inputs["latents"] = torch.cat([inputs["latents"]] * 4) + image = pipe(**inputs).images + + # make sure that more than 4 GB is allocated + mem_bytes = torch.cuda.max_memory_allocated() + assert mem_bytes > 4e9 + # There is a small discrepancy at the image borders vs. a fully batched version. + assert np.abs(image_sliced - image).max() < 1e-2 + + def test_stable_diffusion_fp16_vs_autocast(self): + # this test makes sure that the original model with autocast + # and the new model with fp16 yield the same result + pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + + inputs = self.get_inputs(torch_device, dtype=torch.float16) + image_fp16 = pipe(**inputs).images + + with torch.autocast(torch_device): + inputs = self.get_inputs(torch_device) + image_autocast = pipe(**inputs).images + + # Make sure results are close enough + diff = np.abs(image_fp16.flatten() - image_autocast.flatten()) + # They ARE different since ops are not run always at the same precision + # however, they should be extremely close. 
+ assert diff.mean() < 2e-2 + + def test_stable_diffusion_intermediate_state(self): + number_of_steps = 0 + + def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: + callback_fn.has_been_called = True + nonlocal number_of_steps + number_of_steps += 1 + if step == 1: + latents = latents.detach().cpu().numpy() + assert latents.shape == (1, 4, 64, 64) + latents_slice = latents[0, -3:, -3:, -1] + expected_slice = np.array( + [-0.5693, -0.3018, -0.9746, 0.0518, -0.8770, 0.7559, -1.7402, 0.1022, 1.1582] + ) + + assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2 + elif step == 2: + latents = latents.detach().cpu().numpy() + assert latents.shape == (1, 4, 64, 64) + latents_slice = latents[0, -3:, -3:, -1] + expected_slice = np.array( + [-0.1958, -0.2993, -1.0166, -0.5005, -0.4810, 0.6162, -0.9492, 0.6621, 1.4492] + ) + + assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2 + + callback_fn.has_been_called = False + + pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + pipe.enable_attention_slicing() + + inputs = self.get_inputs(torch_device, dtype=torch.float16) + pipe(**inputs, callback=callback_fn, callback_steps=1) + assert callback_fn.has_been_called + assert number_of_steps == inputs["num_inference_steps"] + + def test_stable_diffusion_low_cpu_mem_usage(self): + pipeline_id = "CompVis/stable-diffusion-v1-4" + + start_time = time.time() + pipeline_low_cpu_mem_usage = StableDiffusionPipeline.from_pretrained(pipeline_id, torch_dtype=torch.float16) + pipeline_low_cpu_mem_usage.to(torch_device) + low_cpu_mem_usage_time = time.time() - start_time + + start_time = time.time() + _ = StableDiffusionPipeline.from_pretrained(pipeline_id, torch_dtype=torch.float16, low_cpu_mem_usage=False) + normal_load_time = time.time() - start_time + + assert 2 * low_cpu_mem_usage_time < normal_load_time + + def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self): + torch.cuda.empty_cache() + torch.cuda.reset_max_memory_allocated() + torch.cuda.reset_peak_memory_stats() + + pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + pipe.enable_attention_slicing(1) + pipe.enable_sequential_cpu_offload() + + inputs = self.get_inputs(torch_device, dtype=torch.float16) + _ = pipe(**inputs) + + mem_bytes = torch.cuda.max_memory_allocated() + # make sure that less than 2.8 GB is allocated + assert mem_bytes < 2.8 * 10**9 + + def test_stable_diffusion_pipeline_with_model_offloading(self): + torch.cuda.empty_cache() + torch.cuda.reset_max_memory_allocated() + torch.cuda.reset_peak_memory_stats() + + inputs = self.get_inputs(torch_device, dtype=torch.float16) + + # Normal inference + + pipe = StableDiffusionPipeline.from_pretrained( + "CompVis/stable-diffusion-v1-4", + torch_dtype=torch.float16, + ) + pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + outputs = pipe(**inputs) + mem_bytes = torch.cuda.max_memory_allocated() + + # With model offloading + + # Reload but don't move to cuda + pipe = StableDiffusionPipeline.from_pretrained( + "CompVis/stable-diffusion-v1-4", + torch_dtype=torch.float16, + ) + + torch.cuda.empty_cache() + torch.cuda.reset_max_memory_allocated() + torch.cuda.reset_peak_memory_stats() + + pipe.enable_model_cpu_offload() + 
pipe.set_progress_bar_config(disable=None) + outputs_offloaded = pipe(**inputs) + mem_bytes_offloaded = torch.cuda.max_memory_allocated() + + assert np.abs(outputs.images - outputs_offloaded.images).max() < 1e-3 + assert mem_bytes_offloaded < mem_bytes + assert mem_bytes_offloaded < 3.5 * 10**9 + for module in pipe.text_encoder, pipe.unet, pipe.vae, pipe.safety_checker: + assert module.device == torch.device("cpu") + + # With attention slicing + torch.cuda.empty_cache() + torch.cuda.reset_max_memory_allocated() + torch.cuda.reset_peak_memory_stats() + + pipe.enable_attention_slicing() + _ = pipe(**inputs) + mem_bytes_slicing = torch.cuda.max_memory_allocated() + + assert mem_bytes_slicing < mem_bytes_offloaded + assert mem_bytes_slicing < 3 * 10**9 + + +@nightly +@require_torch_gpu +class StableDiffusionPipelineNightlyTests(unittest.TestCase): + def tearDown(self): + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): + generator = torch.Generator(device=generator_device).manual_seed(seed) + latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64)) + latents = torch.from_numpy(latents).to(device=device, dtype=dtype) + inputs = { + "prompt": "a photograph of an astronaut riding a horse", + "latents": latents, + "generator": generator, + "num_inference_steps": 50, + "guidance_scale": 7.5, + "output_type": "numpy", + } + return inputs + + def test_stable_diffusion_1_4_pndm(self): + sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_inputs(torch_device) + image = sd_pipe(**inputs).images[0] + + expected_image = load_numpy( + "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" + "/stable_diffusion_text2img/stable_diffusion_1_4_pndm.npy" + ) + max_diff = np.abs(expected_image - image).max() + assert max_diff < 1e-3 + + def test_stable_diffusion_1_5_pndm(self): + sd_pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5").to(torch_device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_inputs(torch_device) + image = sd_pipe(**inputs).images[0] + + expected_image = load_numpy( + "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" + "/stable_diffusion_text2img/stable_diffusion_1_5_pndm.npy" + ) + max_diff = np.abs(expected_image - image).max() + assert max_diff < 1e-3 + + def test_stable_diffusion_ddim(self): + sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device) + sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_inputs(torch_device) + image = sd_pipe(**inputs).images[0] + + expected_image = load_numpy( + "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" + "/stable_diffusion_text2img/stable_diffusion_1_4_ddim.npy" + ) + max_diff = np.abs(expected_image - image).max() + assert max_diff < 1e-3 + + def test_stable_diffusion_lms(self): + sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device) + sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_inputs(torch_device) + image = sd_pipe(**inputs).images[0] + + expected_image = load_numpy( + "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" + 
"/stable_diffusion_text2img/stable_diffusion_1_4_lms.npy" + ) + max_diff = np.abs(expected_image - image).max() + assert max_diff < 1e-3 + + def test_stable_diffusion_euler(self): + sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device) + sd_pipe.scheduler = EulerDiscreteScheduler.from_config(sd_pipe.scheduler.config) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_inputs(torch_device) + image = sd_pipe(**inputs).images[0] + + expected_image = load_numpy( + "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" + "/stable_diffusion_text2img/stable_diffusion_1_4_euler.npy" + ) + max_diff = np.abs(expected_image - image).max() + assert max_diff < 1e-3 + + def test_stable_diffusion_dpm(self): + sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device) + sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_inputs(torch_device) + inputs["num_inference_steps"] = 25 + image = sd_pipe(**inputs).images[0] + + expected_image = load_numpy( + "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" + "/stable_diffusion_text2img/stable_diffusion_1_4_dpm_multi.npy" + ) + max_diff = np.abs(expected_image - image).max() + assert max_diff < 1e-3 From fe82f10e79c9239b856be265b210ae0620704c1e Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Wed, 22 Feb 2023 01:12:24 +0900 Subject: [PATCH 039/122] Accept controlnet_hint is None --- .../stable_diffusion/pipeline_stable_diffusion_controlnet.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py index e77348c3d016..235cb38426cf 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py @@ -395,6 +395,8 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype return latents def controlnet_hint_conversion(self, controlnet_hint, height, width, num_images_per_prompt): + if controlnet_hint is None: + return None channels = 3 if isinstance(controlnet_hint, torch.Tensor): # torch.Tensor: acceptble shape are any of chw, bchw(b==1) or bchw(b==num_images_per_prompt) From 1c7d311ec6febb33168756620a3f81a761f303e1 Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Wed, 22 Feb 2023 01:19:57 +0900 Subject: [PATCH 040/122] merge pipeline_stable_diffusion.py diff --- .../pipeline_stable_diffusion_controlnet.py | 110 +++++++++++++++++- 1 file changed, 107 insertions(+), 3 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py index 235cb38426cf..812eba24e5bd 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py @@ -19,11 +19,20 @@ import numpy as np import PIL.Image import torch +from packaging import version from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer +from ...configuration_utils import FrozenDict from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers -from ...utils import is_accelerate_available, 
logging, randn_tensor, replace_example_docstring +from ...utils import ( + deprecate, + is_accelerate_available, + is_accelerate_version, + logging, + randn_tensor, + replace_example_docstring, +) from ..pipeline_utils import DiffusionPipeline from . import StableDiffusionPipelineOutput from .safety_checker import StableDiffusionSafetyChecker @@ -76,6 +85,7 @@ class StableDiffusionControlNetPipeline(DiffusionPipeline): feature_extractor ([`CLIPFeatureExtractor`]): Model that extracts features from generated images to be used as inputs for the `safety_checker`. """ + _optional_components = ["safety_checker", "feature_extractor"] def __init__( self, @@ -91,6 +101,70 @@ def __init__( ): super().__init__() + if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: + deprecation_message = ( + f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" + f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " + "to update the config accordingly as leaving `steps_offset` might led to incorrect results" + " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," + " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" + " file" + ) + deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(scheduler.config) + new_config["steps_offset"] = 1 + scheduler._internal_dict = FrozenDict(new_config) + + if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True: + deprecation_message = ( + f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`." + " `clip_sample` should be set to False in the configuration file. Please make sure to update the" + " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in" + " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very" + " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file" + ) + deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(scheduler.config) + new_config["clip_sample"] = False + scheduler._internal_dict = FrozenDict(new_config) + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." 
+ ) + + is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse( + version.parse(unet.config._diffusers_version).base_version + ) < version.parse("0.9.0.dev0") + is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 + if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: + deprecation_message = ( + "The configuration file of the unet has set the default `sample_size` to smaller than" + " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the" + " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-" + " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5" + " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the" + " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`" + " in the config might lead to incorrect results in future versions. If you have downloaded this" + " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" + " the `unet/config.json` file" + ) + deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(unet.config) + new_config["sample_size"] = 64 + unet._internal_dict = FrozenDict(new_config) + self.register_modules( vae=vae, text_encoder=text_encoder, @@ -125,6 +199,8 @@ def enable_sequential_cpu_offload(self, gpu_id=0): Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet, text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called. + Note that offloading happens on a submodule basis. Memory savings are higher than with + `enable_model_cpu_offload`, but performance is lower. """ if is_accelerate_available(): from accelerate import cpu_offload @@ -139,6 +215,30 @@ def enable_sequential_cpu_offload(self, gpu_id=0): if self.safety_checker is not None: cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True) + def enable_model_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared + to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` + method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with + `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. + """ + if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): + from accelerate import cpu_offload_with_hook + else: + raise ImportError("`enable_model_offload` requires `accelerate v0.17.0` or higher.") + + device = torch.device(f"cuda:{gpu_id}") + + hook = None + for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]: + _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) + + if self.safety_checker is not None: + _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) + + # We'll offload the last model manually. 
+ self.final_offload_hook = hook + @property def _execution_device(self): r""" @@ -146,7 +246,7 @@ def _execution_device(self): `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module hooks. """ - if self.device != torch.device("meta") or not hasattr(self.unet, "_hf_hook"): + if not hasattr(self.unet, "_hf_hook"): return self.device for module in self.unet.modules(): if ( @@ -471,7 +571,7 @@ def __call__( output_type: Optional[str] = "pil", return_dict: bool = True, callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, - callback_steps: Optional[int] = 1, + callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, controlnet_hint: Optional[Union[torch.FloatTensor, np.ndarray, PIL.Image.Image]] = None, ): @@ -668,6 +768,10 @@ def __call__( # 9. Run safety checker image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + # Offload last model to CPU + if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: + self.final_offload_hook.offload() + if not return_dict: return (image, has_nsfw_concept) From e492e9d266fb7a335830a0702e46114dc3d3f772 Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Wed, 22 Feb 2023 01:34:59 +0900 Subject: [PATCH 041/122] Update class name to SDControlNetPipeline --- .../test_stable_diffusion_controlnet.py | 92 ++++++++++--------- 1 file changed, 51 insertions(+), 41 deletions(-) diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py index 75e2a44f018f..4716056568bf 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
- import gc import tempfile import time @@ -31,7 +30,7 @@ EulerDiscreteScheduler, LMSDiscreteScheduler, PNDMScheduler, - StableDiffusionPipeline, + StableDiffusionControlNetPipeline, UNet2DConditionModel, logging, ) @@ -45,8 +44,8 @@ torch.backends.cuda.matmul.allow_tf32 = False -class StableDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = StableDiffusionPipeline +class StableDiffusionControlNetPipelineFastTests(PipelineTesterMixin, unittest.TestCase): + pipeline_class = StableDiffusionControlNetPipeline def get_dummy_components(self): torch.manual_seed(0) @@ -60,6 +59,15 @@ def get_dummy_components(self): up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), cross_attention_dim=32, ) + controlnet = UNet2DConditionModel( + block_out_channels=(32, 64), + layers_per_block=2, + sample_size=32, + in_channels=4, + down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), + cross_attention_dim=32, + controlnet_hint_channels=3, + ) scheduler = DDIMScheduler( beta_start=0.00085, beta_end=0.012, @@ -93,6 +101,7 @@ def get_dummy_components(self): components = { "unet": unet, + "controlnet": controlnet, "scheduler": scheduler, "vae": vae, "text_encoder": text_encoder, @@ -110,6 +119,7 @@ def get_dummy_inputs(self, device, seed=0): inputs = { "prompt": "A painting of a squirrel eating a burger", "generator": generator, + # "controlnet_hint": torch.randn((1, 3, 64, 64)), "num_inference_steps": 2, "guidance_scale": 6.0, "output_type": "numpy", @@ -120,7 +130,7 @@ def test_stable_diffusion_ddim(self): device = "cpu" # ensure determinism for the device-dependent torch.Generator components = self.get_dummy_components() - sd_pipe = StableDiffusionPipeline(**components) + sd_pipe = StableDiffusionControlNetPipeline(**components) sd_pipe = sd_pipe.to(torch_device) sd_pipe.set_progress_bar_config(disable=None) @@ -139,7 +149,7 @@ def test_stable_diffusion_lora(self): device = "cpu" # ensure determinism for the device-dependent torch.Generator components = self.get_dummy_components() - sd_pipe = StableDiffusionPipeline(**components) + sd_pipe = StableDiffusionControlNetPipeline(**components) sd_pipe = sd_pipe.to(torch_device) sd_pipe.set_progress_bar_config(disable=None) @@ -171,7 +181,7 @@ def test_stable_diffusion_lora(self): def test_stable_diffusion_prompt_embeds(self): components = self.get_dummy_components() - sd_pipe = StableDiffusionPipeline(**components) + sd_pipe = StableDiffusionControlNetPipeline(**components) sd_pipe = sd_pipe.to(torch_device) sd_pipe = sd_pipe.to(torch_device) sd_pipe.set_progress_bar_config(disable=None) @@ -207,7 +217,7 @@ def test_stable_diffusion_prompt_embeds(self): def test_stable_diffusion_negative_prompt_embeds(self): components = self.get_dummy_components() - sd_pipe = StableDiffusionPipeline(**components) + sd_pipe = StableDiffusionControlNetPipeline(**components) sd_pipe = sd_pipe.to(torch_device) sd_pipe = sd_pipe.to(torch_device) sd_pipe.set_progress_bar_config(disable=None) @@ -249,7 +259,7 @@ def test_stable_diffusion_ddim_factor_8(self): device = "cpu" # ensure determinism for the device-dependent torch.Generator components = self.get_dummy_components() - sd_pipe = StableDiffusionPipeline(**components) + sd_pipe = StableDiffusionControlNetPipeline(**components) sd_pipe = sd_pipe.to(device) sd_pipe.set_progress_bar_config(disable=None) @@ -267,7 +277,7 @@ def test_stable_diffusion_ddim_factor_8(self): def test_stable_diffusion_pndm(self): device = "cpu" # ensure determinism for the device-dependent torch.Generator components = 
self.get_dummy_components() - sd_pipe = StableDiffusionPipeline(**components) + sd_pipe = StableDiffusionControlNetPipeline(**components) sd_pipe.scheduler = PNDMScheduler(skip_prk_steps=True) sd_pipe = sd_pipe.to(device) sd_pipe.set_progress_bar_config(disable=None) @@ -283,10 +293,10 @@ def test_stable_diffusion_pndm(self): assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 def test_stable_diffusion_no_safety_checker(self): - pipe = StableDiffusionPipeline.from_pretrained( + pipe = StableDiffusionControlNetPipeline.from_pretrained( "hf-internal-testing/tiny-stable-diffusion-lms-pipe", safety_checker=None ) - assert isinstance(pipe, StableDiffusionPipeline) + assert isinstance(pipe, StableDiffusionControlNetPipeline) assert isinstance(pipe.scheduler, LMSDiscreteScheduler) assert pipe.safety_checker is None @@ -296,7 +306,7 @@ def test_stable_diffusion_no_safety_checker(self): # check that there's no error when saving a pipeline with one of the models being None with tempfile.TemporaryDirectory() as tmpdirname: pipe.save_pretrained(tmpdirname) - pipe = StableDiffusionPipeline.from_pretrained(tmpdirname) + pipe = StableDiffusionControlNetPipeline.from_pretrained(tmpdirname) # sanity check that the pipeline still works assert pipe.safety_checker is None @@ -307,7 +317,7 @@ def test_stable_diffusion_k_lms(self): device = "cpu" # ensure determinism for the device-dependent torch.Generator components = self.get_dummy_components() - sd_pipe = StableDiffusionPipeline(**components) + sd_pipe = StableDiffusionControlNetPipeline(**components) sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config) sd_pipe = sd_pipe.to(device) sd_pipe.set_progress_bar_config(disable=None) @@ -338,7 +348,7 @@ def test_stable_diffusion_k_euler_ancestral(self): device = "cpu" # ensure determinism for the device-dependent torch.Generator components = self.get_dummy_components() - sd_pipe = StableDiffusionPipeline(**components) + sd_pipe = StableDiffusionControlNetPipeline(**components) sd_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(sd_pipe.scheduler.config) sd_pipe = sd_pipe.to(device) sd_pipe.set_progress_bar_config(disable=None) @@ -369,7 +379,7 @@ def test_stable_diffusion_k_euler(self): device = "cpu" # ensure determinism for the device-dependent torch.Generator components = self.get_dummy_components() - sd_pipe = StableDiffusionPipeline(**components) + sd_pipe = StableDiffusionControlNetPipeline(**components) sd_pipe.scheduler = EulerDiscreteScheduler.from_config(sd_pipe.scheduler.config) sd_pipe = sd_pipe.to(device) sd_pipe.set_progress_bar_config(disable=None) @@ -400,7 +410,7 @@ def test_stable_diffusion_vae_slicing(self): device = "cpu" # ensure determinism for the device-dependent torch.Generator components = self.get_dummy_components() components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config) - sd_pipe = StableDiffusionPipeline(**components) + sd_pipe = StableDiffusionControlNetPipeline(**components) sd_pipe = sd_pipe.to(device) sd_pipe.set_progress_bar_config(disable=None) @@ -423,7 +433,7 @@ def test_stable_diffusion_negative_prompt(self): device = "cpu" # ensure determinism for the device-dependent torch.Generator components = self.get_dummy_components() components["scheduler"] = PNDMScheduler(skip_prk_steps=True) - sd_pipe = StableDiffusionPipeline(**components) + sd_pipe = StableDiffusionControlNetPipeline(**components) sd_pipe = sd_pipe.to(device) sd_pipe.set_progress_bar_config(disable=None) @@ -455,7 +465,7 @@ def 
test_stable_diffusion_num_images_per_prompt(self): device = "cpu" # ensure determinism for the device-dependent torch.Generator components = self.get_dummy_components() components["scheduler"] = PNDMScheduler(skip_prk_steps=True) - sd_pipe = StableDiffusionPipeline(**components) + sd_pipe = StableDiffusionControlNetPipeline(**components) sd_pipe = sd_pipe.to(device) sd_pipe.set_progress_bar_config(disable=None) @@ -491,7 +501,7 @@ def test_stable_diffusion_num_images_per_prompt(self): def test_stable_diffusion_long_prompt(self): components = self.get_dummy_components() components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config) - sd_pipe = StableDiffusionPipeline(**components) + sd_pipe = StableDiffusionControlNetPipeline(**components) sd_pipe = sd_pipe.to(torch_device) sd_pipe.set_progress_bar_config(disable=None) @@ -529,7 +539,7 @@ def test_stable_diffusion_long_prompt(self): def test_stable_diffusion_height_width_opt(self): components = self.get_dummy_components() components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config) - sd_pipe = StableDiffusionPipeline(**components) + sd_pipe = StableDiffusionControlNetPipeline(**components) sd_pipe = sd_pipe.to(torch_device) sd_pipe.set_progress_bar_config(disable=None) @@ -574,7 +584,7 @@ def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0 return inputs def test_stable_diffusion_1_1_pndm(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-1") + sd_pipe = StableDiffusionControlNetPipeline.from_pretrained("CompVis/stable-diffusion-v1-1") sd_pipe = sd_pipe.to(torch_device) sd_pipe.set_progress_bar_config(disable=None) @@ -587,7 +597,7 @@ def test_stable_diffusion_1_1_pndm(self): assert np.abs(image_slice - expected_slice).max() < 1e-4 def test_stable_diffusion_1_4_pndm(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4") + sd_pipe = StableDiffusionControlNetPipeline.from_pretrained("CompVis/stable-diffusion-v1-4") sd_pipe = sd_pipe.to(torch_device) sd_pipe.set_progress_bar_config(disable=None) @@ -600,7 +610,7 @@ def test_stable_diffusion_1_4_pndm(self): assert np.abs(image_slice - expected_slice).max() < 1e-4 def test_stable_diffusion_ddim(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None) + sd_pipe = StableDiffusionControlNetPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None) sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config) sd_pipe = sd_pipe.to(torch_device) sd_pipe.set_progress_bar_config(disable=None) @@ -614,7 +624,7 @@ def test_stable_diffusion_ddim(self): assert np.abs(image_slice - expected_slice).max() < 1e-4 def test_stable_diffusion_lms(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None) + sd_pipe = StableDiffusionControlNetPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None) sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config) sd_pipe = sd_pipe.to(torch_device) sd_pipe.set_progress_bar_config(disable=None) @@ -628,7 +638,7 @@ def test_stable_diffusion_lms(self): assert np.abs(image_slice - expected_slice).max() < 1e-4 def test_stable_diffusion_dpm(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None) + sd_pipe = StableDiffusionControlNetPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", 
safety_checker=None) sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config) sd_pipe = sd_pipe.to(torch_device) sd_pipe.set_progress_bar_config(disable=None) @@ -643,7 +653,7 @@ def test_stable_diffusion_dpm(self): def test_stable_diffusion_attention_slicing(self): torch.cuda.reset_peak_memory_stats() - pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16) + pipe = StableDiffusionControlNetPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16) pipe = pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) @@ -669,7 +679,7 @@ def test_stable_diffusion_attention_slicing(self): def test_stable_diffusion_vae_slicing(self): torch.cuda.reset_peak_memory_stats() - pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16) + pipe = StableDiffusionControlNetPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16) pipe = pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() @@ -702,7 +712,7 @@ def test_stable_diffusion_vae_slicing(self): def test_stable_diffusion_fp16_vs_autocast(self): # this test makes sure that the original model with autocast # and the new model with fp16 yield the same result - pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16) + pipe = StableDiffusionControlNetPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16) pipe = pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) @@ -747,7 +757,7 @@ def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: callback_fn.has_been_called = False - pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16) + pipe = StableDiffusionControlNetPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16) pipe = pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() @@ -761,12 +771,12 @@ def test_stable_diffusion_low_cpu_mem_usage(self): pipeline_id = "CompVis/stable-diffusion-v1-4" start_time = time.time() - pipeline_low_cpu_mem_usage = StableDiffusionPipeline.from_pretrained(pipeline_id, torch_dtype=torch.float16) + pipeline_low_cpu_mem_usage = StableDiffusionControlNetPipeline.from_pretrained(pipeline_id, torch_dtype=torch.float16) pipeline_low_cpu_mem_usage.to(torch_device) low_cpu_mem_usage_time = time.time() - start_time start_time = time.time() - _ = StableDiffusionPipeline.from_pretrained(pipeline_id, torch_dtype=torch.float16, low_cpu_mem_usage=False) + _ = StableDiffusionControlNetPipeline.from_pretrained(pipeline_id, torch_dtype=torch.float16, low_cpu_mem_usage=False) normal_load_time = time.time() - start_time assert 2 * low_cpu_mem_usage_time < normal_load_time @@ -776,7 +786,7 @@ def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self): torch.cuda.reset_max_memory_allocated() torch.cuda.reset_peak_memory_stats() - pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16) + pipe = StableDiffusionControlNetPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16) pipe = pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing(1) @@ -798,7 +808,7 @@ def test_stable_diffusion_pipeline_with_model_offloading(self): # Normal inference - pipe = 
StableDiffusionPipeline.from_pretrained( + pipe = StableDiffusionControlNetPipeline.from_pretrained( "CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16, ) @@ -810,7 +820,7 @@ def test_stable_diffusion_pipeline_with_model_offloading(self): # With model offloading # Reload but don't move to cuda - pipe = StableDiffusionPipeline.from_pretrained( + pipe = StableDiffusionControlNetPipeline.from_pretrained( "CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16, ) @@ -866,7 +876,7 @@ def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0 return inputs def test_stable_diffusion_1_4_pndm(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device) + sd_pipe = StableDiffusionControlNetPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device) sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs(torch_device) @@ -880,7 +890,7 @@ def test_stable_diffusion_1_4_pndm(self): assert max_diff < 1e-3 def test_stable_diffusion_1_5_pndm(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5").to(torch_device) + sd_pipe = StableDiffusionControlNetPipeline.from_pretrained("runwayml/stable-diffusion-v1-5").to(torch_device) sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs(torch_device) @@ -894,7 +904,7 @@ def test_stable_diffusion_1_5_pndm(self): assert max_diff < 1e-3 def test_stable_diffusion_ddim(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device) + sd_pipe = StableDiffusionControlNetPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device) sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config) sd_pipe.set_progress_bar_config(disable=None) @@ -909,7 +919,7 @@ def test_stable_diffusion_ddim(self): assert max_diff < 1e-3 def test_stable_diffusion_lms(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device) + sd_pipe = StableDiffusionControlNetPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device) sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config) sd_pipe.set_progress_bar_config(disable=None) @@ -924,7 +934,7 @@ def test_stable_diffusion_lms(self): assert max_diff < 1e-3 def test_stable_diffusion_euler(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device) + sd_pipe = StableDiffusionControlNetPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device) sd_pipe.scheduler = EulerDiscreteScheduler.from_config(sd_pipe.scheduler.config) sd_pipe.set_progress_bar_config(disable=None) @@ -939,7 +949,7 @@ def test_stable_diffusion_euler(self): assert max_diff < 1e-3 def test_stable_diffusion_dpm(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device) + sd_pipe = StableDiffusionControlNetPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device) sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config) sd_pipe.set_progress_bar_config(disable=None) From 2eab486b07c59007b183d5dfeb8cbdf1d6dff0f6 Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Wed, 22 Feb 2023 01:35:24 +0900 Subject: [PATCH 042/122] make style --- .../test_stable_diffusion_controlnet.py | 40 ++++++++++++++----- 1 file changed, 30 insertions(+), 10 deletions(-) diff --git 
a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py index 4716056568bf..b2f5742e377d 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py @@ -610,7 +610,9 @@ def test_stable_diffusion_1_4_pndm(self): assert np.abs(image_slice - expected_slice).max() < 1e-4 def test_stable_diffusion_ddim(self): - sd_pipe = StableDiffusionControlNetPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None) + sd_pipe = StableDiffusionControlNetPipeline.from_pretrained( + "CompVis/stable-diffusion-v1-4", safety_checker=None + ) sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config) sd_pipe = sd_pipe.to(torch_device) sd_pipe.set_progress_bar_config(disable=None) @@ -624,7 +626,9 @@ def test_stable_diffusion_ddim(self): assert np.abs(image_slice - expected_slice).max() < 1e-4 def test_stable_diffusion_lms(self): - sd_pipe = StableDiffusionControlNetPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None) + sd_pipe = StableDiffusionControlNetPipeline.from_pretrained( + "CompVis/stable-diffusion-v1-4", safety_checker=None + ) sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config) sd_pipe = sd_pipe.to(torch_device) sd_pipe.set_progress_bar_config(disable=None) @@ -638,7 +642,9 @@ def test_stable_diffusion_lms(self): assert np.abs(image_slice - expected_slice).max() < 1e-4 def test_stable_diffusion_dpm(self): - sd_pipe = StableDiffusionControlNetPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None) + sd_pipe = StableDiffusionControlNetPipeline.from_pretrained( + "CompVis/stable-diffusion-v1-4", safety_checker=None + ) sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config) sd_pipe = sd_pipe.to(torch_device) sd_pipe.set_progress_bar_config(disable=None) @@ -653,7 +659,9 @@ def test_stable_diffusion_dpm(self): def test_stable_diffusion_attention_slicing(self): torch.cuda.reset_peak_memory_stats() - pipe = StableDiffusionControlNetPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16) + pipe = StableDiffusionControlNetPipeline.from_pretrained( + "CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16 + ) pipe = pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) @@ -679,7 +687,9 @@ def test_stable_diffusion_attention_slicing(self): def test_stable_diffusion_vae_slicing(self): torch.cuda.reset_peak_memory_stats() - pipe = StableDiffusionControlNetPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16) + pipe = StableDiffusionControlNetPipeline.from_pretrained( + "CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16 + ) pipe = pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() @@ -712,7 +722,9 @@ def test_stable_diffusion_vae_slicing(self): def test_stable_diffusion_fp16_vs_autocast(self): # this test makes sure that the original model with autocast # and the new model with fp16 yield the same result - pipe = StableDiffusionControlNetPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16) + pipe = StableDiffusionControlNetPipeline.from_pretrained( + "CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16 + ) pipe = pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) @@ -757,7 +769,9 @@ def callback_fn(step: int, timestep: int, 
latents: torch.FloatTensor) -> None: callback_fn.has_been_called = False - pipe = StableDiffusionControlNetPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16) + pipe = StableDiffusionControlNetPipeline.from_pretrained( + "CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16 + ) pipe = pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() @@ -771,12 +785,16 @@ def test_stable_diffusion_low_cpu_mem_usage(self): pipeline_id = "CompVis/stable-diffusion-v1-4" start_time = time.time() - pipeline_low_cpu_mem_usage = StableDiffusionControlNetPipeline.from_pretrained(pipeline_id, torch_dtype=torch.float16) + pipeline_low_cpu_mem_usage = StableDiffusionControlNetPipeline.from_pretrained( + pipeline_id, torch_dtype=torch.float16 + ) pipeline_low_cpu_mem_usage.to(torch_device) low_cpu_mem_usage_time = time.time() - start_time start_time = time.time() - _ = StableDiffusionControlNetPipeline.from_pretrained(pipeline_id, torch_dtype=torch.float16, low_cpu_mem_usage=False) + _ = StableDiffusionControlNetPipeline.from_pretrained( + pipeline_id, torch_dtype=torch.float16, low_cpu_mem_usage=False + ) normal_load_time = time.time() - start_time assert 2 * low_cpu_mem_usage_time < normal_load_time @@ -786,7 +804,9 @@ def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self): torch.cuda.reset_max_memory_allocated() torch.cuda.reset_peak_memory_stats() - pipe = StableDiffusionControlNetPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16) + pipe = StableDiffusionControlNetPipeline.from_pretrained( + "CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16 + ) pipe = pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing(1) From faf1cfbe826c88366524e92fa27b2104effdb8c4 Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Thu, 23 Feb 2023 04:09:57 +0900 Subject: [PATCH 043/122] Baseline fast test almost passed (w long desc) * Still needs investigation. The following didn't pass, as described in the TODO comments: - test_stable_diffusion_long_prompt - test_stable_diffusion_no_safety_checker The following didn't pass, the same as for stable_diffusion_pipeline: - test_attention_slicing_forward_pass - test_inference_batch_single_identical - test_xformers_attention_forwardGenerator_pass These seem to come from calculation accuracy.
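As a rough way to re-check just this failing subset locally, something along these lines should work (the test module path is the one added in this branch; the -k filter is only a convenience for local runs):

    python -m pytest tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py -k "long_prompt or no_safety_checker or attention_slicing_forward_pass or inference_batch_single_identical or xformers_attention_forwardGenerator"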
--- .../stable_diffusion/test_stable_diffusion_controlnet.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py index b2f5742e377d..5326326fc338 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py @@ -119,7 +119,6 @@ def get_dummy_inputs(self, device, seed=0): inputs = { "prompt": "A painting of a squirrel eating a burger", "generator": generator, - # "controlnet_hint": torch.randn((1, 3, 64, 64)), "num_inference_steps": 2, "guidance_scale": 6.0, "output_type": "numpy", @@ -293,6 +292,8 @@ def test_stable_diffusion_pndm(self): assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 def test_stable_diffusion_no_safety_checker(self): + # TODO: Update model + # this test due to this test load dummy pipe without controlnet instance pipe = StableDiffusionControlNetPipeline.from_pretrained( "hf-internal-testing/tiny-stable-diffusion-lms-pipe", safety_checker=None ) @@ -533,7 +534,9 @@ def test_stable_diffusion_long_prompt(self): assert cap_logger.out == cap_logger_2.out # 100 - 77 + 1 (BOS token) + 1 (EOS token) = 25 - assert cap_logger.out.count("@") == 25 + assert ( + cap_logger.out.count("@") == 25 + ) # TODO: Investigate. this test should pass, but cap_logger.out.count("@") == 0 assert cap_logger_3.out == "" def test_stable_diffusion_height_width_opt(self): From f6569524a21f851914dd9ceafcb8938cda460154 Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Thu, 23 Feb 2023 04:09:57 +0900 Subject: [PATCH 044/122] Add note comment related vae_scale_factor --- src/diffusers/models/controlnet_blocks.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/diffusers/models/controlnet_blocks.py b/src/diffusers/models/controlnet_blocks.py index 3fbcea427a9d..56f6a32e3a17 100644 --- a/src/diffusers/models/controlnet_blocks.py +++ b/src/diffusers/models/controlnet_blocks.py @@ -40,7 +40,11 @@ def zero_conv(channels): class ControlNetInputHintBlock(nn.Module): def __init__(self, hint_channels: int = 3, channels: int = 320): super().__init__() - # Layer configurations are from reference implementation. + # Layer configurations are from reference implementation. + # + # Note: The sequence of convolution operations reduces the width and height by 1/8. + # This assumes that the vae_scale_factor for SD is 8. 
+ # TODO: support vae_scale_factor != 8 situation self.input_hint_block = nn.Sequential( nn.Conv2d(hint_channels, 16, 3, padding=1), nn.SiLU(), From 6300a52a7a120ab79730db6efd877d8a1471a695 Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Fri, 24 Feb 2023 01:50:22 +0900 Subject: [PATCH 045/122] add test_stable_diffusion_controlnet_ddim --- .../test_stable_diffusion_controlnet.py | 45 +++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py index 5326326fc338..b4b5355cdf35 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py @@ -125,6 +125,28 @@ def get_dummy_inputs(self, device, seed=0): } return inputs + def get_dummy_components_for_controlnet(self): + components = self.get_dummy_components() + # vae_scale_factor 8 version + # this for ControlNetInputHintBlock accepts only vae_scale_factor=8 + components["vae"] = AutoencoderKL( + block_out_channels=[32, 64, 64, 64], + in_channels=3, + out_channels=3, + down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D", "DownEncoderBlock2D", "DownEncoderBlock2D"], + up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D"], + latent_channels=4, + ) + return components + + def get_dummy_inputs_for_controlnet(self, device, seed=0): + inputs = self.get_dummy_inputs(device, seed) + vae_scale_factor = 8 + inputs["controlnet_hint"] = torch.randn( + (1, 3, 32 * vae_scale_factor, 32 * vae_scale_factor), generator=inputs["generator"] + ) + return inputs + def test_stable_diffusion_ddim(self): device = "cpu" # ensure determinism for the device-dependent torch.Generator @@ -563,6 +585,29 @@ def test_stable_diffusion_height_width_opt(self): image_shape = output.images[0].shape[:2] assert image_shape == (192, 192) + def test_stable_diffusion_controlnet_ddim(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + + vae_scale_factor = 8 + components = self.get_dummy_components_for_controlnet() + sd_pipe = StableDiffusionControlNetPipeline(**components) + sd_pipe = sd_pipe.to(torch_device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs_for_controlnet(device) + output = sd_pipe(**inputs) + image = output.images + + image_slice = image[0, -3:, -3:, -1] + # print("image_slice", image_slice) + + assert image.shape == (1, 32 * vae_scale_factor, 32 * vae_scale_factor, 3) + expected_slice = np.array( + [0.4780106, 0.46282214, 0.49179333, 0.437001, 0.4518742, 0.46226522, 0.41771045, 0.4315053, 0.4805042] + ) + + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + @slow @require_torch_gpu From bac69f1831263bfc932987da3d70a07ff1689fcc Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Fri, 24 Feb 2023 01:56:36 +0900 Subject: [PATCH 046/122] add assertion for vae_scale_factor != 8 --- .../stable_diffusion/pipeline_stable_diffusion_controlnet.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py index 812eba24e5bd..08d24dab0e79 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py @@ -178,6 +178,9 @@ def __init__( 
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.register_to_config(requires_safety_checker=requires_safety_checker) + if self.vae_scale_factor != 8: + raise ValueError("ControlNet currently supports only for vae_scale_factor == 8.") + def enable_vae_slicing(self): r""" Enable sliced VAE decoding. From 4f394a834bed5e4dd8b3eedab4177ab6e7eced76 Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Fri, 24 Feb 2023 03:29:53 +0900 Subject: [PATCH 047/122] slow test of pipeline almost passed Failed: test_stable_diffusion_pipeline_with_model_offloading - ImportError: `enable_model_offload` requires `accelerate v0.17.0` or higher but currently latest version == 0.16.0 --- .../pipeline_stable_diffusion_controlnet.py | 5 +- .../test_stable_diffusion_controlnet.py | 245 ++++-------------- 2 files changed, 55 insertions(+), 195 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py index 08d24dab0e79..33fde9ad2038 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py @@ -178,9 +178,6 @@ def __init__( self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.register_to_config(requires_safety_checker=requires_safety_checker) - if self.vae_scale_factor != 8: - raise ValueError("ControlNet currently supports only for vae_scale_factor == 8.") - def enable_vae_slicing(self): r""" Enable sliced VAE decoding. @@ -658,6 +655,8 @@ def __call__( # 1. Control Embedding check & conversion controlnet_hint = self.controlnet_hint_conversion(controlnet_hint, height, width, num_images_per_prompt) + if controlnet_hint is not None and self.vae_scale_factor != 8: + raise ValueError("ControlNet currently supports only for vae_scale_factor == 8.") # 2. Check inputs. 
Raise error if not correct self.check_inputs( diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py index b4b5355cdf35..9e5b22cc8b39 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py @@ -34,7 +34,7 @@ UNet2DConditionModel, logging, ) -from diffusers.utils import load_numpy, nightly, slow, torch_device +from diffusers.utils import slow, torch_device from diffusers.utils.testing_utils import CaptureLogger, require_torch_gpu from ...models.test_models_unet_2d_condition import create_lora_layers @@ -611,7 +611,10 @@ def test_stable_diffusion_controlnet_ddim(self): @slow @require_torch_gpu -class StableDiffusionPipelineSlowTests(unittest.TestCase): +class StableDiffusionControlNetPipelineSlowTests(unittest.TestCase): + model_id = "takuma104/control_sd15_canny" + controlnet_memsize = 1451078656 # in float32, https://gist.github.com/takuma104/ce954bde6511a1f0b031a87a646b1f7d + def tearDown(self): super().tearDown() gc.collect() @@ -619,20 +622,25 @@ def tearDown(self): def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): generator = torch.Generator(device=generator_device).manual_seed(seed) - latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64)) - latents = torch.from_numpy(latents).to(device=device, dtype=dtype) + latents = torch.randn((1, 4, 64, 64), generator=generator, dtype=dtype) + vae_scale_factor = 8 + controlnet_hint = torch.randn( + (1, 3, 64 * vae_scale_factor, 64 * vae_scale_factor), generator=generator, dtype=dtype + ) inputs = { "prompt": "a photograph of an astronaut riding a horse", "latents": latents, "generator": generator, - "num_inference_steps": 3, + "num_inference_steps": 50, "guidance_scale": 7.5, "output_type": "numpy", + "controlnet_hint": controlnet_hint, } return inputs - def test_stable_diffusion_1_1_pndm(self): - sd_pipe = StableDiffusionControlNetPipeline.from_pretrained("CompVis/stable-diffusion-v1-1") + def test_stable_diffusion_controlnet_ddim(self): + sd_pipe = StableDiffusionControlNetPipeline.from_pretrained(self.model_id, safety_checker=None) + sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config) sd_pipe = sd_pipe.to(torch_device) sd_pipe.set_progress_bar_config(disable=None) @@ -640,43 +648,16 @@ def test_stable_diffusion_1_1_pndm(self): image = sd_pipe(**inputs).images image_slice = image[0, -3:, -3:, -1].flatten() - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.43625, 0.43554, 0.36670, 0.40660, 0.39703, 0.38658, 0.43936, 0.43557, 0.40592]) - assert np.abs(image_slice - expected_slice).max() < 1e-4 - - def test_stable_diffusion_1_4_pndm(self): - sd_pipe = StableDiffusionControlNetPipeline.from_pretrained("CompVis/stable-diffusion-v1-4") - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() + # print(image_slice) assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.57400, 0.47841, 0.31625, 0.63583, 0.58306, 0.55056, 0.50825, 0.56306, 0.55748]) - assert np.abs(image_slice - expected_slice).max() < 1e-4 - - def test_stable_diffusion_ddim(self): - sd_pipe = StableDiffusionControlNetPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", safety_checker=None + expected_slice = np.array( + [1.0, 
0.9598756, 0.8430315, 0.9999685, 0.9130426, 0.8025453, 0.87997377, 0.8080752, 0.7180274] ) - sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.38019, 0.28647, 0.27321, 0.40377, 0.38290, 0.35446, 0.39218, 0.38165, 0.42239]) assert np.abs(image_slice - expected_slice).max() < 1e-4 - def test_stable_diffusion_lms(self): - sd_pipe = StableDiffusionControlNetPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", safety_checker=None - ) + def test_stable_diffusion_controlnet_lms(self): + sd_pipe = StableDiffusionControlNetPipeline.from_pretrained(self.model_id, safety_checker=None) sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config) sd_pipe = sd_pipe.to(torch_device) sd_pipe.set_progress_bar_config(disable=None) @@ -685,14 +666,16 @@ def test_stable_diffusion_lms(self): image = sd_pipe(**inputs).images image_slice = image[0, -3:, -3:, -1].flatten() + # print(image_slice) + assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.10542, 0.09620, 0.07332, 0.09015, 0.09382, 0.07597, 0.08496, 0.07806, 0.06455]) + expected_slice = np.array( + [1.0, 0.9631732, 0.84487236, 1.0, 0.914418, 0.8033508, 0.88200307, 0.809505, 0.7186936] + ) assert np.abs(image_slice - expected_slice).max() < 1e-4 - def test_stable_diffusion_dpm(self): - sd_pipe = StableDiffusionControlNetPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", safety_checker=None - ) + def test_stable_diffusion_controlnet_dpm(self): + sd_pipe = StableDiffusionControlNetPipeline.from_pretrained(self.model_id, safety_checker=None) sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config) sd_pipe = sd_pipe.to(torch_device) sd_pipe.set_progress_bar_config(disable=None) @@ -701,15 +684,17 @@ def test_stable_diffusion_dpm(self): image = sd_pipe(**inputs).images image_slice = image[0, -3:, -3:, -1].flatten() + # print(image_slice) + assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.03503, 0.03494, 0.01087, 0.03128, 0.02552, 0.00803, 0.00742, 0.00372, 0.00000]) + expected_slice = np.array( + [1.0, 0.9627134, 0.8445909, 1.0, 0.9132767, 0.8025819, 0.88159156, 0.8089917, 0.71824443] + ) assert np.abs(image_slice - expected_slice).max() < 1e-4 - def test_stable_diffusion_attention_slicing(self): + def test_stable_diffusion_controlnet_attention_slicing(self): torch.cuda.reset_peak_memory_stats() - pipe = StableDiffusionControlNetPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16 - ) + pipe = StableDiffusionControlNetPipeline.from_pretrained(self.model_id, torch_dtype=torch.float16) pipe = pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) @@ -721,7 +706,7 @@ def test_stable_diffusion_attention_slicing(self): mem_bytes = torch.cuda.max_memory_allocated() torch.cuda.reset_peak_memory_stats() # make sure that less than 3.75 GB is allocated - assert mem_bytes < 3.75 * 10**9 + assert mem_bytes < 3.75 * 10**9 + self.controlnet_memsize / 2 # disable slicing pipe.disable_attention_slicing() @@ -730,14 +715,12 @@ def test_stable_diffusion_attention_slicing(self): # make sure that more than 3.75 GB is allocated mem_bytes = torch.cuda.max_memory_allocated() - assert mem_bytes > 3.75 * 10**9 + assert mem_bytes > 3.75 * 
10**9 + self.controlnet_memsize / 2 assert np.abs(image_sliced - image).max() < 1e-3 def test_stable_diffusion_vae_slicing(self): torch.cuda.reset_peak_memory_stats() - pipe = StableDiffusionControlNetPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16 - ) + pipe = StableDiffusionControlNetPipeline.from_pretrained(self.model_id, torch_dtype=torch.float16) pipe = pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() @@ -752,7 +735,7 @@ def test_stable_diffusion_vae_slicing(self): mem_bytes = torch.cuda.max_memory_allocated() torch.cuda.reset_peak_memory_stats() # make sure that less than 4 GB is allocated - assert mem_bytes < 4e9 + assert mem_bytes < 4e9 + self.controlnet_memsize / 2 # disable vae slicing pipe.disable_vae_slicing() @@ -763,16 +746,14 @@ def test_stable_diffusion_vae_slicing(self): # make sure that more than 4 GB is allocated mem_bytes = torch.cuda.max_memory_allocated() - assert mem_bytes > 4e9 + assert mem_bytes > 4e9 + self.controlnet_memsize / 2 # There is a small discrepancy at the image borders vs. a fully batched version. assert np.abs(image_sliced - image).max() < 1e-2 def test_stable_diffusion_fp16_vs_autocast(self): # this test makes sure that the original model with autocast # and the new model with fp16 yield the same result - pipe = StableDiffusionControlNetPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16 - ) + pipe = StableDiffusionControlNetPipeline.from_pretrained(self.model_id, torch_dtype=torch.float16) pipe = pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) @@ -789,7 +770,7 @@ def test_stable_diffusion_fp16_vs_autocast(self): # however, they should be extremely close. assert diff.mean() < 2e-2 - def test_stable_diffusion_intermediate_state(self): + def test_stable_diffusion_controlnet_intermediate_state(self): number_of_steps = 0 def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: @@ -800,26 +781,20 @@ def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: latents = latents.detach().cpu().numpy() assert latents.shape == (1, 4, 64, 64) latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array( - [-0.5693, -0.3018, -0.9746, 0.0518, -0.8770, 0.7559, -1.7402, 0.1022, 1.1582] - ) - + expected_slice = np.array([-1.981, 1.052, -1.0625, -0.01709, -1.138, -0.592, -0.372, 0.332, 0.845]) + # print(latents_slice.flatten()) assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2 elif step == 2: latents = latents.detach().cpu().numpy() assert latents.shape == (1, 4, 64, 64) latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array( - [-0.1958, -0.2993, -1.0166, -0.5005, -0.4810, 0.6162, -0.9492, 0.6621, 1.4492] - ) - + expected_slice = np.array([-2.043, 1.113, -1.138, 0.062, -1.133, -0.614, -0.3901, 0.352, 0.8667]) + # print(latents_slice.flatten()) assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2 callback_fn.has_been_called = False - pipe = StableDiffusionControlNetPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16 - ) + pipe = StableDiffusionControlNetPipeline.from_pretrained(self.model_id, torch_dtype=torch.float16) pipe = pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() @@ -830,7 +805,7 @@ def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: assert number_of_steps == inputs["num_inference_steps"] def 
test_stable_diffusion_low_cpu_mem_usage(self): - pipeline_id = "CompVis/stable-diffusion-v1-4" + pipeline_id = self.model_id start_time = time.time() pipeline_low_cpu_mem_usage = StableDiffusionControlNetPipeline.from_pretrained( @@ -852,9 +827,7 @@ def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self): torch.cuda.reset_max_memory_allocated() torch.cuda.reset_peak_memory_stats() - pipe = StableDiffusionControlNetPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16 - ) + pipe = StableDiffusionControlNetPipeline.from_pretrained(self.model_id, torch_dtype=torch.float16) pipe = pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing(1) @@ -865,7 +838,7 @@ def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self): mem_bytes = torch.cuda.max_memory_allocated() # make sure that less than 2.8 GB is allocated - assert mem_bytes < 2.8 * 10**9 + assert mem_bytes < 2.8 * 10**9 + self.controlnet_memsize / 2 def test_stable_diffusion_pipeline_with_model_offloading(self): torch.cuda.empty_cache() @@ -877,7 +850,7 @@ def test_stable_diffusion_pipeline_with_model_offloading(self): # Normal inference pipe = StableDiffusionControlNetPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", + self.model_id, torch_dtype=torch.float16, ) pipe.to(torch_device) @@ -889,7 +862,7 @@ def test_stable_diffusion_pipeline_with_model_offloading(self): # Reload but don't move to cuda pipe = StableDiffusionControlNetPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", + self.model_id, torch_dtype=torch.float16, ) @@ -904,7 +877,7 @@ def test_stable_diffusion_pipeline_with_model_offloading(self): assert np.abs(outputs.images - outputs_offloaded.images).max() < 1e-3 assert mem_bytes_offloaded < mem_bytes - assert mem_bytes_offloaded < 3.5 * 10**9 + assert mem_bytes_offloaded < 3.5 * 10**9 + self.controlnet_memsize / 2 for module in pipe.text_encoder, pipe.unet, pipe.vae, pipe.safety_checker: assert module.device == torch.device("cpu") @@ -918,116 +891,4 @@ def test_stable_diffusion_pipeline_with_model_offloading(self): mem_bytes_slicing = torch.cuda.max_memory_allocated() assert mem_bytes_slicing < mem_bytes_offloaded - assert mem_bytes_slicing < 3 * 10**9 - - -@nightly -@require_torch_gpu -class StableDiffusionPipelineNightlyTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): - generator = torch.Generator(device=generator_device).manual_seed(seed) - latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64)) - latents = torch.from_numpy(latents).to(device=device, dtype=dtype) - inputs = { - "prompt": "a photograph of an astronaut riding a horse", - "latents": latents, - "generator": generator, - "num_inference_steps": 50, - "guidance_scale": 7.5, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_1_4_pndm(self): - sd_pipe = StableDiffusionControlNetPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_text2img/stable_diffusion_1_4_pndm.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 - - def test_stable_diffusion_1_5_pndm(self): 
- sd_pipe = StableDiffusionControlNetPipeline.from_pretrained("runwayml/stable-diffusion-v1-5").to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_text2img/stable_diffusion_1_5_pndm.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 - - def test_stable_diffusion_ddim(self): - sd_pipe = StableDiffusionControlNetPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device) - sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_text2img/stable_diffusion_1_4_ddim.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 - - def test_stable_diffusion_lms(self): - sd_pipe = StableDiffusionControlNetPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device) - sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_text2img/stable_diffusion_1_4_lms.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 - - def test_stable_diffusion_euler(self): - sd_pipe = StableDiffusionControlNetPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device) - sd_pipe.scheduler = EulerDiscreteScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_text2img/stable_diffusion_1_4_euler.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 - - def test_stable_diffusion_dpm(self): - sd_pipe = StableDiffusionControlNetPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device) - sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - inputs["num_inference_steps"] = 25 - image = sd_pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_text2img/stable_diffusion_1_4_dpm_multi.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 + assert mem_bytes_slicing < 3 * 10**9 + self.controlnet_memsize / 2 From 2b0f04bddebfd8d364afc2a16d26df09c5c1f789 Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Fri, 24 Feb 2023 03:57:15 +0900 Subject: [PATCH 048/122] test_stable_diffusion_long_prompt passed --- .../stable_diffusion/test_stable_diffusion_controlnet.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py index 9e5b22cc8b39..63a5930288ae 100644 --- 
a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py @@ -531,7 +531,7 @@ def test_stable_diffusion_long_prompt(self): do_classifier_free_guidance = True negative_prompt = None num_images_per_prompt = 1 - logger = logging.get_logger("diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion") + logger = logging.get_logger("diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_controlnet") prompt = 25 * "@" with CaptureLogger(logger) as cap_logger_3: @@ -556,9 +556,7 @@ def test_stable_diffusion_long_prompt(self): assert cap_logger.out == cap_logger_2.out # 100 - 77 + 1 (BOS token) + 1 (EOS token) = 25 - assert ( - cap_logger.out.count("@") == 25 - ) # TODO: Investigate. this test should pass, but cap_logger.out.count("@") == 0 + assert cap_logger.out.count("@") == 25 assert cap_logger_3.out == "" def test_stable_diffusion_height_width_opt(self): From c6c7312509a31ba60c9c1e1e260cf67a30156311 Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Fri, 24 Feb 2023 04:03:49 +0900 Subject: [PATCH 049/122] test_stable_diffusion_no_safety_checker passed - due to its model size, move to slow test --- .../test_stable_diffusion_controlnet.py | 42 +++++++++---------- 1 file changed, 19 insertions(+), 23 deletions(-) diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py index 63a5930288ae..bd7bc31eacea 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py @@ -313,29 +313,6 @@ def test_stable_diffusion_pndm(self): assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - def test_stable_diffusion_no_safety_checker(self): - # TODO: Update model - # this test due to this test load dummy pipe without controlnet instance - pipe = StableDiffusionControlNetPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-lms-pipe", safety_checker=None - ) - assert isinstance(pipe, StableDiffusionControlNetPipeline) - assert isinstance(pipe.scheduler, LMSDiscreteScheduler) - assert pipe.safety_checker is None - - image = pipe("example prompt", num_inference_steps=2).images[0] - assert image is not None - - # check that there's no error when saving a pipeline with one of the models being None - with tempfile.TemporaryDirectory() as tmpdirname: - pipe.save_pretrained(tmpdirname) - pipe = StableDiffusionControlNetPipeline.from_pretrained(tmpdirname) - - # sanity check that the pipeline still works - assert pipe.safety_checker is None - image = pipe("example prompt", num_inference_steps=2).images[0] - assert image is not None - def test_stable_diffusion_k_lms(self): device = "cpu" # ensure determinism for the device-dependent torch.Generator @@ -890,3 +867,22 @@ def test_stable_diffusion_pipeline_with_model_offloading(self): assert mem_bytes_slicing < mem_bytes_offloaded assert mem_bytes_slicing < 3 * 10**9 + self.controlnet_memsize / 2 + + def test_stable_diffusion_no_safety_checker(self): + pipe = StableDiffusionControlNetPipeline.from_pretrained(self.model_id, safety_checker=None) + assert isinstance(pipe, StableDiffusionControlNetPipeline) + assert isinstance(pipe.scheduler, DDIMScheduler) + assert pipe.safety_checker is None + + image = pipe("example prompt", num_inference_steps=2).images[0] + assert image is not None + + # check that there's no error when saving a pipeline with one of the models 
being None + with tempfile.TemporaryDirectory() as tmpdirname: + pipe.save_pretrained(tmpdirname) + pipe = StableDiffusionControlNetPipeline.from_pretrained(tmpdirname) + + # sanity check that the pipeline still works + assert pipe.safety_checker is None + image = pipe("example prompt", num_inference_steps=2).images[0] + assert image is not None From bd5d7b7463de71eb4b7cff8958ff4a0e06c4a289 Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Fri, 24 Feb 2023 04:04:43 +0900 Subject: [PATCH 050/122] remove PoC test files --- tests/models/test_models_controlnet.py | 113 ------------------ .../test_stable_diffusion_control_net.py | 97 --------------- 2 files changed, 210 deletions(-) delete mode 100644 tests/models/test_models_controlnet.py delete mode 100644 tests/pipelines/stable_diffusion/test_stable_diffusion_control_net.py diff --git a/tests/models/test_models_controlnet.py b/tests/models/test_models_controlnet.py deleted file mode 100644 index 8b5a0acd7adf..000000000000 --- a/tests/models/test_models_controlnet.py +++ /dev/null @@ -1,113 +0,0 @@ -import torch - -from diffusers import UNet2DConditionModel - - -################################################################################ -# PoC version -################################################################################ - - -# config from ControlNet_SD1.5 -unet_config = { - "sample_size": 64, - "in_channels": 4, - "out_channels": 4, - "down_block_types": ("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D"), - "up_block_types": ("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"), - "block_out_channels": (320, 640, 1280, 1280), - "layers_per_block": 2, - "cross_attention_dim": 768, - "attention_head_dim": 8, - "use_linear_projection": False, - "upcast_attention": False, -} - -ctrlnet_config = { - "sample_size": 64, - "in_channels": 4, - "down_block_types": ("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D"), - "block_out_channels": (320, 640, 1280, 1280), - "layers_per_block": 2, - "cross_attention_dim": 768, - "attention_head_dim": 8, - "use_linear_projection": False, - "controlnet_hint_channels": 3, - "upcast_attention": False, -} -model_id_sd15_canny = "takuma104/control_sd15_canny" # currntry this is private model - - -## utils ####################################################################### - - -def controlnet_inference(model_id=None): - sample = torch.randn((1, 4, 64, 64)).cuda() - hint = torch.randn((1, 3, 512, 512)).cuda() - timestep = 0 - encoder_hidden_states = torch.randn((1, 77, 768)).cuda() - if model_id is None: - model = UNet2DConditionModel(**ctrlnet_config).cuda() - else: - model = UNet2DConditionModel.from_pretrained(model_id, subfolder="controlnet").cuda() - model.eval() - with torch.no_grad(): - outputs = model( - sample=sample, controlnet_hint=hint, timestep=timestep, encoder_hidden_states=encoder_hidden_states - ) - return outputs - - -def controlled_unet_inference(control, model_id=None): - sample = torch.randn((1, 4, 64, 64)).cuda() - timestep = 0 - encoder_hidden_states = torch.randn((1, 77, 768)).cuda() - if model_id is None: - model = UNet2DConditionModel(**unet_config).cuda() - else: - model = UNet2DConditionModel.from_pretrained(model_id_sd15_canny, subfolder="unet").cuda() - model.eval() - with torch.no_grad(): - out = model(sample=sample, control=control, timestep=timestep, encoder_hidden_states=encoder_hidden_states) - return out - - -## tests 
####################################################################### - - -def test_unet_inference(): - sample = torch.randn((1, 4, 64, 64)).cuda() - timestep = 0 - encoder_hidden_states = torch.randn((1, 77, 768)).cuda() - model = UNet2DConditionModel(**unet_config).cuda() - model.eval() - with torch.no_grad(): - out = model(sample=sample, timestep=timestep, encoder_hidden_states=encoder_hidden_states) - assert out.sample.shape == (1, 4, 64, 64) - print(out.sample) - - -def test_controlnet_inference(): - outputs = controlnet_inference() - assert len(outputs) == 12 + 1 # 12 layer down and one middle - print(outputs) - - -def test_controlled_unet_inference(): - control = controlnet_inference() - out = controlled_unet_inference(control=control) - assert out.sample.shape == (1, 4, 64, 64) - print(out.sample) - - -def test_controlnet_from_pretrained_and_inference(): - outputs = controlnet_inference(model_id=model_id_sd15_canny) - assert len(outputs) == 12 + 1 # 12 layer down and one middle - print(outputs) - - -def test_controlled_unet_from_pretrained_and_inference(): - control = controlnet_inference() - out = controlled_unet_inference(control=control, model_id=model_id_sd15_canny) - assert out.sample.shape == (1, 4, 64, 64) - print(out.sample) diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_control_net.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_control_net.py deleted file mode 100644 index f9e4e8e06a8f..000000000000 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_control_net.py +++ /dev/null @@ -1,97 +0,0 @@ -import numpy as np -import pytest -import torch - -from diffusers import StableDiffusionControlNetPipeline -from diffusers.utils import load_image - - -################################################################################ -# PoC version -################################################################################ - -model_id_sd15_canny = "takuma104/control_sd15_canny" -test_prompt = "best quality, extremely detailed, illustration, looking at viewer" -test_negative_prompt = ( - "longbody, lowres, bad anatomy, bad hands, missing fingers, " - + "pubic hair,extra digit, fewer digits, cropped, worst quality, low quality" -) - - -@pytest.mark.skip -def test_from_pretrained(): - pipe = StableDiffusionControlNetPipeline.from_pretrained(model_id_sd15_canny) - print(pipe) - - -@pytest.mark.skip -def test_from_pretrained_and_unet_inference(): - pipe = StableDiffusionControlNetPipeline.from_pretrained(model_id_sd15_canny, torch_dtype=torch.bfloat16).to( - "cuda" - ) - image = pipe(prompt="an apple", num_inference_steps=15).images[0] - image.save("/tmp/an_apple_generated.png") - print(image.size) - - -def test_pixel_match(): - pipe = StableDiffusionControlNetPipeline.from_pretrained(model_id_sd15_canny).to("cuda") - pipe.enable_attention_slicing(1) - - seed = 0 - canny_edged_image = load_image( - "https://huggingface.co/takuma104/controlnet_dev/resolve/main/vermeer_canny_edged.png" - ) - - # reference image generated by https://gist.github.com/takuma104/6cdb6d9aa27f67462f11554cccdf4b34 - output_ref_image = load_image( - f"https://huggingface.co/takuma104/controlnet_dev/resolve/main/vermeer_canny_edged_seed_{seed}.png" - ) - - batch = 1 - control = torch.from_numpy(np.array(canny_edged_image).copy()).float().cuda() / 255.0 - control = control.repeat(batch, 1, 1, 1) - control = control.permute(0, 3, 1, 2) # b h w c -> b c h w - - generator = torch.Generator(device="cuda").manual_seed(seed) - image = pipe( - prompt=test_prompt, - 
negative_prompt=test_negative_prompt, - guidance_scale=9.0, - num_inference_steps=20, - generator=generator, - controlnet_hint=control, - ).images[0] - image.save(f"/tmp/seed_{seed}.png") - - max_diff = np.abs(np.array(image).astype(np.int32) - np.array(output_ref_image).astype(np.int32)).max() - assert max_diff < 10 # must be max_diff == 0 but it appears that there is a tiny difference for some reason. - - -def test_pixel_match_image_argument(): - pipe = StableDiffusionControlNetPipeline.from_pretrained(model_id_sd15_canny).to("cuda") - pipe.enable_attention_slicing(1) - - seed = 0 - canny_edged_image = load_image( - "https://huggingface.co/takuma104/controlnet_dev/resolve/main/vermeer_canny_edged.png" - ) - - # reference image generated by https://gist.github.com/takuma104/6cdb6d9aa27f67462f11554cccdf4b34 - output_ref_image = load_image( - f"https://huggingface.co/takuma104/controlnet_dev/resolve/main/vermeer_canny_edged_seed_{seed}.png" - ) - - generator = torch.Generator(device="cuda").manual_seed(seed) - image = pipe( - prompt=test_prompt, - negative_prompt=test_negative_prompt, - guidance_scale=9.0, - num_inference_steps=20, - generator=generator, - controlnet_hint=canny_edged_image, - ).images[0] - image.save(f"/tmp/seed_{seed}.png") - - max_diff = np.abs(np.array(image).astype(np.int32) - np.array(output_ref_image).astype(np.int32)).max() - assert max_diff < 10 # must be max_diff == 0 but it appears that there is a tiny difference for some reason. From 808376c182dab02135f92686714f628551ab4fc9 Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Fri, 24 Feb 2023 14:51:01 +0900 Subject: [PATCH 051/122] fix num_of_image, prompt length issue add add test --- .../pipeline_stable_diffusion_controlnet.py | 35 ++++++---- .../test_stable_diffusion_controlnet.py | 70 ++++++++++++++++++- 2 files changed, 88 insertions(+), 17 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py index 33fde9ad2038..657d8bcdf409 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py @@ -494,7 +494,7 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype latents = latents * self.scheduler.init_noise_sigma return latents - def controlnet_hint_conversion(self, controlnet_hint, height, width, num_images_per_prompt): + def controlnet_hint_conversion(self, controlnet_hint, height, width, batch_size): if controlnet_hint is None: return None channels = 3 @@ -502,16 +502,16 @@ def controlnet_hint_conversion(self, controlnet_hint, height, width, num_images_ # torch.Tensor: acceptble shape are any of chw, bchw(b==1) or bchw(b==num_images_per_prompt) shape_chw = (channels, height, width) shape_bchw = (1, channels, height, width) - shape_nchw = (num_images_per_prompt, channels, height, width) + shape_nchw = (batch_size, channels, height, width) if controlnet_hint.shape in [shape_chw, shape_bchw, shape_nchw]: controlnet_hint = controlnet_hint.to(dtype=self.controlnet.dtype, device=self.controlnet.device) if controlnet_hint.shape != shape_nchw: - controlnet_hint = controlnet_hint.repeat(num_images_per_prompt, 1, 1, 1) + controlnet_hint = controlnet_hint.repeat(batch_size, 1, 1, 1) return controlnet_hint else: raise ValueError( f"Acceptble shape of `controlnet_hint` are any of ({channels}, {height}, {width})," - + f" (1, {channels}, {height}, 
{width}) or ({num_images_per_prompt}, " + + f" (1, {channels}, {height}, {width}) or ({batch_size}, " + f"{channels}, {height}, {width}) but is {controlnet_hint.shape}" ) elif isinstance(controlnet_hint, np.ndarray): @@ -521,13 +521,13 @@ def controlnet_hint_conversion(self, controlnet_hint, height, width, num_images_ controlnet_hint = np.repeat(controlnet_hint[:, :, np.newaxis], channels, axis=2) # hw -> hwc(c==3) shape_hwc = (height, width, channels) shape_bhwc = (1, height, width, channels) - shape_nhwc = (num_images_per_prompt, height, width, channels) + shape_nhwc = (batch_size, height, width, channels) if controlnet_hint.shape in [shape_hwc, shape_bhwc, shape_nhwc]: controlnet_hint = torch.from_numpy(controlnet_hint.copy()) controlnet_hint = controlnet_hint.to(dtype=self.controlnet.dtype, device=self.controlnet.device) controlnet_hint /= 255.0 if controlnet_hint.shape != shape_nhwc: - controlnet_hint = controlnet_hint.repeat(num_images_per_prompt, 1, 1, 1) + controlnet_hint = controlnet_hint.repeat(batch_size, 1, 1, 1) controlnet_hint = controlnet_hint.permute(0, 3, 1, 2) # b h w c -> b c h w return controlnet_hint else: @@ -535,14 +535,14 @@ def controlnet_hint_conversion(self, controlnet_hint, height, width, num_images_ f"Acceptble shape of `controlnet_hint` are any of ({width}, {channels}), " + f"({height}, {width}, {channels}), " + f"(1, {height}, {width}, {channels}) or " - + f"({num_images_per_prompt}, {channels}, {height}, {width}) but is {controlnet_hint.shape}" + + f"({batch_size}, {channels}, {height}, {width}) but is {controlnet_hint.shape}" ) elif isinstance(controlnet_hint, PIL.Image.Image): if controlnet_hint.size == (width, height): controlnet_hint = controlnet_hint.convert("RGB") # make sure 3 channel RGB format controlnet_hint = np.array(controlnet_hint) # to numpy controlnet_hint = controlnet_hint[:, :, ::-1] # RGB -> BGR - return self.controlnet_hint_conversion(controlnet_hint, height, width, num_images_per_prompt) + return self.controlnet_hint_conversion(controlnet_hint, height, width, batch_size) else: raise ValueError( f"Acceptable image size of `controlnet_hint` is ({width}, {height}) but is {controlnet_hint.size}" @@ -653,17 +653,12 @@ def __call__( height = height or self.unet.config.sample_size * self.vae_scale_factor width = width or self.unet.config.sample_size * self.vae_scale_factor - # 1. Control Embedding check & conversion - controlnet_hint = self.controlnet_hint_conversion(controlnet_hint, height, width, num_images_per_prompt) - if controlnet_hint is not None and self.vae_scale_factor != 8: - raise ValueError("ControlNet currently supports only for vae_scale_factor == 8.") - - # 2. Check inputs. Raise error if not correct + # 1. Check inputs. Raise error if not correct self.check_inputs( prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds ) - # 3. Define call parameters + # 2. Define call parameters if prompt is not None and isinstance(prompt, str): batch_size = 1 elif prompt is not None and isinstance(prompt, list): @@ -677,6 +672,16 @@ def __call__( # corresponds to doing no classifier free guidance. do_classifier_free_guidance = guidance_scale > 1.0 + # 3. 
Control Embedding check & conversion + controlnet_hint = self.controlnet_hint_conversion( + controlnet_hint, height, width, batch_size * num_images_per_prompt + ) + if controlnet_hint is not None: + if self.vae_scale_factor != 8: + raise ValueError("ControlNet currently supports only for vae_scale_factor == 8.") + if do_classifier_free_guidance: + controlnet_hint = torch.cat([controlnet_hint] * 2) + # 4. Encode input prompt prompt_embeds = self._encode_prompt( prompt, diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py index bd7bc31eacea..a9a961c28700 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py @@ -139,12 +139,16 @@ def get_dummy_components_for_controlnet(self): ) return components - def get_dummy_inputs_for_controlnet(self, device, seed=0): + def get_dummy_inputs_for_controlnet(self, device, seed=0, num_of_prompts=1, num_images_per_prompt=1): inputs = self.get_dummy_inputs(device, seed) vae_scale_factor = 8 + if num_of_prompts > 1: + inputs["prompt"] = [f"a photo of {i} cats" for i in range(num_of_prompts)] inputs["controlnet_hint"] = torch.randn( - (1, 3, 32 * vae_scale_factor, 32 * vae_scale_factor), generator=inputs["generator"] + (num_of_prompts * num_images_per_prompt, 3, 32 * vae_scale_factor, 32 * vae_scale_factor), + generator=inputs["generator"], ) + inputs["num_images_per_prompt"] = num_images_per_prompt return inputs def test_stable_diffusion_ddim(self): @@ -583,6 +587,68 @@ def test_stable_diffusion_controlnet_ddim(self): assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + def test_stable_diffusion_controlnet_ddim_two_prompts(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + + vae_scale_factor = 8 + components = self.get_dummy_components_for_controlnet() + sd_pipe = StableDiffusionControlNetPipeline(**components) + sd_pipe = sd_pipe.to(torch_device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs_for_controlnet(device, num_of_prompts=2) + output = sd_pipe(**inputs) + image = output.images + + image_slice0 = image[0, -3:, -3:, -1] + image_slice1 = image[1, -3:, -3:, -1] + + # print("image_slice0", image_slice0) + # print("image_slice1", image_slice1) + + assert image.shape == (2, 32 * vae_scale_factor, 32 * vae_scale_factor, 3) + + expected_slice0 = np.array( + [0.47709626, 0.48531038, 0.4648616, 0.39797616, 0.4541167, 0.47469646, 0.3775609, 0.4033805, 0.4765025] + ) + expected_slice1 = np.array( + [0.4621172, 0.4676137, 0.5062453, 0.5052618, 0.5217055, 0.5249935, 0.4406457, 0.4661678, 0.5214513] + ) + + assert np.abs(image_slice0.flatten() - expected_slice0).max() < 1e-2 + assert np.abs(image_slice1.flatten() - expected_slice1).max() < 1e-2 + + def test_stable_diffusion_controlnet_ddim_two_images_per_prompt(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + + vae_scale_factor = 8 + components = self.get_dummy_components_for_controlnet() + sd_pipe = StableDiffusionControlNetPipeline(**components) + sd_pipe = sd_pipe.to(torch_device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs_for_controlnet(device, num_images_per_prompt=2) + output = sd_pipe(**inputs) + image = output.images + + image_slice0 = image[0, -3:, -3:, -1] + image_slice1 = image[1, -3:, -3:, -1] + + # print("image_slice0", image_slice0) + # 
print("image_slice1", image_slice1) + + assert image.shape == (2, 32 * vae_scale_factor, 32 * vae_scale_factor, 3) + + expected_slice0 = np.array( + [0.4763651, 0.48430225, 0.46508622, 0.3978958, 0.45416373, 0.4748904, 0.3773327, 0.40261823, 0.47642976] + ) + expected_slice1 = np.array( + [0.462687, 0.46804678, 0.50551665, 0.50405616, 0.5203774, 0.52371174, 0.43939433, 0.46502018, 0.52084166] + ) + + assert np.abs(image_slice0.flatten() - expected_slice0).max() < 1e-2 + assert np.abs(image_slice1.flatten() - expected_slice1).max() < 1e-2 + @slow @require_torch_gpu From cd850864c9e5d1510782a79382b65e8815032749 Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Fri, 24 Feb 2023 15:46:55 +0900 Subject: [PATCH 052/122] add support List[PIL.Image] for controlnet_hint --- .../pipeline_stable_diffusion_controlnet.py | 33 ++++++--- .../test_stable_diffusion_controlnet.py | 74 ++++++++++++++++++- 2 files changed, 94 insertions(+), 13 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py index 657d8bcdf409..accc8899860d 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py @@ -495,6 +495,18 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype return latents def controlnet_hint_conversion(self, controlnet_hint, height, width, batch_size): + def pil_image_to_numpy(image, width, height): + assert isinstance(image, PIL.Image.Image) + if image.size == (width, height): + controlnet_hint = image.convert("RGB") # make sure 3 channel RGB format + controlnet_hint = np.array(controlnet_hint) # to numpy + controlnet_hint = controlnet_hint[:, :, ::-1] # RGB -> BGR + return controlnet_hint + else: + raise ValueError( + f"Acceptable image size of `controlnet_hint` is ({width}, {height}) but is {image.size}" + ) + if controlnet_hint is None: return None channels = 3 @@ -538,15 +550,14 @@ def controlnet_hint_conversion(self, controlnet_hint, height, width, batch_size) + f"({batch_size}, {channels}, {height}, {width}) but is {controlnet_hint.shape}" ) elif isinstance(controlnet_hint, PIL.Image.Image): - if controlnet_hint.size == (width, height): - controlnet_hint = controlnet_hint.convert("RGB") # make sure 3 channel RGB format - controlnet_hint = np.array(controlnet_hint) # to numpy - controlnet_hint = controlnet_hint[:, :, ::-1] # RGB -> BGR - return self.controlnet_hint_conversion(controlnet_hint, height, width, batch_size) - else: - raise ValueError( - f"Acceptable image size of `controlnet_hint` is ({width}, {height}) but is {controlnet_hint.size}" - ) + return self.controlnet_hint_conversion( + pil_image_to_numpy(controlnet_hint, width, height), height, width, batch_size + ) + elif isinstance(controlnet_hint, list): + n_arrays = [] + for image in controlnet_hint: + n_arrays.append(pil_image_to_numpy(image, width, height)) + return self.controlnet_hint_conversion(np.array(n_arrays), height, width, batch_size) else: raise ValueError( f"Acceptable type of `controlnet_hint` are any of torch.Tensor, np.ndarray, PIL.Image.Image but is {type(controlnet_hint)}" @@ -573,7 +584,7 @@ def __call__( callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - controlnet_hint: Optional[Union[torch.FloatTensor, np.ndarray, PIL.Image.Image]] = 
None, + controlnet_hint: Optional[Union[torch.FloatTensor, np.ndarray, PIL.Image.Image, List[PIL.Image.Image]]] = None, ): r""" Function invoked when calling the pipeline for generation. @@ -634,7 +645,7 @@ def __call__( A kwargs dictionary that if specified is passed along to the `AttnProcessor` as defined under `self.processor` in [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). - controlnet_hint (`torch.FloatTensor`, `np.ndarray` or `PIL.Image.Image`, *optional*): + controlnet_hint (`torch.FloatTensor`, `np.ndarray`, `PIL.Image.Image` or `List[PIL.Image.Image]`, *optional*): ControlNet input embedding. ControlNet generates guidances using this input embedding. If the type is specified as `torch.FloatTensor`, it is passed to ControlNet as is. If the type is `np.ndarray`, it is assumed to be an OpenCV compatible image format. PIL.Image.Image` can also be accepted as an image. The diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py index a9a961c28700..2b4517bbe121 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py @@ -19,6 +19,7 @@ import unittest import numpy as np +import PIL.Image import torch from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer @@ -139,15 +140,30 @@ def get_dummy_components_for_controlnet(self): ) return components - def get_dummy_inputs_for_controlnet(self, device, seed=0, num_of_prompts=1, num_images_per_prompt=1): + def get_dummy_inputs_for_controlnet( + self, device, seed=0, num_of_prompts=1, num_images_per_prompt=1, pil_image=False + ): inputs = self.get_dummy_inputs(device, seed) vae_scale_factor = 8 if num_of_prompts > 1: inputs["prompt"] = [f"a photo of {i} cats" for i in range(num_of_prompts)] - inputs["controlnet_hint"] = torch.randn( + + controlnet_hint = torch.randn( (num_of_prompts * num_images_per_prompt, 3, 32 * vae_scale_factor, 32 * vae_scale_factor), generator=inputs["generator"], ) + + if pil_image: + controlnet_hint = controlnet_hint.detach().numpy().copy() + images = np.zeros_like(controlnet_hint, dtype=np.uint8) + images[controlnet_hint > 0.5] = 255 + images = images.transpose(0, 3, 2, 1) # b c h w -> b w h c + if images.shape[0] == 1: + controlnet_hint = PIL.Image.fromarray(images[0]) # PIL.Image + else: + controlnet_hint = [PIL.Image.fromarray(images[b]) for b in range(images.shape[0])] # List of PIL.Image + + inputs["controlnet_hint"] = controlnet_hint inputs["num_images_per_prompt"] = num_images_per_prompt return inputs @@ -587,6 +603,29 @@ def test_stable_diffusion_controlnet_ddim(self): assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + def test_stable_diffusion_controlnet_ddim_pil_image(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + + vae_scale_factor = 8 + components = self.get_dummy_components_for_controlnet() + sd_pipe = StableDiffusionControlNetPipeline(**components) + sd_pipe = sd_pipe.to(torch_device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs_for_controlnet(device, pil_image=True) + output = sd_pipe(**inputs) + image = output.images + + image_slice = image[0, -3:, -3:, -1] + # print("image_slice", image_slice) + + assert image.shape == (1, 32 * vae_scale_factor, 32 * vae_scale_factor, 3) + expected_slice = np.array( + [0.4780106, 
0.46282214, 0.49179333, 0.437001, 0.4518742, 0.46226522, 0.41771045, 0.4315053, 0.4805042] + ) + + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + def test_stable_diffusion_controlnet_ddim_two_prompts(self): device = "cpu" # ensure determinism for the device-dependent torch.Generator @@ -649,6 +688,37 @@ def test_stable_diffusion_controlnet_ddim_two_images_per_prompt(self): assert np.abs(image_slice0.flatten() - expected_slice0).max() < 1e-2 assert np.abs(image_slice1.flatten() - expected_slice1).max() < 1e-2 + def test_stable_diffusion_controlnet_ddim_two_images_per_prompt_pil_image(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + + vae_scale_factor = 8 + components = self.get_dummy_components_for_controlnet() + sd_pipe = StableDiffusionControlNetPipeline(**components) + sd_pipe = sd_pipe.to(torch_device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs_for_controlnet(device, num_images_per_prompt=2, pil_image=True) + output = sd_pipe(**inputs) + image = output.images + + image_slice0 = image[0, -3:, -3:, -1] + image_slice1 = image[1, -3:, -3:, -1] + + # print("image_slice0", image_slice0) + # print("image_slice1", image_slice1) + + assert image.shape == (2, 32 * vae_scale_factor, 32 * vae_scale_factor, 3) + + expected_slice0 = np.array( + [0.4763651, 0.48430225, 0.46508622, 0.3978958, 0.45416373, 0.4748904, 0.3773327, 0.40261823, 0.47642976] + ) + expected_slice1 = np.array( + [0.462687, 0.46804678, 0.50551665, 0.50405616, 0.5203774, 0.52371174, 0.43939433, 0.46502018, 0.52084166] + ) + + assert np.abs(image_slice0.flatten() - expected_slice0).max() < 1e-2 + assert np.abs(image_slice1.flatten() - expected_slice1).max() < 1e-2 + @slow @require_torch_gpu From e376edb1ff24bacf74eecf16ea685ac18e8b41f1 Mon Sep 17 00:00:00 2001 From: William Berman Date: Thu, 23 Feb 2023 15:58:12 -0800 Subject: [PATCH 053/122] wip --- scripts/convert_controlnet_to_diffusers.py | 118 ---- ..._original_stable_diffusion_to_diffusers.py | 4 + src/diffusers/__init__.py | 1 + src/diffusers/models/__init__.py | 2 +- src/diffusers/models/controlnet.py | 538 ++++++++++++++++++ src/diffusers/models/controlnet_blocks.py | 105 ---- src/diffusers/models/unet_2d_condition.py | 73 +-- .../stable_diffusion/convert_from_ckpt.py | 471 +++------------ .../pipeline_stable_diffusion_controlnet.py | 210 ++----- .../versatile_diffusion/modeling_text_unet.py | 71 +-- src/diffusers/utils/dummy_pt_objects.py | 15 + tests/models/test_models_unet_2d_condition.py | 129 +---- .../test_stable_diffusion_controlnet.py | 488 +--------------- tests/test_pipelines_common.py | 12 +- 14 files changed, 768 insertions(+), 1469 deletions(-) delete mode 100644 scripts/convert_controlnet_to_diffusers.py create mode 100644 src/diffusers/models/controlnet.py delete mode 100644 src/diffusers/models/controlnet_blocks.py diff --git a/scripts/convert_controlnet_to_diffusers.py b/scripts/convert_controlnet_to_diffusers.py deleted file mode 100644 index 3cf4ba6d28ad..000000000000 --- a/scripts/convert_controlnet_to_diffusers.py +++ /dev/null @@ -1,118 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Conversion script for the ControlNet checkpoints. """ - -import argparse - -from diffusers.pipelines.stable_diffusion.convert_from_ckpt import load_pipeline_from_control_net_ckpt - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument( - "--checkpoint_path", default=None, type=str, required=True, help="Path to the checkpoint to convert." - ) - # !wget https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml - parser.add_argument( - "--original_config_file", - default=None, - type=str, - help="The YAML config file corresponding to the original architecture.", - ) - parser.add_argument( - "--num_in_channels", - default=None, - type=int, - help="The number of input channels. If `None` number of input channels will be automatically inferred.", - ) - parser.add_argument( - "--scheduler_type", - default="ddim", - type=str, - help="Type of scheduler to use. Should be one of ['pndm', 'lms', 'ddim', 'euler', 'euler-ancestral', 'dpm']", - ) - parser.add_argument( - "--pipeline_type", - default=None, - type=str, - help=( - "The pipeline type. One of 'FrozenOpenCLIPEmbedder', 'FrozenCLIPEmbedder', 'PaintByExample'" - ". If `None` pipeline will be automatically inferred." - ), - ) - parser.add_argument( - "--image_size", - default=None, - type=int, - help=( - "The image size that the model was trained on. Use 512 for Stable Diffusion v1.X and Stable Siffusion v2" - " Base. Use 768 for Stable Diffusion v2." - ), - ) - parser.add_argument( - "--prediction_type", - default=None, - type=str, - help=( - "The prediction type that the model was trained on. Use 'epsilon' for Stable Diffusion v1.X and Stable" - " Diffusion v2 Base. Use 'v_prediction' for Stable Diffusion v2." - ), - ) - parser.add_argument( - "--extract_ema", - action="store_true", - help=( - "Only relevant for checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights" - " or not. Defaults to `False`. Add `--extract_ema` to extract the EMA weights. EMA weights usually yield" - " higher quality images for inference. Non-EMA weights are usually better to continue fine-tuning." - ), - ) - parser.add_argument( - "--upcast_attention", - action="store_true", - help=( - "Whether the attention computation should always be upcasted. This is necessary when running stable" - " diffusion 2.1." - ), - ) - parser.add_argument( - "--from_safetensors", - action="store_true", - help="If `--checkpoint_path` is in `safetensors` format, load checkpoint with safetensors instead of PyTorch.", - ) - parser.add_argument( - "--to_safetensors", - action="store_true", - help="Whether to store pipeline in safetensors format or not.", - ) - parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.") - parser.add_argument("--device", type=str, help="Device to use (e.g. 
cpu, cuda:0, cuda:1, etc.)") - args = parser.parse_args() - - pipe = load_pipeline_from_control_net_ckpt( - checkpoint_path=args.checkpoint_path, - original_config_file=args.original_config_file, - image_size=args.image_size, - prediction_type=args.prediction_type, - model_type=args.pipeline_type, - extract_ema=args.extract_ema, - scheduler_type=args.scheduler_type, - num_in_channels=args.num_in_channels, - upcast_attention=args.upcast_attention, - from_safetensors=args.from_safetensors, - device=args.device, - ) - pipe.save_pretrained(args.dump_path, safe_serialization=args.to_safetensors) diff --git a/scripts/convert_original_stable_diffusion_to_diffusers.py b/scripts/convert_original_stable_diffusion_to_diffusers.py index 11e35211b242..d0f4b8efa93e 100644 --- a/scripts/convert_original_stable_diffusion_to_diffusers.py +++ b/scripts/convert_original_stable_diffusion_to_diffusers.py @@ -120,6 +120,9 @@ help="Path to the clip stats file. Only required if the stable unclip model's config specifies `model.params.noise_aug_config.params.clip_stats_path`.", required=False, ) + parser.add_argument( + "--controlnet", action="store_true", default=None, help="Set flag if this is a controlnet checkpoint." + ) args = parser.parse_args() pipe = load_pipeline_from_original_stable_diffusion_ckpt( @@ -137,5 +140,6 @@ stable_unclip=args.stable_unclip, stable_unclip_prior=args.stable_unclip_prior, clip_stats_path=args.clip_stats_path, + controlnet=args.controlnet, ) pipe.save_pretrained(args.dump_path, safe_serialization=args.to_safetensors) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 60d8a1210c97..59b0a5ec206c 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -34,6 +34,7 @@ else: from .models import ( AutoencoderKL, + ControlNetModel, ModelMixin, PriorTransformer, Transformer2DModel, diff --git a/src/diffusers/models/__init__.py b/src/diffusers/models/__init__.py index 9917f4b24f51..a2857b7f9f52 100644 --- a/src/diffusers/models/__init__.py +++ b/src/diffusers/models/__init__.py @@ -17,7 +17,7 @@ if is_torch_available(): from .autoencoder_kl import AutoencoderKL - from .controlnet_blocks import ControlNetInputHintBlock, ControlNetZeroConvBlock + from .controlnet import ControlNetModel from .dual_transformer_2d import DualTransformer2DModel from .modeling_utils import ModelMixin from .prior_transformer import PriorTransformer diff --git a/src/diffusers/models/controlnet.py b/src/diffusers/models/controlnet.py new file mode 100644 index 000000000000..7afa1eccf65d --- /dev/null +++ b/src/diffusers/models/controlnet.py @@ -0,0 +1,538 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
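Note: what follows is a minimal, illustrative sketch (not part of the patch itself) of how the ControlNetModel defined in this new file is meant to be combined with UNet2DConditionModel through the `down_block_additional_residuals` and `mid_block_additional_residual` arguments introduced later in this commit. The tiny block/channel configuration and the random tensors are assumptions chosen only to keep the example small; the SD 1.5 ControlNet checkpoints referenced elsewhere in this series use block_out_channels=(320, 640, 1280, 1280) and cross_attention_dim=768.

import torch
from diffusers import ControlNetModel, UNet2DConditionModel

# The down/mid configuration must match between the two models so that the
# ControlNet residuals line up shape-for-shape with the UNet skip connections.
common = dict(
    in_channels=4,
    down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
    block_out_channels=(32, 64),
    layers_per_block=2,
    cross_attention_dim=32,
    attention_head_dim=8,
)
unet = UNet2DConditionModel(
    out_channels=4,
    up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
    **common,
)
controlnet = ControlNetModel(controlnet_conditioning_channels=3, **common)

latents = torch.randn(1, 4, 16, 16)  # noisy latents
text_emb = torch.randn(1, 77, 32)    # stand-in for CLIP text encoder hidden states
# Conditioning image in pixel space; the conditioning embedding network
# downsamples it by a factor of 8 down to the latent resolution.
hint = torch.randn(1, 3, 128, 128)

with torch.no_grad():
    control = controlnet(latents, 0, encoder_hidden_states=text_emb, controlnet_cond=hint)
    noise_pred = unet(
        latents,
        0,
        encoder_hidden_states=text_emb,
        down_block_additional_residuals=control.down_block_res_samples,
        mid_block_additional_residual=control.mid_block_res_sample,
    ).sample  # (1, 4, 16, 16)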
+from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple, Union + +import torch +from torch import nn + +from ..configuration_utils import ConfigMixin, register_to_config +from ..utils import BaseOutput, logging +from .cross_attention import AttnProcessor +from .embeddings import GaussianFourierProjection, TimestepEmbedding, Timesteps +from .modeling_utils import ModelMixin +from .unet_2d_blocks import ( + CrossAttnDownBlock2D, + DownBlock2D, + UNetMidBlock2DCrossAttn, + UNetMidBlock2DSimpleCrossAttn, + get_down_block, +) + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +@dataclass +class ControlNetOutput(BaseOutput): + down_block_res_samples: Tuple[torch.Tensor] + mid_block_res_sample: torch.Tensor + + +class ControlNetConditioningDefaultEmbedding(nn.Module): + """ + "Stable Diffusion uses a pre-processing method similar to VQ-GAN [11] to convert the entire dataset of 512 × 512 + images into smaller 64 × 64 “latent images” for stabilized training. This requires ControlNets to convert + image-based conditions to 64 × 64 feature space to match the convolution size. We use a tiny network E(·) of four + convolution layers with 4 × 4 kernels and 2 × 2 strides (activated by ReLU, channels are 16, 32, 64, 128, + initialized with Gaussian weights, trained jointly with the full model) to encode image-space conditions ... into + feature maps ..." + """ + + def __init__(self, conditioning_channels: int, conditioning_embedding_channels: int): + super().__init__() + + self.conditioning_embedder = nn.Sequential( + nn.Conv2d(conditioning_channels, 16, kernel_size=3, padding=1), + nn.SiLU(), + nn.Conv2d(16, 16, kernel_size=3, padding=1), + nn.SiLU(), + nn.Conv2d(16, 32, kernel_size=3, padding=1, stride=2), + nn.SiLU(), + nn.Conv2d(32, 32, kernel_size=3, padding=1), + nn.SiLU(), + nn.Conv2d(32, 96, kernel_size=3, padding=1, stride=2), + nn.SiLU(), + nn.Conv2d(96, 96, kernel_size=3, padding=1), + nn.SiLU(), + nn.Conv2d(96, 256, kernel_size=3, padding=1, stride=2), + nn.SiLU(), + zero_module(nn.Conv2d(256, conditioning_embedding_channels, kernel_size=3, padding=1)), + ) + + def forward(self, conditioning): + embedding = self.conditioning_embedder(conditioning) + return embedding + + +class ControlNetModel(ModelMixin, ConfigMixin): + _supports_gradient_checkpointing = True + + @register_to_config + def __init__( + self, + sample_size: Optional[int] = None, + in_channels: int = 4, + center_input_sample: bool = False, + flip_sin_to_cos: bool = True, + freq_shift: int = 0, + down_block_types: Tuple[str] = ( + "CrossAttnDownBlock2D", + "CrossAttnDownBlock2D", + "CrossAttnDownBlock2D", + "DownBlock2D", + ), + mid_block_type: Optional[str] = "UNetMidBlock2DCrossAttn", + only_cross_attention: Union[bool, Tuple[bool]] = False, + block_out_channels: Tuple[int] = (320, 640, 1280, 1280), + layers_per_block: int = 2, + downsample_padding: int = 1, + mid_block_scale_factor: float = 1, + act_fn: str = "silu", + norm_num_groups: Optional[int] = 32, + norm_eps: float = 1e-5, + cross_attention_dim: int = 1280, + attention_head_dim: Union[int, Tuple[int]] = 8, + dual_cross_attention: bool = False, + use_linear_projection: bool = False, + class_embed_type: Optional[str] = None, + num_class_embeds: Optional[int] = None, + upcast_attention: bool = False, + resnet_time_scale_shift: str = "default", + time_embedding_type: str = "positional", + timestep_post_act: Optional[str] = None, + time_cond_proj_dim: Optional[int] = None, + conv_in_kernel: int = 3, + 
projection_class_embeddings_input_dim: Optional[int] = None, + controlnet_conditioning_embedding_type: str = "default", + controlnet_conditioning_channels: int = 3, + ): + super().__init__() + + self.sample_size = sample_size + + # Check inputs + if len(block_out_channels) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}." + ) + + if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}." + ) + + if not isinstance(attention_head_dim, int) and len(attention_head_dim) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`: {attention_head_dim}. `down_block_types`: {down_block_types}." + ) + + # input + conv_in_padding = (conv_in_kernel - 1) // 2 + self.conv_in = nn.Conv2d( + in_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding + ) + + # time + if time_embedding_type == "fourier": + time_embed_dim = block_out_channels[0] * 2 + if time_embed_dim % 2 != 0: + raise ValueError(f"`time_embed_dim` should be divisible by 2, but is {time_embed_dim}.") + self.time_proj = GaussianFourierProjection( + time_embed_dim // 2, set_W_to_weight=False, log=False, flip_sin_to_cos=flip_sin_to_cos + ) + timestep_input_dim = time_embed_dim + elif time_embedding_type == "positional": + time_embed_dim = block_out_channels[0] * 4 + + self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift) + timestep_input_dim = block_out_channels[0] + else: + raise ValueError( + f"{time_embedding_type} does not exist. Please make sure to use one of `fourier` or `positional`." + ) + + self.time_embedding = TimestepEmbedding( + timestep_input_dim, + time_embed_dim, + act_fn=act_fn, + post_act_fn=timestep_post_act, + cond_proj_dim=time_cond_proj_dim, + ) + + # class embedding + if class_embed_type is None and num_class_embeds is not None: + self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim) + elif class_embed_type == "timestep": + self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim) + elif class_embed_type == "identity": + self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim) + elif class_embed_type == "projection": + if projection_class_embeddings_input_dim is None: + raise ValueError( + "`class_embed_type`: 'projection' requires `projection_class_embeddings_input_dim` be set" + ) + # The projection `class_embed_type` is the same as the timestep `class_embed_type` except + # 1. the `class_labels` inputs are not first converted to sinusoidal embeddings + # 2. it projects from an arbitrary input dimension. + # + # Note that `TimestepEmbedding` is quite general, being mainly linear layers and activations. + # When used for embedding actual timesteps, the timesteps are first converted to sinusoidal embeddings. + # As a result, `TimestepEmbedding` can be passed arbitrary vectors. 
+ self.class_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim) + else: + self.class_embedding = None + + # control net conditioning embedding + if controlnet_conditioning_embedding_type == "default": + self.controlnet_cond_embedding = ControlNetConditioningDefaultEmbedding( + conditioning_channels=controlnet_conditioning_channels, + conditioning_embedding_channels=block_out_channels[0], + ) + else: + raise ValueError( + f"unknown `controlnet_conditioning_embedding_type`: {controlnet_conditioning_embedding_type}. Options are 'default'" + ) + + self.down_blocks = nn.ModuleList([]) + self.controlnet_down_blocks = nn.ModuleList([]) + + if isinstance(only_cross_attention, bool): + only_cross_attention = [only_cross_attention] * len(down_block_types) + + if isinstance(attention_head_dim, int): + attention_head_dim = (attention_head_dim,) * len(down_block_types) + + # down + output_channel = block_out_channels[0] + + controlnet_block = nn.Conv2d(output_channel, output_channel, kernel_size=1) + controlnet_block = zero_module(controlnet_block) + self.controlnet_down_blocks.append(controlnet_block) + + for i, down_block_type in enumerate(down_block_types): + input_channel = output_channel + output_channel = block_out_channels[i] + is_final_block = i == len(block_out_channels) - 1 + + down_block = get_down_block( + down_block_type, + num_layers=layers_per_block, + in_channels=input_channel, + out_channels=output_channel, + temb_channels=time_embed_dim, + add_downsample=not is_final_block, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + resnet_groups=norm_num_groups, + cross_attention_dim=cross_attention_dim, + attn_num_head_channels=attention_head_dim[i], + downsample_padding=downsample_padding, + dual_cross_attention=dual_cross_attention, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention[i], + upcast_attention=upcast_attention, + resnet_time_scale_shift=resnet_time_scale_shift, + ) + self.down_blocks.append(down_block) + + for _ in range(layers_per_block): + controlnet_block = nn.Conv2d(output_channel, output_channel, kernel_size=1) + controlnet_block = zero_module(controlnet_block) + self.controlnet_down_blocks.append(controlnet_block) + + if not is_final_block: + controlnet_block = nn.Conv2d(output_channel, output_channel, kernel_size=1) + controlnet_block = zero_module(controlnet_block) + self.controlnet_down_blocks.append(controlnet_block) + + # mid + mid_block_channel = block_out_channels[-1] + + controlnet_block = nn.Conv2d(mid_block_channel, mid_block_channel, kernel_size=1) + controlnet_block = zero_module(controlnet_block) + self.controlnet_mid_block = controlnet_block + + if mid_block_type == "UNetMidBlock2DCrossAttn": + self.mid_block = UNetMidBlock2DCrossAttn( + in_channels=mid_block_channel, + temb_channels=time_embed_dim, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + output_scale_factor=mid_block_scale_factor, + resnet_time_scale_shift=resnet_time_scale_shift, + cross_attention_dim=cross_attention_dim, + attn_num_head_channels=attention_head_dim[-1], + resnet_groups=norm_num_groups, + dual_cross_attention=dual_cross_attention, + use_linear_projection=use_linear_projection, + upcast_attention=upcast_attention, + ) + elif mid_block_type == "UNetMidBlock2DSimpleCrossAttn": + self.mid_block = UNetMidBlock2DSimpleCrossAttn( + in_channels=block_out_channels[-1], + temb_channels=time_embed_dim, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + output_scale_factor=mid_block_scale_factor, + 
cross_attention_dim=cross_attention_dim, + attn_num_head_channels=attention_head_dim[-1], + resnet_groups=norm_num_groups, + resnet_time_scale_shift=resnet_time_scale_shift, + ) + elif mid_block_type is None: + self.mid_block = None + else: + raise ValueError(f"unknown mid_block_type : {mid_block_type}") + + @property + # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.attn_processors + def attn_processors(self) -> Dict[str, AttnProcessor]: + r""" + Returns: + `dict` of attention processors: A dictionary containing all attention processors used in the model with + indexed by its weight name. + """ + # set recursively + processors = {} + + def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttnProcessor]): + if hasattr(module, "set_processor"): + processors[f"{name}.processor"] = module.processor + + for sub_name, child in module.named_children(): + fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) + + return processors + + for name, module in self.named_children(): + fn_recursive_add_processors(name, module, processors) + + return processors + + # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor + def set_attn_processor(self, processor: Union[AttnProcessor, Dict[str, AttnProcessor]]): + r""" + Parameters: + `processor (`dict` of `AttnProcessor` or `AttnProcessor`): + The instantiated processor class or a dictionary of processor classes that will be set as the processor + of **all** `CrossAttention` layers. + In case `processor` is a dict, the key needs to define the path to the corresponding cross attention processor. This is strongly recommended when setting trainablae attention processors.: + + """ + count = len(self.attn_processors.keys()) + + if isinstance(processor, dict) and len(processor) != count: + raise ValueError( + f"A dict of processors was passed, but the number of processors {len(processor)} does not match the" + f" number of attention layers: {count}. Please make sure to pass {count} processor classes." + ) + + def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor): + if hasattr(module, "set_processor"): + if not isinstance(processor, dict): + module.set_processor(processor) + else: + module.set_processor(processor.pop(f"{name}.processor")) + + for sub_name, child in module.named_children(): + fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) + + for name, module in self.named_children(): + fn_recursive_attn_processor(name, module, processor) + + # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_attention_slice + def set_attention_slice(self, slice_size): + r""" + Enable sliced attention computation. + + When this option is enabled, the attention module will split the input tensor in slices, to compute attention + in several steps. This is useful to save some memory in exchange for a small speed decrease. + + Args: + slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`): + When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If + `"max"`, maxium amount of memory will be saved by running only one slice at a time. If a number is + provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim` + must be a multiple of `slice_size`. 
+ """ + sliceable_head_dims = [] + + def fn_recursive_retrieve_slicable_dims(module: torch.nn.Module): + if hasattr(module, "set_attention_slice"): + sliceable_head_dims.append(module.sliceable_head_dim) + + for child in module.children(): + fn_recursive_retrieve_slicable_dims(child) + + # retrieve number of attention layers + for module in self.children(): + fn_recursive_retrieve_slicable_dims(module) + + num_slicable_layers = len(sliceable_head_dims) + + if slice_size == "auto": + # half the attention head size is usually a good trade-off between + # speed and memory + slice_size = [dim // 2 for dim in sliceable_head_dims] + elif slice_size == "max": + # make smallest slice possible + slice_size = num_slicable_layers * [1] + + slice_size = num_slicable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size + + if len(slice_size) != len(sliceable_head_dims): + raise ValueError( + f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different" + f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}." + ) + + for i in range(len(slice_size)): + size = slice_size[i] + dim = sliceable_head_dims[i] + if size is not None and size > dim: + raise ValueError(f"size {size} has to be smaller or equal to {dim}.") + + # Recursively walk through all the children. + # Any children which exposes the set_attention_slice method + # gets the message + def fn_recursive_set_attention_slice(module: torch.nn.Module, slice_size: List[int]): + if hasattr(module, "set_attention_slice"): + module.set_attention_slice(slice_size.pop()) + + for child in module.children(): + fn_recursive_set_attention_slice(child, slice_size) + + reversed_slice_size = list(reversed(slice_size)) + for module in self.children(): + fn_recursive_set_attention_slice(module, reversed_slice_size) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, (CrossAttnDownBlock2D, DownBlock2D)): + module.gradient_checkpointing = value + + def forward( + self, + sample: torch.FloatTensor, + timestep: Union[torch.Tensor, float, int], + encoder_hidden_states: torch.Tensor, + controlnet_cond: torch.FloatTensor, + class_labels: Optional[torch.Tensor] = None, + timestep_cond: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + return_dict: bool = True, + ) -> Union[ControlNetOutput, Tuple]: + # prepare attention_mask + if attention_mask is not None: + attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0 + attention_mask = attention_mask.unsqueeze(1) + + # 0. center input if necessary + if self.config.center_input_sample: + sample = 2 * sample - 1.0 + + # 1. time + timesteps = timestep + if not torch.is_tensor(timesteps): + # TODO: this requires sync between CPU and GPU. 
So try to pass timesteps as tensors if you can + # This would be a good case for the `match` statement (Python 3.10+) + is_mps = sample.device.type == "mps" + if isinstance(timestep, float): + dtype = torch.float32 if is_mps else torch.float64 + else: + dtype = torch.int32 if is_mps else torch.int64 + timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device) + elif len(timesteps.shape) == 0: + timesteps = timesteps[None].to(sample.device) + + # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + timesteps = timesteps.expand(sample.shape[0]) + + t_emb = self.time_proj(timesteps) + + # timesteps does not contain any weights and will always return f32 tensors + # but time_embedding might actually be running in fp16. so we need to cast here. + # there might be better ways to encapsulate this. + t_emb = t_emb.to(dtype=self.dtype) + + emb = self.time_embedding(t_emb, timestep_cond) + + if self.class_embedding is not None: + if class_labels is None: + raise ValueError("class_labels should be provided when num_class_embeds > 0") + + if self.config.class_embed_type == "timestep": + class_labels = self.time_proj(class_labels) + + class_emb = self.class_embedding(class_labels).to(dtype=self.dtype) + emb = emb + class_emb + + # 2. pre-process + sample = self.conv_in(sample) + + controlnet_cond = self.controlnet_cond_embedding(controlnet_cond) + + sample += controlnet_cond + + # 3. down + down_block_res_samples = (sample,) + for downsample_block in self.down_blocks: + if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention: + sample, res_samples = downsample_block( + hidden_states=sample, + temb=emb, + encoder_hidden_states=encoder_hidden_states, + attention_mask=attention_mask, + cross_attention_kwargs=cross_attention_kwargs, + ) + else: + sample, res_samples = downsample_block(hidden_states=sample, temb=emb) + + down_block_res_samples += res_samples + + # 4. mid + if self.mid_block is not None: + sample = self.mid_block( + sample, + emb, + encoder_hidden_states=encoder_hidden_states, + attention_mask=attention_mask, + cross_attention_kwargs=cross_attention_kwargs, + ) + + # 5. Control net blocks + + controlnet_down_block_res_samples = () + + for down_block_res_sample, controlnet_block in zip(down_block_res_samples, self.controlnet_down_blocks): + down_block_res_sample = controlnet_block(down_block_res_sample) + controlnet_down_block_res_samples += (down_block_res_sample,) + + down_block_res_samples = controlnet_down_block_res_samples + + mid_block_res_sample = self.controlnet_mid_block(sample) + + if not return_dict: + return (down_block_res_samples, mid_block_res_sample) + + return ControlNetOutput( + down_block_res_samples=down_block_res_samples, mid_block_res_sample=mid_block_res_sample + ) + + +def zero_module(module): + for p in module.parameters(): + nn.init.zeros_(p) + return module diff --git a/src/diffusers/models/controlnet_blocks.py b/src/diffusers/models/controlnet_blocks.py deleted file mode 100644 index 56f6a32e3a17..000000000000 --- a/src/diffusers/models/controlnet_blocks.py +++ /dev/null @@ -1,105 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -r""" -ControlNet Block Implementation - -Paper: Adding Conditional Control to Text-to-Image Diffusion Models. https://arxiv.org/abs/2302.05543 - -Reference implementation: https://github.com/lllyasviel/ControlNet -""" - -from typing import List, Tuple - -import torch -import torch.nn as nn - - -def set_zero_parameters(module): - for p in module.parameters(): - p.detach().zero_() - return module - - -# ControlNet: Zero Convolution -def zero_conv(channels): - return set_zero_parameters(nn.Conv2d(channels, channels, 1, padding=0)) - - -class ControlNetInputHintBlock(nn.Module): - def __init__(self, hint_channels: int = 3, channels: int = 320): - super().__init__() - # Layer configurations are from reference implementation. - # - # Note: The sequence of convolution operations reduces the width and height by 1/8. - # This assumes that the vae_scale_factor for SD is 8. - # TODO: support vae_scale_factor != 8 situation - self.input_hint_block = nn.Sequential( - nn.Conv2d(hint_channels, 16, 3, padding=1), - nn.SiLU(), - nn.Conv2d(16, 16, 3, padding=1), - nn.SiLU(), - nn.Conv2d(16, 32, 3, padding=1, stride=2), - nn.SiLU(), - nn.Conv2d(32, 32, 3, padding=1), - nn.SiLU(), - nn.Conv2d(32, 96, 3, padding=1, stride=2), - nn.SiLU(), - nn.Conv2d(96, 96, 3, padding=1), - nn.SiLU(), - nn.Conv2d(96, 256, 3, padding=1, stride=2), - nn.SiLU(), - set_zero_parameters(nn.Conv2d(256, channels, 3, padding=1)), - ) - - def forward(self, hint: torch.Tensor): - return self.input_hint_block(hint) - - -class ControlNetZeroConvBlock(nn.Module): - def __init__( - self, - block_out_channels: Tuple[int] = (320, 640, 1280, 1280), - down_block_types: Tuple[str] = ( - "CrossAttnDownBlock2D", - "CrossAttnDownBlock2D", - "CrossAttnDownBlock2D", - "DownBlock2D", - ), - layers_per_block: int = 2, - ): - super().__init__() - self.input_zero_conv = zero_conv(block_out_channels[0]) - zero_convs = [] - for i, down_block_type in enumerate(down_block_types): - output_channel = block_out_channels[i] - is_final_block = i == len(block_out_channels) - 1 - for _ in range(layers_per_block): - zero_convs.append(zero_conv(output_channel)) - if not is_final_block: - zero_convs.append(zero_conv(output_channel)) - self.zero_convs = nn.ModuleList(zero_convs) - self.mid_zero_conv = zero_conv(block_out_channels[-1]) - - def forward( - self, - down_block_res_samples: List[torch.Tensor], - mid_block_sample: torch.Tensor, - ) -> List[torch.Tensor]: - outputs = [] - outputs.append(self.input_zero_conv(down_block_res_samples[0])) - for res_sample, zero_conv in zip(down_block_res_samples[1:], self.zero_convs): - outputs.append(zero_conv(res_sample)) - outputs.append(self.mid_zero_conv(mid_block_sample)) - return outputs diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py index c3d936184584..a5f997e9b400 100644 --- a/src/diffusers/models/unet_2d_condition.py +++ b/src/diffusers/models/unet_2d_condition.py @@ -21,7 +21,6 @@ from ..configuration_utils import ConfigMixin, register_to_config from ..loaders import UNet2DConditionLoadersMixin from ..utils import BaseOutput, logging -from 
.controlnet_blocks import ControlNetInputHintBlock, ControlNetZeroConvBlock from .cross_attention import AttnProcessor from .embeddings import GaussianFourierProjection, TimestepEmbedding, Timesteps from .modeling_utils import ModelMixin @@ -106,9 +105,6 @@ class conditioning with `class_embed_type` equal to `None`. conv_out_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_out` layer. projection_class_embeddings_input_dim (`int`, *optional*): The dimension of the `class_labels` input when using the "projection" `class_embed_type`. Required when using the "projection" `class_embed_type`. - controlnet_hint_channels (`int`, *optional*, default to `None`): - The number of channels in the `controlnet_hint`. If this value is not None, this unet model behaves as - ControlNet. """ _supports_gradient_checkpointing = True @@ -146,20 +142,19 @@ def __init__( num_class_embeds: Optional[int] = None, upcast_attention: bool = False, resnet_time_scale_shift: str = "default", - time_embedding_type: str = "positional", # fourier, positional + time_embedding_type: str = "positional", timestep_post_act: Optional[str] = None, time_cond_proj_dim: Optional[int] = None, conv_in_kernel: int = 3, conv_out_kernel: int = 3, projection_class_embeddings_input_dim: Optional[int] = None, - controlnet_hint_channels: Optional[int] = None, ): super().__init__() self.sample_size = sample_size # Check inputs - if controlnet_hint_channels is None and len(down_block_types) != len(up_block_types): + if len(down_block_types) != len(up_block_types): raise ValueError( f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}." ) @@ -308,18 +303,6 @@ def __init__( # count how many layers upsample the images self.num_upsamplers = 0 - if controlnet_hint_channels is not None: - # ControlNet: add input_hint_block, zero_conv_block - self.controlnet_input_hint_block = ControlNetInputHintBlock( - hint_channels=controlnet_hint_channels, channels=block_out_channels[0] - ) - self.controlnet_zero_conv_block = ControlNetZeroConvBlock( - block_out_channels=block_out_channels, - down_block_types=down_block_types, - layers_per_block=layers_per_block, - ) - return # Modules from the following lines are not defined in ControlNet - # up reversed_block_out_channels = list(reversed(block_out_channels)) reversed_attention_head_dim = list(reversed(attention_head_dim)) @@ -509,10 +492,10 @@ def forward( timestep_cond: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - controlnet_hint: Optional[torch.FloatTensor] = None, - control: Optional[List[torch.FloatTensor]] = None, + down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None, + mid_block_additional_residual: Optional[torch.Tensor] = None, return_dict: bool = True, - ) -> Union[UNet2DConditionOutput, Tuple, List[torch.FloatTensor]]: + ) -> Union[UNet2DConditionOutput, Tuple]: r""" Args: sample (`torch.FloatTensor`): (batch, channel, height, width) noisy inputs tensor @@ -524,17 +507,11 @@ def forward( A kwargs dictionary that if specified is passed along to the `AttnProcessor` as defined under `self.processor` in [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). - controlnet_hint (`torch.FloatTensor`, *optional*, defaults to `None`): - ControlNet input embedding. 
If `controlnet_hint_channel` of `__init__()` is not None, it must be - specified as a tensors. - control (`List[torch.FloatTensor]`, *optional*, defaults to `None`): - If `control` is not None, this unet model behaves as ControlledUnet. ControlledUnet is controlled by - this list of tensors. + Returns: - [`~models.unet_2d_condition.UNet2DConditionOutput`] , `tuple` or [torch.FloatTensor]: + [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`: [`~models.unet_2d_condition.UNet2DConditionOutput`] if `return_dict` is True, otherwise a `tuple`. When - returning a tuple, the first element is the sample tensor. If `controlnet_hint` is not None, the ControlNet - result of the processing is output as a list of tensors. + returning a tuple, the first element is the sample tensor. """ # By default samples have to be AT least a multiple of the overall upsampling factor. # The overall upsampling factor is equal to 2 ** (# num of upsampling layears). @@ -597,8 +574,6 @@ def forward( # 2. pre-process sample = self.conv_in(sample) - if controlnet_hint is not None: - sample += self.controlnet_input_hint_block(controlnet_hint) # 3. down down_block_res_samples = (sample,) @@ -616,6 +591,17 @@ def forward( down_block_res_samples += res_samples + if down_block_additional_residuals is not None: + new_down_block_res_samples = () + + for down_block_res_sample, down_block_additional_residual in zip( + down_block_res_samples, down_block_additional_residuals + ): + down_block_res_sample += down_block_additional_residual + new_down_block_res_samples += (down_block_res_sample,) + + down_block_res_samples = new_down_block_res_samples + # 4. mid if self.mid_block is not None: sample = self.mid_block( @@ -626,15 +612,8 @@ def forward( cross_attention_kwargs=cross_attention_kwargs, ) - if controlnet_hint is not None: - # ControlNet: zero convs - return self.controlnet_zero_conv_block( - down_block_res_samples=down_block_res_samples, mid_block_sample=sample - ) - - if control is not None: - # ControlledUnet: apply mid_zero_conv output - sample += control.pop() + if mid_block_additional_residual is not None: + sample += mid_block_additional_residual # 5. up for i, upsample_block in enumerate(self.up_blocks): @@ -643,12 +622,6 @@ def forward( res_samples = down_block_res_samples[-len(upsample_block.resnets) :] down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)] - if control is not None: - # ControlledUnet: apply ControlNet downblock zero_convs output - control_samples = control[-len(upsample_block.resnets) :] - control = control[: -len(upsample_block.resnets)] - res_samples = [r + c for r, c in zip(res_samples, control_samples)] - # if we have not reached the final block and need to forward the # upsample size, we do it here if not is_final_block and forward_upsample_size: @@ -669,10 +642,6 @@ def forward( hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples, upsample_size=upsample_size ) - # TODO: remove this block - if control is not None: - assert len(control) == 0, f"must consume all control array ({len(control)})" - # 6. 
post-process if self.conv_norm_out: sample = self.conv_norm_out(sample) diff --git a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py index 51f6530f5bae..b3e5d3278594 100644 --- a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +++ b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py @@ -34,6 +34,7 @@ from diffusers import ( AutoencoderKL, + ControlNetModel, DDIMScheduler, DDPMScheduler, DPMSolverMultistepScheduler, @@ -225,11 +226,15 @@ def conv_attn_to_linear(checkpoint): checkpoint[key] = checkpoint[key][:, :, 0] -def create_unet_diffusers_config(original_config, image_size: int): +def create_unet_diffusers_config(original_config, image_size: int, controlnet=False): """ Creates a config for the diffusers based on the config of the LDM model. """ - unet_params = original_config.model.params.unet_config.params + if controlnet: + unet_params = original_config.model.params.control_stage_config.params + else: + unet_params = original_config.model.params.unet_config.params + vae_params = original_config.model.params.first_stage_config.params.ddconfig block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult] @@ -273,9 +278,7 @@ def create_unet_diffusers_config(original_config, image_size: int): config = dict( sample_size=image_size // vae_scale_factor, in_channels=unet_params.in_channels, - out_channels=unet_params.out_channels, down_block_types=tuple(down_block_types), - up_block_types=tuple(up_block_types), block_out_channels=tuple(block_out_channels), layers_per_block=unet_params.num_res_blocks, cross_attention_dim=unet_params.context_dim, @@ -284,54 +287,10 @@ def create_unet_diffusers_config(original_config, image_size: int): class_embed_type=class_embed_type, projection_class_embeddings_input_dim=projection_class_embeddings_input_dim, ) - return config - - -def create_controlnet_diffusers_config(original_config, image_size: int): - """ - Creates a config for the diffusers based on the config of the LDM model. 
- """ - controlnet_params = original_config.model.params.control_stage_config.params - vae_params = original_config.model.params.first_stage_config.params.ddconfig - - block_out_channels = [controlnet_params.model_channels * mult for mult in controlnet_params.channel_mult] - down_block_types = [] - resolution = 1 - for i in range(len(block_out_channels)): - block_type = "CrossAttnDownBlock2D" if resolution in controlnet_params.attention_resolutions else "DownBlock2D" - down_block_types.append(block_type) - if i != len(block_out_channels) - 1: - resolution *= 2 - - up_block_types = [] - for i in range(len(block_out_channels)): - block_type = "CrossAttnUpBlock2D" if resolution in controlnet_params.attention_resolutions else "UpBlock2D" - up_block_types.append(block_type) - resolution //= 2 - - vae_scale_factor = 2 ** (len(vae_params.ch_mult) - 1) - - head_dim = controlnet_params.num_heads if "num_heads" in controlnet_params else None - use_linear_projection = ( - controlnet_params.use_linear_in_transformer if "use_linear_in_transformer" in controlnet_params else False - ) - if use_linear_projection: - # stable diffusion 2-base-512 and 2-768 - if head_dim is None: - head_dim = [5, 10, 20, 20] - - config = dict( - sample_size=image_size // vae_scale_factor, - in_channels=controlnet_params.in_channels, - down_block_types=tuple(down_block_types), - block_out_channels=tuple(block_out_channels), - layers_per_block=controlnet_params.num_res_blocks, - cross_attention_dim=controlnet_params.context_dim, - attention_head_dim=head_dim, - use_linear_projection=use_linear_projection, - controlnet_hint_channels=controlnet_params.hint_channels, - ) + if not controlnet: + config["out_channels"] = unet_params.out_channels + config["up_block_types"] = tuple(up_block_types) return config @@ -380,7 +339,7 @@ def create_ldm_bert_config(original_config): return config -def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False): +def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False, controlnet=False): """ Takes a state dict and a config, and returns a converted checkpoint. """ @@ -389,7 +348,11 @@ def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False unet_state_dict = {} keys = list(checkpoint.keys()) - unet_key = "model.diffusion_model." + if controlnet: + unet_key = "control_model." + else: + unet_key = "model.diffusion_model." 
+ # at least a 100 parameters have to start with `model_ema` in order for the checkpoint to be EMA if sum(k.startswith("model_ema") for k in keys) > 100 and extract_ema: print(f"Checkpoint {path} has both EMA and non-EMA weights.") @@ -433,10 +396,11 @@ def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"] new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"] - new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"] - new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"] - new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"] - new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"] + if not controlnet: + new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"] + new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"] + new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"] + new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"] # Retrieves the keys for the input blocks only num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer}) @@ -561,117 +525,25 @@ def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False new_checkpoint[new_path] = unet_state_dict[old_path] - return new_checkpoint - - -def convert_controlnet_checkpoint(checkpoint, config): - """ - Takes a state dict and a config, and returns a converted checkpoint. - """ - - # extract state_dict for UNet - unet_state_dict = {} - keys = list(checkpoint.keys()) - - unet_key = "control_model." - - for key in keys: - if key.startswith(unet_key): - unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(key) - - new_checkpoint = {} - - new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"] - new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"] - new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"] - new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"] - - new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"] - new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"] - - # Retrieves the keys for the input blocks only - num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer}) - input_blocks = { - layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key] - for layer_id in range(num_input_blocks) - } - - # Retrieves the keys for the middle blocks only - num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer}) - middle_blocks = { - layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key] - for layer_id in range(num_middle_blocks) - } - - for i in range(1, num_input_blocks): - block_id = (i - 1) // (config["layers_per_block"] + 1) - layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1) - - resnets = [ - key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key - ] - attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key] - - if f"input_blocks.{i}.0.op.weight" in unet_state_dict: - new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop( - f"input_blocks.{i}.0.op.weight" + if controlnet: + # 
conditioning embedding + # NOTE: `8` is hardcoded based off of the number of blocks in `ControlNetConditioningDefaultEmbedding` + for i in range(8): + new_checkpoint[f"controlnet_cond_embedding.conditioning_embedder.{i*2}.weight"] = unet_state_dict.pop( + f"input_hint_block.{i*2}.weight" ) - new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop( - f"input_blocks.{i}.0.op.bias" + new_checkpoint[f"controlnet_cond_embedding.conditioning_embedder.{i*2}.bias"] = unet_state_dict.pop( + f"input_hint_block.{i*2}.bias" ) - paths = renew_resnet_paths(resnets) - meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"} - assign_to_checkpoint( - paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config - ) - - if len(attentions): - paths = renew_attention_paths(attentions) - meta_path = {"old": f"input_blocks.{i}.1", "new": f"down_blocks.{block_id}.attentions.{layer_in_block_id}"} - assign_to_checkpoint( - paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config - ) + # down blocks + for i in range(num_input_blocks): + new_checkpoint[f"controlnet_down_blocks.{i}.weight"] = unet_state_dict.pop(f"zero_convs.{i}.0.weight") + new_checkpoint[f"controlnet_down_blocks.{i}.bias"] = unet_state_dict.pop(f"zero_convs.{i}.0.bias") - resnet_0 = middle_blocks[0] - attentions = middle_blocks[1] - resnet_1 = middle_blocks[2] - - resnet_0_paths = renew_resnet_paths(resnet_0) - assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config) - - resnet_1_paths = renew_resnet_paths(resnet_1) - assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config) - - attentions_paths = renew_attention_paths(attentions) - meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"} - assign_to_checkpoint( - attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config - ) - - # ControlNet Specific Weights & Biases - - # input_hint_block - for i in range(8): - key_dst = f"controlnet_input_hint_block.input_hint_block.{i*2}." - key_src = f"input_hint_block.{i*2}." - new_checkpoint[key_dst + "weight"] = unet_state_dict.pop(key_src + "weight") - new_checkpoint[key_dst + "bias"] = unet_state_dict.pop(key_src + "bias") - - # zero_convs - new_checkpoint["controlnet_zero_conv_block.input_zero_conv.weight"] = unet_state_dict.pop("zero_convs.0.0.weight") - new_checkpoint["controlnet_zero_conv_block.input_zero_conv.bias"] = unet_state_dict.pop("zero_convs.0.0.bias") - for i in range(1, num_input_blocks): - key_dst = f"controlnet_zero_conv_block.zero_convs.{i-1}." - key_src = f"zero_convs.{i}.0." - new_checkpoint[key_dst + "weight"] = unet_state_dict.pop(key_src + "weight") - new_checkpoint[key_dst + "bias"] = unet_state_dict.pop(key_src + "bias") - - # mid_zero_conv - key_dst = "controlnet_zero_conv_block.mid_zero_conv." - key_src = "middle_block_out.0." 
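For reference, the ControlNet-specific key renames performed in this hunk (the conditioning embedder, the per-block zero convs, and the middle-block zero conv handled just below), collected into one standalone sketch. It uses the parameter names this patch introduces; `num_input_blocks` is assumed to be computed as above, and `8` mirrors the hardcoded number of conditioning-embedder convolutions:

    def controlnet_key_renames(num_input_blocks, num_hint_convs=8):
        renames = {}
        # conditioning embedding: input_hint_block.{2i} -> controlnet_cond_embedding.conditioning_embedder.{2i}
        for i in range(num_hint_convs):
            for param in ("weight", "bias"):
                renames[f"input_hint_block.{i * 2}.{param}"] = (
                    f"controlnet_cond_embedding.conditioning_embedder.{i * 2}.{param}"
                )
        # zero convs: zero_convs.{i}.0 -> controlnet_down_blocks.{i}
        for i in range(num_input_blocks):
            for param in ("weight", "bias"):
                renames[f"zero_convs.{i}.0.{param}"] = f"controlnet_down_blocks.{i}.{param}"
        # middle block zero conv: middle_block_out.0 -> controlnet_mid_block
        for param in ("weight", "bias"):
            renames[f"middle_block_out.0.{param}"] = f"controlnet_mid_block.{param}"
        return renames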
- new_checkpoint[key_dst + "weight"] = unet_state_dict.pop(key_src + "weight") - new_checkpoint[key_dst + "bias"] = unet_state_dict.pop(key_src + "bias") + # mid block + new_checkpoint["controlnet_mid_block.weight"] = unet_state_dict.pop("middle_block_out.0.weight") + new_checkpoint["controlnet_mid_block.bias"] = unet_state_dict.pop("middle_block_out.0.bias") return new_checkpoint @@ -1073,6 +945,7 @@ def load_pipeline_from_original_stable_diffusion_ckpt( stable_unclip: Optional[str] = None, stable_unclip_prior: Optional[str] = None, clip_stats_path: Optional[str] = None, + controlnet: Optional[bool] = None, ) -> StableDiffusionPipeline: """ Load a Stable Diffusion pipeline object from a CompVis-style `.ckpt`/`.safetensors` file and (ideally) a `.yaml` @@ -1254,6 +1127,12 @@ def load_pipeline_from_original_stable_diffusion_ckpt( model_type = original_config.model.params.cond_stage_config.target.split(".")[-1] logger.debug(f"no `model_type` given, `model_type` inferred as: {model_type}") + if controlnet is None: + controlnet = "control_stage_config" in original_config.model.params + + if controlnet and model_type != "FrozenCLIPEmbedder": + raise ValueError("`controlnet`=True only supports `model_type`='FrozenCLIPEmbedder'") + if model_type == "FrozenOpenCLIPEmbedder": text_model = convert_open_clip_checkpoint(checkpoint) tokenizer = CLIPTokenizer.from_pretrained("stabilityai/stable-diffusion-2", subfolder="tokenizer") @@ -1341,243 +1220,43 @@ def load_pipeline_from_original_stable_diffusion_ckpt( tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14") safety_checker = StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker") feature_extractor = AutoFeatureExtractor.from_pretrained("CompVis/stable-diffusion-safety-checker") - pipe = StableDiffusionPipeline( - vae=vae, - text_encoder=text_model, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - safety_checker=safety_checker, - feature_extractor=feature_extractor, - ) - else: - text_config = create_ldm_bert_config(original_config) - text_model = convert_ldm_bert_checkpoint(checkpoint, text_config) - tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased") - pipe = LDMTextToImagePipeline(vqvae=vae, bert=text_model, tokenizer=tokenizer, unet=unet, scheduler=scheduler) - - return pipe - - -def load_pipeline_from_control_net_ckpt( - checkpoint_path: str, - original_config_file: str = None, - image_size: int = 512, - prediction_type: str = None, - model_type: str = None, - extract_ema: bool = False, - scheduler_type: str = "ddim", - num_in_channels: Optional[int] = None, - upcast_attention: Optional[bool] = None, - device: str = None, - from_safetensors: bool = False, -) -> StableDiffusionControlNetPipeline: - """ - Load a Stable Diffusion pipeline object from a ControlNet `.ckpt`/`.safetensors` file and (ideally) a `.yaml` - config file. - - Although many of the arguments can be automatically inferred, some of these rely on brittle checks against the - global step count, which will likely fail for models that have undergone further fine-tuning. Therefore, it is - recommended that you override the default values and/or supply an `original_config_file` wherever possible. - - Args: - checkpoint_path (`str`): Path to `.ckpt` file. - original_config_file (`str`): - Path to `.yaml` config file corresponding to the original architecture. If `None`, will be automatically - inferred by looking for a key that only exists in SD2.0 models. 
- image_size (`int`, *optional*, defaults to 512): - The image size that the model was trained on. Use 512 for Stable Diffusion v1.X and Stable Diffusion v2 - Base. Use 768 for Stable Diffusion v2. - prediction_type (`str`, *optional*): - The prediction type that the model was trained on. Use `'epsilon'` for Stable Diffusion v1.X and Stable - Diffusion v2 Base. Use `'v_prediction'` for Stable Diffusion v2. - num_in_channels (`int`, *optional*, defaults to None): - The number of input channels. If `None`, it will be automatically inferred. - scheduler_type (`str`, *optional*, defaults to 'pndm'): - Type of scheduler to use. Should be one of `["pndm", "lms", "heun", "euler", "euler-ancestral", "dpm", - "ddim"]`. - model_type (`str`, *optional*, defaults to `None`): - The pipeline type. `None` to automatically infer, or one of `["FrozenOpenCLIPEmbedder", - "FrozenCLIPEmbedder", "PaintByExample"]`. - extract_ema (`bool`, *optional*, defaults to `False`): Only relevant for - checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights or not. Defaults to - `False`. Pass `True` to extract the EMA weights. EMA weights usually yield higher quality images for - inference. Non-EMA weights are usually better to continue fine-tuning. - upcast_attention (`bool`, *optional*, defaults to `None`): - Whether the attention computation should always be upcasted. This is necessary when running stable - diffusion 2.1. - device (`str`, *optional*, defaults to `None`): - The device to use. Pass `None` to determine automatically. :param from_safetensors: If `checkpoint_path` is - in `safetensors` format, load checkpoint with safetensors instead of PyTorch. :return: A - StableDiffusionPipeline object representing the passed-in `.ckpt`/`.safetensors` file. - """ - if prediction_type == "v-prediction": - prediction_type = "v_prediction" - if not is_omegaconf_available(): - raise ValueError(BACKENDS_MAPPING["omegaconf"][1]) + if controlnet: + # Convert the ControlNetModel model. 
+ ctrlnet_config = create_unet_diffusers_config(original_config, image_size=image_size, controlnet=True) + ctrlnet_config["upcast_attention"] = upcast_attention - from omegaconf import OmegaConf - - if from_safetensors: - if not is_safetensors_available(): - raise ValueError(BACKENDS_MAPPING["safetensors"][1]) + controlnet_model = ControlNetModel(**ctrlnet_config) - from safetensors import safe_open + converted_ctrl_checkpoint = convert_ldm_unet_checkpoint( + checkpoint, ctrlnet_config, path=checkpoint_path, extract_ema=extract_ema, controlnet=True + ) + controlnet_model.load_state_dict(converted_ctrl_checkpoint) - checkpoint = {} - with safe_open(checkpoint_path, framework="pt", device="cpu") as f: - for key in f.keys(): - checkpoint[key] = f.get_tensor(key) - else: - if device is None: - device = "cuda" if torch.cuda.is_available() else "cpu" - checkpoint = torch.load(checkpoint_path, map_location=device) + pipe = StableDiffusionControlNetPipeline( + vae=vae, + text_encoder=text_model, + tokenizer=tokenizer, + unet=unet, + controlnet=controlnet_model, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + ) else: - checkpoint = torch.load(checkpoint_path, map_location=device) - - # Sometimes models don't have the global_step item - if "global_step" in checkpoint: - global_step = checkpoint["global_step"] - else: - print("global_step key not found in model") - global_step = None - - if "state_dict" in checkpoint: - checkpoint = checkpoint["state_dict"] - - with tempfile.TemporaryDirectory() as tmpdir: - if original_config_file is None: - key_name = "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.attn2.to_k.weight" - - original_config_file = os.path.join(tmpdir, "inference.yaml") - if key_name in checkpoint and checkpoint[key_name].shape[-1] == 1024: - raise NotImplementedError("Currently only support SD1.x models.") - # if not os.path.isfile("v2-inference-v.yaml"): - # # model_type = "v2" - # r = requests.get( - # " https://raw.githubusercontent.com/Stability-AI/stablediffusion/main/configs/stable-diffusion/v2-inference-v.yaml" - # ) - # open(original_config_file, "wb").write(r.content) - - # if global_step == 110000: - # # v2.1 needs to upcast attention - # upcast_attention = True - else: - if not os.path.isfile("cldm_v15.yaml"): - # model_type = "v1" - r = requests.get( - "https://raw.githubusercontent.com/lllyasviel/ControlNet/main/models/cldm_v15.yaml" - ) - open(original_config_file, "wb").write(r.content) - - original_config = OmegaConf.load(original_config_file) - - if num_in_channels is not None: - original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = num_in_channels - - if ( - "parameterization" in original_config["model"]["params"] - and original_config["model"]["params"]["parameterization"] == "v" - ): - if prediction_type is None: - # NOTE: For stable diffusion 2 base it is recommended to pass `prediction_type=="epsilon"` - # as it relies on a brittle global step parameter here - prediction_type = "epsilon" if global_step == 875000 else "v_prediction" - if image_size is None: - # NOTE: For stable diffusion 2 base one has to pass `image_size==512` - # as it relies on a brittle global step parameter here - image_size = 512 if global_step == 875000 else 768 - else: - if prediction_type is None: - prediction_type = "epsilon" - if image_size is None: - image_size = 512 - - num_train_timesteps = original_config.model.params.timesteps - beta_start = original_config.model.params.linear_start - beta_end = 
original_config.model.params.linear_end - - scheduler = DDIMScheduler( - beta_end=beta_end, - beta_schedule="scaled_linear", - beta_start=beta_start, - num_train_timesteps=num_train_timesteps, - steps_offset=1, - clip_sample=False, - set_alpha_to_one=False, - prediction_type=prediction_type, - ) - # make sure scheduler works correctly with DDIM - scheduler.register_to_config(clip_sample=False) - - if scheduler_type == "pndm": - config = dict(scheduler.config) - config["skip_prk_steps"] = True - scheduler = PNDMScheduler.from_config(config) - elif scheduler_type == "lms": - scheduler = LMSDiscreteScheduler.from_config(scheduler.config) - elif scheduler_type == "heun": - scheduler = HeunDiscreteScheduler.from_config(scheduler.config) - elif scheduler_type == "euler": - scheduler = EulerDiscreteScheduler.from_config(scheduler.config) - elif scheduler_type == "euler-ancestral": - scheduler = EulerAncestralDiscreteScheduler.from_config(scheduler.config) - elif scheduler_type == "dpm": - scheduler = DPMSolverMultistepScheduler.from_config(scheduler.config) - elif scheduler_type == "ddim": - scheduler = scheduler - else: - raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") - - # Convert the ControlledUNet2DConditionModel model. - unet_config = create_unet_diffusers_config(original_config, image_size=image_size) - unet_config["upcast_attention"] = upcast_attention - unet = UNet2DConditionModel(**unet_config) - - converted_unet_checkpoint = convert_ldm_unet_checkpoint( - checkpoint, unet_config, path=checkpoint_path, extract_ema=extract_ema - ) - - unet.load_state_dict(converted_unet_checkpoint) - - # Convert the ControlNetModel model. - ctrlnet_config = create_controlnet_diffusers_config(original_config, image_size=image_size) - ctrlnet_config["upcast_attention"] = upcast_attention - controlnet = UNet2DConditionModel(**ctrlnet_config) - - converted_ctrl_checkpoint = convert_controlnet_checkpoint(checkpoint, unet_config) - - controlnet.load_state_dict(converted_ctrl_checkpoint) - - # Convert the VAE model. - vae_config = create_vae_diffusers_config(original_config, image_size=image_size) - converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config) - - vae = AutoencoderKL(**vae_config) - vae.load_state_dict(converted_vae_checkpoint) - - # Convert the text model. 
- if model_type is None: - model_type = original_config.model.params.cond_stage_config.target.split(".")[-1] - logger.debug(f"no `model_type` given, `model_type` inferred as: {model_type}") - - if model_type == "FrozenCLIPEmbedder": - text_model = convert_ldm_clip_checkpoint(checkpoint) - tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14") - safety_checker = StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker") - feature_extractor = AutoFeatureExtractor.from_pretrained("CompVis/stable-diffusion-safety-checker") - pipe = StableDiffusionControlNetPipeline( - vae=vae, - text_encoder=text_model, - tokenizer=tokenizer, - unet=unet, - controlnet=controlnet, - scheduler=scheduler, - safety_checker=safety_checker, - feature_extractor=feature_extractor, - ) + pipe = StableDiffusionPipeline( + vae=vae, + text_encoder=text_model, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + ) else: - raise NotImplementedError("Currently supported only for FrozenCLIPEmbedder.") + text_config = create_ldm_bert_config(original_config) + text_model = convert_ldm_bert_checkpoint(checkpoint, text_config) + tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased") + pipe = LDMTextToImagePipeline(vqvae=vae, bert=text_model, tokenizer=tokenizer, unet=unet, scheduler=scheduler) return pipe diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py index 33fde9ad2038..8ff9315ee0c3 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py @@ -19,14 +19,12 @@ import numpy as np import PIL.Image import torch -from packaging import version from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer -from ...configuration_utils import FrozenDict -from ...models import AutoencoderKL, UNet2DConditionModel +from ...models import AutoencoderKL, ControlNetModel, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( - deprecate, + PIL_INTERPOLATION, is_accelerate_available, is_accelerate_version, logging, @@ -40,6 +38,24 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +def preprocess(image, width, height): + if isinstance(image, torch.Tensor): + return image + elif isinstance(image, PIL.Image.Image): + image = [image] + + if isinstance(image[0], PIL.Image.Image): + image = [np.array(i.resize((width, height), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image] + image = np.concatenate(image, axis=0) + image = np.array(image).astype(np.float32) / 255.0 + image = image.transpose(0, 3, 1, 2) + image = torch.from_numpy(image) + elif isinstance(image[0], torch.Tensor): + image = torch.cat(image, dim=0) + return image + + EXAMPLE_DOC_STRING = """ Examples: ```py @@ -74,8 +90,8 @@ class StableDiffusionControlNetPipeline(DiffusionPipeline): Tokenizer of class [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. - controlnet ([`UNet2DConditionModel`]): - [ControlNet](https://arxiv.org/abs/2302.05543) architecture to generate guidance. 
+ controlnet ([`ControlNetModel`]): + Provides additional conditioning to the unet during the denoising process scheduler ([`SchedulerMixin`]): A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. @@ -93,7 +109,7 @@ def __init__( text_encoder: CLIPTextModel, tokenizer: CLIPTokenizer, unet: UNet2DConditionModel, - controlnet: UNet2DConditionModel, + controlnet: ControlNetModel, scheduler: KarrasDiffusionSchedulers, safety_checker: StableDiffusionSafetyChecker, feature_extractor: CLIPFeatureExtractor, @@ -101,33 +117,6 @@ def __init__( ): super().__init__() - if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: - deprecation_message = ( - f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" - f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " - "to update the config accordingly as leaving `steps_offset` might led to incorrect results" - " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," - " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" - " file" - ) - deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) - new_config = dict(scheduler.config) - new_config["steps_offset"] = 1 - scheduler._internal_dict = FrozenDict(new_config) - - if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True: - deprecation_message = ( - f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`." - " `clip_sample` should be set to False in the configuration file. Please make sure to update the" - " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in" - " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very" - " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file" - ) - deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False) - new_config = dict(scheduler.config) - new_config["clip_sample"] = False - scheduler._internal_dict = FrozenDict(new_config) - if safety_checker is None and requires_safety_checker: logger.warning( f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" @@ -144,27 +133,6 @@ def __init__( " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." ) - is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse( - version.parse(unet.config._diffusers_version).base_version - ) < version.parse("0.9.0.dev0") - is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 - if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: - deprecation_message = ( - "The configuration file of the unet has set the default `sample_size` to smaller than" - " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the" - " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-" - " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5" - " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the" - " configuration file. 
Please make sure to update the config accordingly as leaving `sample_size=32`" - " in the config might lead to incorrect results in future versions. If you have downloaded this" - " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" - " the `unet/config.json` file" - ) - deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) - new_config = dict(unet.config) - new_config["sample_size"] = 64 - unet._internal_dict = FrozenDict(new_config) - self.register_modules( vae=vae, text_encoder=text_encoder, @@ -178,6 +146,7 @@ def __init__( self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.register_to_config(requires_safety_checker=requires_safety_checker) + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing def enable_vae_slicing(self): r""" Enable sliced VAE decoding. @@ -187,6 +156,7 @@ def enable_vae_slicing(self): """ self.vae.enable_slicing() + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing def disable_vae_slicing(self): r""" Disable sliced VAE decoding. If `enable_vae_slicing` was previously invoked, this method will go back to @@ -197,7 +167,7 @@ def disable_vae_slicing(self): def enable_sequential_cpu_offload(self, gpu_id=0): r""" Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet, - text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a + text_encoder, vae, controlnet, and safety checker have their state dicts saved to CPU and then are moved to a `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called. Note that offloading happens on a submodule basis. Memory savings are higher than with `enable_model_cpu_offload`, but performance is lower. @@ -209,7 +179,7 @@ def enable_sequential_cpu_offload(self, gpu_id=0): device = torch.device(f"cuda:{gpu_id}") - for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]: + for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae, self.controlnet]: cpu_offload(cpu_offloaded_model, device) if self.safety_checker is not None: @@ -230,7 +200,7 @@ def enable_model_cpu_offload(self, gpu_id=0): device = torch.device(f"cuda:{gpu_id}") hook = None - for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]: + for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae, self.controlnet]: _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) if self.safety_checker is not None: @@ -240,6 +210,7 @@ def enable_model_cpu_offload(self, gpu_id=0): self.final_offload_hook = hook @property + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device def _execution_device(self): r""" Returns the device on which the pipeline's models will be executed. 
After calling @@ -257,6 +228,7 @@ def _execution_device(self): return torch.device(module._hf_hook.execution_device) return self.device + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( self, prompt, @@ -395,6 +367,7 @@ def _encode_prompt( return prompt_embeds + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker def run_safety_checker(self, image, device, dtype): if self.safety_checker is not None: safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device) @@ -405,6 +378,7 @@ def run_safety_checker(self, image, device, dtype): has_nsfw_concept = None return image, has_nsfw_concept + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents def decode_latents(self, latents): latents = 1 / self.vae.config.scaling_factor * latents image = self.vae.decode(latents).sample @@ -413,6 +387,7 @@ def decode_latents(self, latents): image = image.cpu().permute(0, 2, 3, 1).float().numpy() return image + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs def prepare_extra_step_kwargs(self, generator, eta): # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. @@ -430,6 +405,7 @@ def prepare_extra_step_kwargs(self, generator, eta): extra_step_kwargs["generator"] = generator return extra_step_kwargs + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs def check_inputs( self, prompt, @@ -477,6 +453,7 @@ def check_inputs( f" {negative_prompt_embeds.shape}." 
) + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) if isinstance(generator, list) and len(generator) != batch_size: @@ -494,69 +471,12 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype latents = latents * self.scheduler.init_noise_sigma return latents - def controlnet_hint_conversion(self, controlnet_hint, height, width, num_images_per_prompt): - if controlnet_hint is None: - return None - channels = 3 - if isinstance(controlnet_hint, torch.Tensor): - # torch.Tensor: acceptble shape are any of chw, bchw(b==1) or bchw(b==num_images_per_prompt) - shape_chw = (channels, height, width) - shape_bchw = (1, channels, height, width) - shape_nchw = (num_images_per_prompt, channels, height, width) - if controlnet_hint.shape in [shape_chw, shape_bchw, shape_nchw]: - controlnet_hint = controlnet_hint.to(dtype=self.controlnet.dtype, device=self.controlnet.device) - if controlnet_hint.shape != shape_nchw: - controlnet_hint = controlnet_hint.repeat(num_images_per_prompt, 1, 1, 1) - return controlnet_hint - else: - raise ValueError( - f"Acceptble shape of `controlnet_hint` are any of ({channels}, {height}, {width})," - + f" (1, {channels}, {height}, {width}) or ({num_images_per_prompt}, " - + f"{channels}, {height}, {width}) but is {controlnet_hint.shape}" - ) - elif isinstance(controlnet_hint, np.ndarray): - # np.ndarray: acceptable shape is any of hw, hwc, bhwc(b==1) or bhwc(b==num_images_per_promot) - # hwc is opencv compatible image format. Color channel must be BGR Format. 
- if controlnet_hint.shape == (height, width): - controlnet_hint = np.repeat(controlnet_hint[:, :, np.newaxis], channels, axis=2) # hw -> hwc(c==3) - shape_hwc = (height, width, channels) - shape_bhwc = (1, height, width, channels) - shape_nhwc = (num_images_per_prompt, height, width, channels) - if controlnet_hint.shape in [shape_hwc, shape_bhwc, shape_nhwc]: - controlnet_hint = torch.from_numpy(controlnet_hint.copy()) - controlnet_hint = controlnet_hint.to(dtype=self.controlnet.dtype, device=self.controlnet.device) - controlnet_hint /= 255.0 - if controlnet_hint.shape != shape_nhwc: - controlnet_hint = controlnet_hint.repeat(num_images_per_prompt, 1, 1, 1) - controlnet_hint = controlnet_hint.permute(0, 3, 1, 2) # b h w c -> b c h w - return controlnet_hint - else: - raise ValueError( - f"Acceptble shape of `controlnet_hint` are any of ({width}, {channels}), " - + f"({height}, {width}, {channels}), " - + f"(1, {height}, {width}, {channels}) or " - + f"({num_images_per_prompt}, {channels}, {height}, {width}) but is {controlnet_hint.shape}" - ) - elif isinstance(controlnet_hint, PIL.Image.Image): - if controlnet_hint.size == (width, height): - controlnet_hint = controlnet_hint.convert("RGB") # make sure 3 channel RGB format - controlnet_hint = np.array(controlnet_hint) # to numpy - controlnet_hint = controlnet_hint[:, :, ::-1] # RGB -> BGR - return self.controlnet_hint_conversion(controlnet_hint, height, width, num_images_per_prompt) - else: - raise ValueError( - f"Acceptable image size of `controlnet_hint` is ({width}, {height}) but is {controlnet_hint.size}" - ) - else: - raise ValueError( - f"Acceptable type of `controlnet_hint` are any of torch.Tensor, np.ndarray, PIL.Image.Image but is {type(controlnet_hint)}" - ) - @torch.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( self, prompt: Union[str, List[str]] = None, + image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]] = None, height: Optional[int] = None, width: Optional[int] = None, num_inference_steps: int = 50, @@ -573,7 +493,6 @@ def __call__( callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - controlnet_hint: Optional[Union[torch.FloatTensor, np.ndarray, PIL.Image.Image]] = None, ): r""" Function invoked when calling the pipeline for generation. @@ -634,11 +553,6 @@ def __call__( A kwargs dictionary that if specified is passed along to the `AttnProcessor` as defined under `self.processor` in [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). - controlnet_hint (`torch.FloatTensor`, `np.ndarray` or `PIL.Image.Image`, *optional*): - ControlNet input embedding. ControlNet generates guidances using this input embedding. If the type is - specified as `torch.FloatTensor`, it is passed to ControlNet as is. If the type is `np.ndarray`, it is - assumed to be an OpenCV compatible image format. PIL.Image.Image` can also be accepted as an image. The - size of all these types must correspond to the output image size. Examples: @@ -653,17 +567,12 @@ def __call__( height = height or self.unet.config.sample_size * self.vae_scale_factor width = width or self.unet.config.sample_size * self.vae_scale_factor - # 1. 
Control Embedding check & conversion - controlnet_hint = self.controlnet_hint_conversion(controlnet_hint, height, width, num_images_per_prompt) - if controlnet_hint is not None and self.vae_scale_factor != 8: - raise ValueError("ControlNet currently supports only for vae_scale_factor == 8.") - - # 2. Check inputs. Raise error if not correct + # 1. Check inputs. Raise error if not correct self.check_inputs( prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds ) - # 3. Define call parameters + # 2. Define call parameters if prompt is not None and isinstance(prompt, str): batch_size = 1 elif prompt is not None and isinstance(prompt, list): @@ -677,7 +586,7 @@ def __call__( # corresponds to doing no classifier free guidance. do_classifier_free_guidance = guidance_scale > 1.0 - # 4. Encode input prompt + # 3. Encode input prompt prompt_embeds = self._encode_prompt( prompt, device, @@ -688,6 +597,14 @@ def __call__( negative_prompt_embeds=negative_prompt_embeds, ) + # 4. Prepare image + image = preprocess(image, width, height) + + image = image.to(device=device, dtype=self.controlnet.dtype) + + if do_classifier_free_guidance: + image = torch.cat([image] * 2) + # 5. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps, device=device) timesteps = self.scheduler.timesteps @@ -716,26 +633,19 @@ def __call__( latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - if controlnet_hint is not None: - # ControlNet predict the noise residual - control = self.controlnet( - latent_model_input, t, encoder_hidden_states=prompt_embeds, controlnet_hint=controlnet_hint - ) - noise_pred = self.unet( - latent_model_input, - t, - encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, - control=control, - ).sample - else: - # predict the noise residual - noise_pred = self.unet( - latent_model_input, - t, - encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, - ).sample + down_block_res_samples, mid_block_res_sample = self.controlnet( + latent_model_input, t, encoder_hidden_states=prompt_embeds, controlnet_cond=image, return_dict=False + ) + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, + down_block_additional_residuals=down_block_res_samples, + mid_block_additional_residual=mid_block_res_sample, + ).sample # perform guidance if do_classifier_free_guidance: diff --git a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py index 7a363a591b27..261135d33318 100644 --- a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py +++ b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py @@ -7,7 +7,6 @@ from ...configuration_utils import ConfigMixin, register_to_config from ...models import ModelMixin from ...models.attention import CrossAttention -from ...models.controlnet_blocks import ControlNetInputHintBlock, ControlNetZeroConvBlock from ...models.cross_attention import AttnProcessor, CrossAttnAddedKVProcessor from ...models.dual_transformer_2d import DualTransformer2DModel from ...models.embeddings import GaussianFourierProjection, TimestepEmbedding, Timesteps @@ -187,9 +186,6 @@ class conditioning with `class_embed_type` equal to `None`. 
conv_out_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_out` layer. projection_class_embeddings_input_dim (`int`, *optional*): The dimension of the `class_labels` input when using the "projection" `class_embed_type`. Required when using the "projection" `class_embed_type`. - controlnet_hint_channels (`int`, *optional*, default to `None`): - The number of channels in the `controlnet_hint`. If this value is not None, this unet model behaves as - ControlNet. """ _supports_gradient_checkpointing = True @@ -232,13 +228,12 @@ def __init__( num_class_embeds: Optional[int] = None, upcast_attention: bool = False, resnet_time_scale_shift: str = "default", - time_embedding_type: str = "positional", # fourier, positional + time_embedding_type: str = "positional", timestep_post_act: Optional[str] = None, time_cond_proj_dim: Optional[int] = None, conv_in_kernel: int = 3, conv_out_kernel: int = 3, projection_class_embeddings_input_dim: Optional[int] = None, - controlnet_hint_channels: Optional[int] = None, ): super().__init__() @@ -398,18 +393,6 @@ def __init__( # count how many layers upsample the images self.num_upsamplers = 0 - if controlnet_hint_channels is not None: - # ControlNet: add input_hint_block, zero_conv_block - self.controlnet_input_hint_block = ControlNetInputHintBlock( - hint_channels=controlnet_hint_channels, channels=block_out_channels[0] - ) - self.controlnet_zero_conv_block = ControlNetZeroConvBlock( - block_out_channels=block_out_channels, - down_block_types=down_block_types, - layers_per_block=layers_per_block, - ) - return # Modules from the following lines are not defined in ControlNet - # up reversed_block_out_channels = list(reversed(block_out_channels)) reversed_attention_head_dim = list(reversed(attention_head_dim)) @@ -599,10 +582,10 @@ def forward( timestep_cond: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - controlnet_hint: Optional[torch.FloatTensor] = None, - control: Optional[List[torch.FloatTensor]] = None, + down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None, + mid_block_additional_residual: Optional[torch.Tensor] = None, return_dict: bool = True, - ) -> Union[UNet2DConditionOutput, Tuple, List[torch.FloatTensor]]: + ) -> Union[UNet2DConditionOutput, Tuple]: r""" Args: sample (`torch.FloatTensor`): (batch, channel, height, width) noisy inputs tensor @@ -614,17 +597,11 @@ def forward( A kwargs dictionary that if specified is passed along to the `AttnProcessor` as defined under `self.processor` in [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). - controlnet_hint (`torch.FloatTensor`, *optional*, defaults to `None`): - ControlNet input embedding. If `controlnet_hint_channel` of `__init__()` is not None, it must be - specified as a tensors. - control (`List[torch.FloatTensor]`, *optional*, defaults to `None`): - If `control` is not None, this unet model behaves as ControlledUnet. ControlledUnet is controlled by - this list of tensors. + Returns: - [`~models.unet_2d_condition.UNet2DConditionOutput`] , `tuple` or [torch.FloatTensor]: + [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`: [`~models.unet_2d_condition.UNet2DConditionOutput`] if `return_dict` is True, otherwise a `tuple`. When - returning a tuple, the first element is the sample tensor. 
If `controlnet_hint` is not None, the ControlNet - result of the processing is output as a list of tensors. + returning a tuple, the first element is the sample tensor. """ # By default samples have to be AT least a multiple of the overall upsampling factor. # The overall upsampling factor is equal to 2 ** (# num of upsampling layears). @@ -687,8 +664,6 @@ def forward( # 2. pre-process sample = self.conv_in(sample) - if controlnet_hint is not None: - sample += self.controlnet_input_hint_block(controlnet_hint) # 3. down down_block_res_samples = (sample,) @@ -706,6 +681,17 @@ def forward( down_block_res_samples += res_samples + if down_block_additional_residuals is not None: + new_down_block_res_samples = () + + for down_block_res_sample, down_block_additional_residual in zip( + down_block_res_samples, down_block_additional_residuals + ): + down_block_res_sample += down_block_additional_residual + new_down_block_res_samples += (down_block_res_sample,) + + down_block_res_samples = new_down_block_res_samples + # 4. mid if self.mid_block is not None: sample = self.mid_block( @@ -716,15 +702,8 @@ def forward( cross_attention_kwargs=cross_attention_kwargs, ) - if controlnet_hint is not None: - # ControlNet: zero convs - return self.controlnet_zero_conv_block( - down_block_res_samples=down_block_res_samples, mid_block_sample=sample - ) - - if control is not None: - # ControlledUnet: apply mid_zero_conv output - sample += control.pop() + if mid_block_additional_residual is not None: + sample += mid_block_additional_residual # 5. up for i, upsample_block in enumerate(self.up_blocks): @@ -733,12 +712,6 @@ def forward( res_samples = down_block_res_samples[-len(upsample_block.resnets) :] down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)] - if control is not None: - # ControlledUnet: apply ControlNet downblock zero_convs output - control_samples = control[-len(upsample_block.resnets) :] - control = control[: -len(upsample_block.resnets)] - res_samples = [r + c for r, c in zip(res_samples, control_samples)] - # if we have not reached the final block and need to forward the # upsample size, we do it here if not is_final_block and forward_upsample_size: @@ -759,10 +732,6 @@ def forward( hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples, upsample_size=upsample_size ) - # TODO: remove this block - if control is not None: - assert len(control) == 0, f"must consume all control array ({len(control)})" - # 6. 
post-process if self.conv_norm_out: sample = self.conv_norm_out(sample) diff --git a/src/diffusers/utils/dummy_pt_objects.py b/src/diffusers/utils/dummy_pt_objects.py index 67e537a63596..c731a1f1ddf3 100644 --- a/src/diffusers/utils/dummy_pt_objects.py +++ b/src/diffusers/utils/dummy_pt_objects.py @@ -17,6 +17,21 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch"]) +class ControlNetModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + class ModelMixin(metaclass=DummyObject): _backends = ["torch"] diff --git a/tests/models/test_models_unet_2d_condition.py b/tests/models/test_models_unet_2d_condition.py index 95af3f81f7b8..6ee8c2ffc002 100644 --- a/tests/models/test_models_unet_2d_condition.py +++ b/tests/models/test_models_unet_2d_condition.py @@ -26,7 +26,6 @@ floats_tensor, load_hf_numpy, logging, - randn_tensor, require_torch_gpu, slow, torch_all_close, @@ -84,16 +83,6 @@ def dummy_input(self): return {"sample": noise, "timestep": time_step, "encoder_hidden_states": encoder_hidden_states} - @property - def dummpy_input_for_controlnet(self): - hint_channels = 3 - input_dict = self.dummy_input - sample_shape = input_dict["sample"].shape - input_dict["controlnet_hint"] = floats_tensor( - (sample_shape[0], hint_channels, sample_shape[2] * 8, sample_shape[3] * 8) - ).to(torch_device) - return input_dict - @property def input_shape(self): return (4, 32, 32) @@ -117,20 +106,6 @@ def prepare_init_args_and_inputs_for_common(self): inputs_dict = self.dummy_input return init_dict, inputs_dict - def prepare_init_args_and_inputs_for_controlnet(self): - init_dict = { - "block_out_channels": (32, 64), - "down_block_types": ("CrossAttnDownBlock2D", "DownBlock2D"), - "cross_attention_dim": 32, - "attention_head_dim": 8, - "in_channels": 4, - "layers_per_block": 2, - "sample_size": 32, - "controlnet_hint_channels": 3, - } - inputs_dict = self.dummpy_input_for_controlnet - return init_dict, inputs_dict - @unittest.skipIf( torch_device != "cuda" or not is_xformers_available(), reason="XFormers attention is only available with CUDA and `xformers` installed", @@ -466,54 +441,6 @@ def test_lora_xformers_on_off(self): assert (sample - on_sample).abs().max() < 1e-4 assert (sample - off_sample).abs().max() < 1e-4 - def test_model_controlnet_inference(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_controlnet() - - model = self.model_class(**init_dict) - model.to(torch_device) - model.eval() - - with torch.no_grad(): - output = model(**inputs_dict) - - conv_in_n = 1 - cross_attn_n = init_dict["layers_per_block"] + 1 # with down sampler - down_block_n = init_dict["layers_per_block"] # no down sampler - mid_block_n = 1 - assert len(output) == conv_in_n + cross_attn_n + down_block_n + mid_block_n - - cross_attn_ch = init_dict["block_out_channels"][0] - down_block_ch = init_dict["block_out_channels"][1] - inshape = self.input_shape - batch = inputs_dict["sample"].shape[0] - s1 = (batch, cross_attn_ch, inshape[1], inshape[2]) - s2 = (batch, cross_attn_ch, inshape[1] / 2, inshape[2] / 2) - s3 = (batch, down_block_ch, inshape[1] / 2, inshape[2] / 2) - expected_shape = [s1, s1, s1, s2, s3, s3, s3] - assert all([out.shape == shape for out, shape in zip(output, expected_shape)]) - - def 
test_model_controlnet_and_unet_inference(self): - controlnet_init_dict, controlnet_inputs_dict = self.prepare_init_args_and_inputs_for_controlnet() - unet_init_dict, unet_inputs_dict = self.prepare_init_args_and_inputs_for_common() - - controlnet_model = self.model_class(**controlnet_init_dict) - controlnet_model.to(torch_device) - controlnet_model.eval() - - unet_model = self.model_class(**unet_init_dict) - unet_model.to(torch_device) - unet_model.eval() - - with torch.no_grad(): - control = controlnet_model(**controlnet_inputs_dict) - unet_inputs_dict["control"] = control - output = unet_model(**unet_inputs_dict) - if isinstance(output, dict): - output = output.sample - - batch = unet_inputs_dict["sample"].shape[0] - assert output.shape == (batch,) + self.output_shape - @slow class UNet2DConditionModelIntegrationTests(unittest.TestCase): @@ -531,12 +458,12 @@ def get_latents(self, seed=0, shape=(4, 4, 64, 64), fp16=False): image = torch.from_numpy(load_hf_numpy(self.get_file_format(seed, shape))).to(torch_device).to(dtype) return image - def get_unet_model(self, fp16=False, model_id="CompVis/stable-diffusion-v1-4", subfolder="unet"): + def get_unet_model(self, fp16=False, model_id="CompVis/stable-diffusion-v1-4"): revision = "fp16" if fp16 else None torch_dtype = torch.float16 if fp16 else torch.float32 model = UNet2DConditionModel.from_pretrained( - model_id, subfolder=subfolder, torch_dtype=torch_dtype, revision=revision + model_id, subfolder="unet", torch_dtype=torch_dtype, revision=revision ) model.to(torch_device).eval() @@ -820,55 +747,3 @@ def test_stabilityai_sd_v2_fp16(self, seed, timestep, expected_slice): expected_output_slice = torch.tensor(expected_slice) assert torch_all_close(output_slice, expected_output_slice, atol=5e-3) - - @parameterized.expand( - [ - # fmt: off - [83, 4, [-0.0343, -0.7764, -0.5049, -0.1671, -0.8076, -0.8975, -0.0917, 0.6797]], - [17, 0.55, [-0.1732, -0.2542, 0.5425, -0.4189, -0.7910, 0.7544, 0.3892, -0.3232]], - [8, 0.89, [-0.6738, 0.3801, 0.1443, 0.1410, 0.7944, -0.4167, 0.1897, -0.0763]], - [3, 1000, [0.7334, 0.4519, -0.0319, -0.6343, -0.4348, -0.5205, -0.2534, 0.7998]], - # fmt: on - ] - ) - @require_torch_gpu - def test_controlnet_sd15_canny_fp16(self, seed, timestep, expected_slice): - model_id = "takuma104/control_sd15_canny" - controlnet_model = self.get_unet_model(model_id=model_id, subfolder="controlnet").to(torch.float16) - unet_model = self.get_unet_model(model_id=model_id).to(torch.float16) - latents = self.get_latents(seed, shape=(4, 4, 96, 96), fp16=True) - - # for my poor memory environment - controlnet_model.set_attention_slice(1) # TODO: remove - unet_model.set_attention_slice(1) # TODO: remove - - generator = torch.manual_seed(seed) - controlnet_hint = randn_tensor( - (4, 3, 96 * 8, 96 * 8), generator=generator, device=torch.device(torch_device), dtype=torch.float16 - ) - - # encoder_hidden_states = self.get_encoder_hidden_states(seed, shape=(4, 77, 1024), fp16=True) - # TODO: investigate for not accepted (4, 77, 1024) - encoder_hidden_states = randn_tensor( - (4, 77, 768), generator=generator, device=torch.device(torch_device), dtype=torch.float16 - ) - - timestep = torch.tensor([timestep], dtype=torch.long, device=torch_device) - - with torch.no_grad(): - control = controlnet_model( - latents, - timestep=timestep, - encoder_hidden_states=encoder_hidden_states, - controlnet_hint=controlnet_hint, - ) - sample = unet_model( - latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states, control=control - ).sample - - assert 
sample.shape == latents.shape - - output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu() - expected_output_slice = torch.tensor(expected_slice) - - assert torch_all_close(output_slice, expected_output_slice, atol=5e-3) diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py index bd7bc31eacea..02460b11b26f 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py @@ -24,20 +24,17 @@ from diffusers import ( AutoencoderKL, + ControlNetModel, DDIMScheduler, DPMSolverMultistepScheduler, - EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, LMSDiscreteScheduler, - PNDMScheduler, StableDiffusionControlNetPipeline, UNet2DConditionModel, - logging, ) -from diffusers.utils import slow, torch_device -from diffusers.utils.testing_utils import CaptureLogger, require_torch_gpu +from diffusers.utils import randn_tensor, slow, torch_device +from diffusers.utils.import_utils import is_xformers_available +from diffusers.utils.testing_utils import require_torch_gpu -from ...models.test_models_unet_2d_condition import create_lora_layers from ...test_pipelines_common import PipelineTesterMixin @@ -59,15 +56,17 @@ def get_dummy_components(self): up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), cross_attention_dim=32, ) - controlnet = UNet2DConditionModel( + torch.manual_seed(0) + controlnet = ControlNetModel( block_out_channels=(32, 64), layers_per_block=2, sample_size=32, in_channels=4, down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), cross_attention_dim=32, - controlnet_hint_channels=3, + controlnet_conditioning_channels=3, ) + torch.manual_seed(0) scheduler = DDIMScheduler( beta_start=0.00085, beta_end=0.012, @@ -116,472 +115,35 @@ def get_dummy_inputs(self, device, seed=0): generator = torch.manual_seed(seed) else: generator = torch.Generator(device=device).manual_seed(seed) + + controlnet_embedder_scale_factor = 8 + image = randn_tensor( + (1, 3, 32 * controlnet_embedder_scale_factor, 32 * controlnet_embedder_scale_factor), generator=generator, device=torch.device(device) + ) + inputs = { "prompt": "A painting of a squirrel eating a burger", "generator": generator, "num_inference_steps": 2, "guidance_scale": 6.0, "output_type": "numpy", + "image": image, } - return inputs - - def get_dummy_components_for_controlnet(self): - components = self.get_dummy_components() - # vae_scale_factor 8 version - # this for ControlNetInputHintBlock accepts only vae_scale_factor=8 - components["vae"] = AutoencoderKL( - block_out_channels=[32, 64, 64, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D", "DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - return components - def get_dummy_inputs_for_controlnet(self, device, seed=0): - inputs = self.get_dummy_inputs(device, seed) - vae_scale_factor = 8 - inputs["controlnet_hint"] = torch.randn( - (1, 3, 32 * vae_scale_factor, 32 * vae_scale_factor), generator=inputs["generator"] - ) return inputs - def test_stable_diffusion_ddim(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - - components = self.get_dummy_components() - sd_pipe = StableDiffusionControlNetPipeline(**components) - sd_pipe = sd_pipe.to(torch_device) - 
sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - output = sd_pipe(**inputs) - image = output.images - - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.5643, 0.6017, 0.4799, 0.5267, 0.5584, 0.4641, 0.5159, 0.4963, 0.4791]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_lora(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - - components = self.get_dummy_components() - sd_pipe = StableDiffusionControlNetPipeline(**components) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - # forward 1 - inputs = self.get_dummy_inputs(device) - output = sd_pipe(**inputs) - image = output.images - image_slice = image[0, -3:, -3:, -1] - - # set lora layers - lora_attn_procs = create_lora_layers(sd_pipe.unet) - sd_pipe.unet.set_attn_processor(lora_attn_procs) - sd_pipe = sd_pipe.to(torch_device) - - # forward 2 - inputs = self.get_dummy_inputs(device) - output = sd_pipe(**inputs, cross_attention_kwargs={"scale": 0.0}) - image = output.images - image_slice_1 = image[0, -3:, -3:, -1] - - # forward 3 - inputs = self.get_dummy_inputs(device) - output = sd_pipe(**inputs, cross_attention_kwargs={"scale": 0.5}) - image = output.images - image_slice_2 = image[0, -3:, -3:, -1] - - assert np.abs(image_slice - image_slice_1).max() < 1e-2 - assert np.abs(image_slice - image_slice_2).max() > 1e-2 - - def test_stable_diffusion_prompt_embeds(self): - components = self.get_dummy_components() - sd_pipe = StableDiffusionControlNetPipeline(**components) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(torch_device) - inputs["prompt"] = 3 * [inputs["prompt"]] - - # forward - output = sd_pipe(**inputs) - image_slice_1 = output.images[0, -3:, -3:, -1] - - inputs = self.get_dummy_inputs(torch_device) - prompt = 3 * [inputs.pop("prompt")] - - text_inputs = sd_pipe.tokenizer( - prompt, - padding="max_length", - max_length=sd_pipe.tokenizer.model_max_length, - truncation=True, - return_tensors="pt", - ) - text_inputs = text_inputs["input_ids"].to(torch_device) - - prompt_embeds = sd_pipe.text_encoder(text_inputs)[0] - - inputs["prompt_embeds"] = prompt_embeds - - # forward - output = sd_pipe(**inputs) - image_slice_2 = output.images[0, -3:, -3:, -1] - - assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4 - - def test_stable_diffusion_negative_prompt_embeds(self): - components = self.get_dummy_components() - sd_pipe = StableDiffusionControlNetPipeline(**components) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(torch_device) - negative_prompt = 3 * ["this is a negative prompt"] - inputs["negative_prompt"] = negative_prompt - inputs["prompt"] = 3 * [inputs["prompt"]] - - # forward - output = sd_pipe(**inputs) - image_slice_1 = output.images[0, -3:, -3:, -1] - - inputs = self.get_dummy_inputs(torch_device) - prompt = 3 * [inputs.pop("prompt")] - - embeds = [] - for p in [prompt, negative_prompt]: - text_inputs = sd_pipe.tokenizer( - p, - padding="max_length", - max_length=sd_pipe.tokenizer.model_max_length, - truncation=True, - return_tensors="pt", - ) - text_inputs = text_inputs["input_ids"].to(torch_device) - - embeds.append(sd_pipe.text_encoder(text_inputs)[0]) + def 
test_attention_slicing_forward_pass(self): + return self._test_attention_slicing_forward_pass(expected_max_diff=2e-3) - inputs["prompt_embeds"], inputs["negative_prompt_embeds"] = embeds - - # forward - output = sd_pipe(**inputs) - image_slice_2 = output.images[0, -3:, -3:, -1] - - assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4 - - def test_stable_diffusion_ddim_factor_8(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - - components = self.get_dummy_components() - sd_pipe = StableDiffusionControlNetPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - output = sd_pipe(**inputs, height=136, width=136) - image = output.images - - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 136, 136, 3) - expected_slice = np.array([0.5524, 0.5626, 0.6069, 0.4727, 0.386, 0.3995, 0.4613, 0.4328, 0.4269]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_pndm(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - sd_pipe = StableDiffusionControlNetPipeline(**components) - sd_pipe.scheduler = PNDMScheduler(skip_prk_steps=True) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - output = sd_pipe(**inputs) - image = output.images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.5094, 0.5674, 0.4667, 0.5125, 0.5696, 0.4674, 0.5277, 0.4964, 0.4945]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_k_lms(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - - components = self.get_dummy_components() - sd_pipe = StableDiffusionControlNetPipeline(**components) - sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - output = sd_pipe(**inputs) - image = output.images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array( - [ - 0.47082293033599854, - 0.5371589064598083, - 0.4562119245529175, - 0.5220914483070374, - 0.5733777284622192, - 0.4795039892196655, - 0.5465868711471558, - 0.5074326395988464, - 0.5042197108268738, - ] - ) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_k_euler_ancestral(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - - components = self.get_dummy_components() - sd_pipe = StableDiffusionControlNetPipeline(**components) - sd_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - output = sd_pipe(**inputs) - image = output.images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array( - [ - 0.4707113206386566, - 0.5372191071510315, - 0.4563021957874298, - 0.5220003724098206, - 0.5734264850616455, - 0.4794946610927582, - 0.5463782548904419, - 0.5074145197868347, - 0.504422664642334, - ] - ) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def 
test_stable_diffusion_k_euler(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - - components = self.get_dummy_components() - sd_pipe = StableDiffusionControlNetPipeline(**components) - sd_pipe.scheduler = EulerDiscreteScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - output = sd_pipe(**inputs) - image = output.images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array( - [ - 0.47082313895225525, - 0.5371587872505188, - 0.4562119245529175, - 0.5220913887023926, - 0.5733776688575745, - 0.47950395941734314, - 0.546586811542511, - 0.5074326992034912, - 0.5042197108268738, - ] - ) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_vae_slicing(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config) - sd_pipe = StableDiffusionControlNetPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - image_count = 4 - - inputs = self.get_dummy_inputs(device) - inputs["prompt"] = [inputs["prompt"]] * image_count - output_1 = sd_pipe(**inputs) - - # make sure sliced vae decode yields the same result - sd_pipe.enable_vae_slicing() - inputs = self.get_dummy_inputs(device) - inputs["prompt"] = [inputs["prompt"]] * image_count - output_2 = sd_pipe(**inputs) - - # there is a small discrepancy at image borders vs. full batch decode - assert np.abs(output_2.images.flatten() - output_1.images.flatten()).max() < 3e-3 - - def test_stable_diffusion_negative_prompt(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - components["scheduler"] = PNDMScheduler(skip_prk_steps=True) - sd_pipe = StableDiffusionControlNetPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - negative_prompt = "french fries" - output = sd_pipe(**inputs, negative_prompt=negative_prompt) - - image = output.images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array( - [ - 0.5108221173286438, - 0.5688379406929016, - 0.4685141146183014, - 0.5098261833190918, - 0.5657756328582764, - 0.4631010890007019, - 0.5226285457611084, - 0.49129390716552734, - 0.4899061322212219, - ] - ) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_num_images_per_prompt(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - components["scheduler"] = PNDMScheduler(skip_prk_steps=True) - sd_pipe = StableDiffusionControlNetPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - - # test num_images_per_prompt=1 (default) - images = sd_pipe(prompt, num_inference_steps=2, output_type="np").images - - assert images.shape == (1, 64, 64, 3) - - # test num_images_per_prompt=1 (default) for batch of prompts - batch_size = 2 - images = sd_pipe([prompt] * batch_size, num_inference_steps=2, output_type="np").images - - assert images.shape == (batch_size, 64, 64, 3) - 
- # test num_images_per_prompt for single prompt - num_images_per_prompt = 2 - images = sd_pipe( - prompt, num_inference_steps=2, output_type="np", num_images_per_prompt=num_images_per_prompt - ).images - - assert images.shape == (num_images_per_prompt, 64, 64, 3) - - # test num_images_per_prompt for batch of prompts - batch_size = 2 - images = sd_pipe( - [prompt] * batch_size, num_inference_steps=2, output_type="np", num_images_per_prompt=num_images_per_prompt - ).images - - assert images.shape == (batch_size * num_images_per_prompt, 64, 64, 3) - - def test_stable_diffusion_long_prompt(self): - components = self.get_dummy_components() - components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config) - sd_pipe = StableDiffusionControlNetPipeline(**components) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - do_classifier_free_guidance = True - negative_prompt = None - num_images_per_prompt = 1 - logger = logging.get_logger("diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_controlnet") - - prompt = 25 * "@" - with CaptureLogger(logger) as cap_logger_3: - text_embeddings_3 = sd_pipe._encode_prompt( - prompt, torch_device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt - ) - - prompt = 100 * "@" - with CaptureLogger(logger) as cap_logger: - text_embeddings = sd_pipe._encode_prompt( - prompt, torch_device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt - ) - - negative_prompt = "Hello" - with CaptureLogger(logger) as cap_logger_2: - text_embeddings_2 = sd_pipe._encode_prompt( - prompt, torch_device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt - ) - - assert text_embeddings_3.shape == text_embeddings_2.shape == text_embeddings.shape - assert text_embeddings.shape[1] == 77 - - assert cap_logger.out == cap_logger_2.out - # 100 - 77 + 1 (BOS token) + 1 (EOS token) = 25 - assert cap_logger.out.count("@") == 25 - assert cap_logger_3.out == "" - - def test_stable_diffusion_height_width_opt(self): - components = self.get_dummy_components() - components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config) - sd_pipe = StableDiffusionControlNetPipeline(**components) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "hey" - - output = sd_pipe(prompt, num_inference_steps=1, output_type="np") - image_shape = output.images[0].shape[:2] - assert image_shape == (64, 64) - - output = sd_pipe(prompt, num_inference_steps=1, height=96, width=96, output_type="np") - image_shape = output.images[0].shape[:2] - assert image_shape == (96, 96) - - config = dict(sd_pipe.unet.config) - config["sample_size"] = 96 - sd_pipe.unet = UNet2DConditionModel.from_config(config).to(torch_device) - output = sd_pipe(prompt, num_inference_steps=1, output_type="np") - image_shape = output.images[0].shape[:2] - assert image_shape == (192, 192) - - def test_stable_diffusion_controlnet_ddim(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - - vae_scale_factor = 8 - components = self.get_dummy_components_for_controlnet() - sd_pipe = StableDiffusionControlNetPipeline(**components) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs_for_controlnet(device) - output = sd_pipe(**inputs) - image = output.images - - image_slice = image[0, -3:, -3:, -1] - # print("image_slice", image_slice) - - assert image.shape == (1, 32 * 
vae_scale_factor, 32 * vae_scale_factor, 3) - expected_slice = np.array( - [0.4780106, 0.46282214, 0.49179333, 0.437001, 0.4518742, 0.46226522, 0.41771045, 0.4315053, 0.4805042] - ) + @unittest.skipIf( + torch_device != "cuda" or not is_xformers_available(), + reason="XFormers attention is only available with CUDA and `xformers` installed", + ) + def test_xformers_attention_forwardGenerator_pass(self): + self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=2e-3) - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + def test_inference_batch_single_identical(self): + self._test_inference_batch_single_identical(expected_max_diff=2e-3) @slow diff --git a/tests/test_pipelines_common.py b/tests/test_pipelines_common.py index 7c145be21b34..a7c218fe7e77 100644 --- a/tests/test_pipelines_common.py +++ b/tests/test_pipelines_common.py @@ -197,7 +197,7 @@ def test_inference_batch_single_identical(self): self._test_inference_batch_single_identical() def _test_inference_batch_single_identical( - self, test_max_difference=None, test_mean_pixel_difference=None, relax_max_difference=False + self, test_max_difference=None, test_mean_pixel_difference=None, relax_max_difference=False, expected_max_diff=1e-4 ): if self.pipeline_class.__name__ in [ "CycleDiffusionPipeline", @@ -277,7 +277,7 @@ def _test_inference_batch_single_identical( max_diff = np.median(diff[-5:]) else: max_diff = np.abs(output_batch[0][0] - output[0][0]).max() - assert max_diff < 1e-4 + assert max_diff < expected_max_diff if test_mean_pixel_difference: assert_mean_pixel_difference(output_batch[0][0], output[0][0]) @@ -436,7 +436,7 @@ def test_to_device(self): def test_attention_slicing_forward_pass(self): self._test_attention_slicing_forward_pass() - def _test_attention_slicing_forward_pass(self, test_max_difference=True): + def _test_attention_slicing_forward_pass(self, test_max_difference=True, expected_max_diff=1e-3): if not self.test_attention_slicing: return @@ -468,7 +468,7 @@ def _test_attention_slicing_forward_pass(self, test_max_difference=True): if test_max_difference: max_diff = np.abs(output_with_slicing - output_without_slicing).max() - self.assertLess(max_diff, 1e-3, "Attention slicing should not affect the inference results") + self.assertLess(max_diff, expected_max_diff, "Attention slicing should not affect the inference results") assert_mean_pixel_difference(output_with_slicing[0], output_without_slicing[0]) @@ -502,7 +502,7 @@ def test_cpu_offload_forward_pass(self): def test_xformers_attention_forwardGenerator_pass(self): self._test_xformers_attention_forwardGenerator_pass() - def _test_xformers_attention_forwardGenerator_pass(self, test_max_difference=True): + def _test_xformers_attention_forwardGenerator_pass(self, test_max_difference=True, expected_max_diff=1e-4): if not self.test_xformers_attention: return @@ -520,7 +520,7 @@ def _test_xformers_attention_forwardGenerator_pass(self, test_max_difference=Tru if test_max_difference: max_diff = np.abs(output_with_offload - output_without_offload).max() - self.assertLess(max_diff, 1e-4, "XFormers attention should not affect the inference results") + self.assertLess(max_diff, expected_max_diff, "XFormers attention should not affect the inference results") assert_mean_pixel_difference(output_with_offload[0], output_without_offload[0]) From b74ef10d740dfe5a826b87b979e0489f19511d7b Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Fri, 24 Feb 2023 19:48:00 +0900 Subject: [PATCH 054/122] all slow test passed --- 
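Rather than duplicating the shared pipeline tests with looser assertions, the common helpers gain an `expected_max_diff` argument and the ControlNet test class passes `2e-3` when overriding them. A small self-contained sketch of that pattern; the numbers and the `_run_both_ways` placeholder are illustrative, not taken from the real mixin:

```python
import numpy as np


class PipelineTesterMixinSketch:
    # shared helper: the tolerance is now a parameter, with the old value as default
    def _test_attention_slicing_forward_pass(self, expected_max_diff=1e-3):
        with_slicing, without_slicing = self._run_both_ways()
        max_diff = np.abs(with_slicing - without_slicing).max()
        assert max_diff < expected_max_diff, "Attention slicing should not affect the inference results"

    def _run_both_ways(self):
        # placeholder for "run the pipeline with and without attention slicing"
        base = np.random.RandomState(0).rand(1, 64, 64, 3)
        return base + 1.5e-3, base


class ControlNetTestsSketch(PipelineTesterMixinSketch):
    def test_attention_slicing_forward_pass(self):
        # two networks run per denoising step, so the 1e-3 default budget is relaxed to 2e-3
        return self._test_attention_slicing_forward_pass(expected_max_diff=2e-3)


ControlNetTestsSketch().test_attention_slicing_forward_pass()
```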
.../test_stable_diffusion_controlnet.py | 87 ++++++++++--------- 1 file changed, 48 insertions(+), 39 deletions(-) diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py index 3e645a8f941d..70c891d09a4a 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py @@ -119,7 +119,9 @@ def get_dummy_inputs(self, device, seed=0): controlnet_embedder_scale_factor = 8 image = randn_tensor( - (1, 3, 32 * controlnet_embedder_scale_factor, 32 * controlnet_embedder_scale_factor), generator=generator, device=torch.device(device) + (1, 3, 32 * controlnet_embedder_scale_factor, 32 * controlnet_embedder_scale_factor), + generator=generator, + device=torch.device(device), ) inputs = { @@ -133,6 +135,44 @@ def get_dummy_inputs(self, device, seed=0): return inputs + def get_dummy_components_for_controlnet(self): + components = self.get_dummy_components() + # vae_scale_factor 8 version + # this for ControlNetInputHintBlock accepts only vae_scale_factor=8 + components["vae"] = AutoencoderKL( + block_out_channels=[32, 64, 64, 64], + in_channels=3, + out_channels=3, + down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D", "DownEncoderBlock2D", "DownEncoderBlock2D"], + up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D"], + latent_channels=4, + ) + return components + + def get_dummy_inputs_for_controlnet(self, device, seed=0, num_of_prompts=1, num_images_per_prompt=1): + inputs = self.get_dummy_inputs(device, seed) + vae_scale_factor = 8 + if num_of_prompts > 1: + inputs["prompt"] = [f"a photo of {i} cats" for i in range(num_of_prompts)] + + controlnet_hint = torch.randn( + (num_of_prompts * num_images_per_prompt, 3, 32 * vae_scale_factor, 32 * vae_scale_factor), + generator=inputs["generator"], + ) + + controlnet_hint = controlnet_hint.detach().numpy().copy() + images = np.zeros_like(controlnet_hint, dtype=np.uint8) + images[controlnet_hint > 0.5] = 255 + images = images.transpose(0, 3, 2, 1) # b c h w -> b w h c + if images.shape[0] == 1: + controlnet_hint = PIL.Image.fromarray(images[0]) # PIL.Image + else: + controlnet_hint = [PIL.Image.fromarray(images[b]) for b in range(images.shape[0])] # List of PIL.Image + + inputs["image"] = controlnet_hint + inputs["num_images_per_prompt"] = num_images_per_prompt + return inputs + def test_attention_slicing_forward_pass(self): return self._test_attention_slicing_forward_pass(expected_max_diff=2e-3) @@ -146,7 +186,7 @@ def test_xformers_attention_forwardGenerator_pass(self): def test_inference_batch_single_identical(self): self._test_inference_batch_single_identical(expected_max_diff=2e-3) - def test_stable_diffusion_controlnet_ddim_pil_image(self): + def test_stable_diffusion_controlnet_ddim(self): device = "cpu" # ensure determinism for the device-dependent torch.Generator vae_scale_factor = 8 @@ -155,7 +195,7 @@ def test_stable_diffusion_controlnet_ddim_pil_image(self): sd_pipe = sd_pipe.to(torch_device) sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs_for_controlnet(device, pil_image=True) + inputs = self.get_dummy_inputs_for_controlnet(device) output = sd_pipe(**inputs) image = output.images @@ -164,7 +204,7 @@ def test_stable_diffusion_controlnet_ddim_pil_image(self): assert image.shape == (1, 32 * vae_scale_factor, 32 * vae_scale_factor, 3) expected_slice = np.array( - [0.4780106, 
0.46282214, 0.49179333, 0.437001, 0.4518742, 0.46226522, 0.41771045, 0.4315053, 0.4805042] + [0.47653976, 0.4843403, 0.46522307, 0.39793792, 0.454136, 0.4749748, 0.37724984, 0.4025603, 0.47651842] ) assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 @@ -191,10 +231,10 @@ def test_stable_diffusion_controlnet_ddim_two_prompts(self): assert image.shape == (2, 32 * vae_scale_factor, 32 * vae_scale_factor, 3) expected_slice0 = np.array( - [0.47709626, 0.48531038, 0.4648616, 0.39797616, 0.4541167, 0.47469646, 0.3775609, 0.4033805, 0.4765025] + [0.4394728, 0.46073985, 0.49796283, 0.52271855, 0.51414967, 0.5314792, 0.47262335, 0.47206822, 0.48990324] ) expected_slice1 = np.array( - [0.4621172, 0.4676137, 0.5062453, 0.5052618, 0.5217055, 0.5249935, 0.4406457, 0.4661678, 0.5214513] + [0.5315275, 0.4819456, 0.4750305, 0.4453807, 0.44164768, 0.47079763, 0.40049344, 0.39453578, 0.47368276] ) assert np.abs(image_slice0.flatten() - expected_slice0).max() < 1e-2 @@ -222,41 +262,10 @@ def test_stable_diffusion_controlnet_ddim_two_images_per_prompt(self): assert image.shape == (2, 32 * vae_scale_factor, 32 * vae_scale_factor, 3) expected_slice0 = np.array( - [0.4763651, 0.48430225, 0.46508622, 0.3978958, 0.45416373, 0.4748904, 0.3773327, 0.40261823, 0.47642976] - ) - expected_slice1 = np.array( - [0.462687, 0.46804678, 0.50551665, 0.50405616, 0.5203774, 0.52371174, 0.43939433, 0.46502018, 0.52084166] - ) - - assert np.abs(image_slice0.flatten() - expected_slice0).max() < 1e-2 - assert np.abs(image_slice1.flatten() - expected_slice1).max() < 1e-2 - - def test_stable_diffusion_controlnet_ddim_two_images_per_prompt_pil_image(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - - vae_scale_factor = 8 - components = self.get_dummy_components_for_controlnet() - sd_pipe = StableDiffusionControlNetPipeline(**components) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs_for_controlnet(device, num_images_per_prompt=2, pil_image=True) - output = sd_pipe(**inputs) - image = output.images - - image_slice0 = image[0, -3:, -3:, -1] - image_slice1 = image[1, -3:, -3:, -1] - - # print("image_slice0", image_slice0) - # print("image_slice1", image_slice1) - - assert image.shape == (2, 32 * vae_scale_factor, 32 * vae_scale_factor, 3) - - expected_slice0 = np.array( - [0.4763651, 0.48430225, 0.46508622, 0.3978958, 0.45416373, 0.4748904, 0.3773327, 0.40261823, 0.47642976] + [0.44349974, 0.46209368, 0.4967181, 0.5238648, 0.5147134, 0.5299364, 0.47317895, 0.47206104, 0.48903918] ) expected_slice1 = np.array( - [0.462687, 0.46804678, 0.50551665, 0.50405616, 0.5203774, 0.52371174, 0.43939433, 0.46502018, 0.52084166] + [0.5333272, 0.48134372, 0.47437134, 0.44782317, 0.44065917, 0.4701641, 0.40167314, 0.39400867, 0.47319612] ) assert np.abs(image_slice0.flatten() - expected_slice0).max() < 1e-2 From b8e689e32bead5d91227edb6b342216470b7d1fe Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Fri, 24 Feb 2023 19:48:34 +0900 Subject: [PATCH 055/122] make style --- .../pipeline_stable_diffusion_controlnet.py | 6 +++++- tests/test_pipelines_common.py | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py index 8ff9315ee0c3..65428263be4a 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py +++ 
b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py @@ -634,7 +634,11 @@ def __call__( latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) down_block_res_samples, mid_block_res_sample = self.controlnet( - latent_model_input, t, encoder_hidden_states=prompt_embeds, controlnet_cond=image, return_dict=False + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + controlnet_cond=image, + return_dict=False, ) # predict the noise residual diff --git a/tests/test_pipelines_common.py b/tests/test_pipelines_common.py index a7c218fe7e77..ec770ec3d41a 100644 --- a/tests/test_pipelines_common.py +++ b/tests/test_pipelines_common.py @@ -197,7 +197,11 @@ def test_inference_batch_single_identical(self): self._test_inference_batch_single_identical() def _test_inference_batch_single_identical( - self, test_max_difference=None, test_mean_pixel_difference=None, relax_max_difference=False, expected_max_diff=1e-4 + self, + test_max_difference=None, + test_mean_pixel_difference=None, + relax_max_difference=False, + expected_max_diff=1e-4, ): if self.pipeline_class.__name__ in [ "CycleDiffusionPipeline", From 2d8cca1a43fec6a38ef3ca0af1304124b970d267 Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Fri, 24 Feb 2023 20:08:38 +0900 Subject: [PATCH 056/122] update for slow test --- .../stable_diffusion/test_stable_diffusion_controlnet.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py index 70c891d09a4a..c3d9bca073e2 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py @@ -287,9 +287,7 @@ def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0 generator = torch.Generator(device=generator_device).manual_seed(seed) latents = torch.randn((1, 4, 64, 64), generator=generator, dtype=dtype) vae_scale_factor = 8 - controlnet_hint = torch.randn( - (1, 3, 64 * vae_scale_factor, 64 * vae_scale_factor), generator=generator, dtype=dtype - ) + image = torch.randn((1, 3, 64 * vae_scale_factor, 64 * vae_scale_factor), generator=generator, dtype=dtype) inputs = { "prompt": "a photograph of an astronaut riding a horse", "latents": latents, @@ -297,7 +295,7 @@ def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0 "num_inference_steps": 50, "guidance_scale": 7.5, "output_type": "numpy", - "controlnet_hint": controlnet_hint, + "image": image, } return inputs From 0f70cf573b2374d08bbea43e79b8ad28810a70f8 Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Fri, 24 Feb 2023 23:05:25 +0900 Subject: [PATCH 057/122] RGB(PIL)->BGR(ctrlnet) conversion --- .../stable_diffusion/pipeline_stable_diffusion_controlnet.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py index 65428263be4a..923e3a32d029 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py @@ -49,8 +49,9 @@ def preprocess(image, width, height): image = [np.array(i.resize((width, height), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image] image = np.concatenate(image, axis=0) image = 
np.array(image).astype(np.float32) / 255.0 + image = image[:, :, :, ::-1] # RGB -> BGR image = image.transpose(0, 3, 1, 2) - image = torch.from_numpy(image) + image = torch.from_numpy(image.copy()) # copy: ::-1 workaround elif isinstance(image[0], torch.Tensor): image = torch.cat(image, dim=0) return image From 91623a9027ab94aedb49f0d6fe8808597a19be38 Mon Sep 17 00:00:00 2001 From: William Berman Date: Fri, 24 Feb 2023 13:19:40 -0800 Subject: [PATCH 058/122] fixes --- .../test_stable_diffusion_controlnet.py | 105 +----------------- 1 file changed, 4 insertions(+), 101 deletions(-) diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py index c3d9bca073e2..a9fe9985adaa 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py @@ -19,7 +19,6 @@ import unittest import numpy as np -import PIL.Image import torch from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer @@ -39,9 +38,6 @@ from ...test_pipelines_common import PipelineTesterMixin -torch.backends.cuda.matmul.allow_tf32 = False - - class StableDiffusionControlNetPipelineFastTests(PipelineTesterMixin, unittest.TestCase): pipeline_class = StableDiffusionControlNetPipeline @@ -135,44 +131,6 @@ def get_dummy_inputs(self, device, seed=0): return inputs - def get_dummy_components_for_controlnet(self): - components = self.get_dummy_components() - # vae_scale_factor 8 version - # this for ControlNetInputHintBlock accepts only vae_scale_factor=8 - components["vae"] = AutoencoderKL( - block_out_channels=[32, 64, 64, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D", "DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - return components - - def get_dummy_inputs_for_controlnet(self, device, seed=0, num_of_prompts=1, num_images_per_prompt=1): - inputs = self.get_dummy_inputs(device, seed) - vae_scale_factor = 8 - if num_of_prompts > 1: - inputs["prompt"] = [f"a photo of {i} cats" for i in range(num_of_prompts)] - - controlnet_hint = torch.randn( - (num_of_prompts * num_images_per_prompt, 3, 32 * vae_scale_factor, 32 * vae_scale_factor), - generator=inputs["generator"], - ) - - controlnet_hint = controlnet_hint.detach().numpy().copy() - images = np.zeros_like(controlnet_hint, dtype=np.uint8) - images[controlnet_hint > 0.5] = 255 - images = images.transpose(0, 3, 2, 1) # b c h w -> b w h c - if images.shape[0] == 1: - controlnet_hint = PIL.Image.fromarray(images[0]) # PIL.Image - else: - controlnet_hint = [PIL.Image.fromarray(images[b]) for b in range(images.shape[0])] # List of PIL.Image - - inputs["image"] = controlnet_hint - inputs["num_images_per_prompt"] = num_images_per_prompt - return inputs - def test_attention_slicing_forward_pass(self): return self._test_attention_slicing_forward_pass(expected_max_diff=2e-3) @@ -186,79 +144,24 @@ def test_xformers_attention_forwardGenerator_pass(self): def test_inference_batch_single_identical(self): self._test_inference_batch_single_identical(expected_max_diff=2e-3) - def test_stable_diffusion_controlnet_ddim(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - - vae_scale_factor = 8 - components = self.get_dummy_components_for_controlnet() - sd_pipe = 
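The `.copy()` added next to the channel flip is not cosmetic: `image[:, :, :, ::-1]` yields a negatively strided NumPy view, and `torch.from_numpy` refuses such arrays, so the data has to be materialised first. A standalone illustration with random stand-in data:

```python
import numpy as np
import torch

rgb = np.random.rand(1, 64, 64, 3).astype(np.float32)  # NHWC, RGB channel order
bgr_view = rgb[:, :, :, ::-1]  # reversed channel axis, i.e. a negatively strided view

# torch.from_numpy(bgr_view) would raise here; copying produces a contiguous array
bgr = torch.from_numpy(bgr_view.copy())
assert torch.equal(bgr[..., 0], torch.from_numpy(rgb[..., 2].copy()))
```

A later patch in this same series ("rgb channel ordering by default") removes the flip again, so the pipeline ends up consuming control images in RGB order.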
StableDiffusionControlNetPipeline(**components) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs_for_controlnet(device) - output = sd_pipe(**inputs) - image = output.images - - image_slice = image[0, -3:, -3:, -1] - # print("image_slice", image_slice) - - assert image.shape == (1, 32 * vae_scale_factor, 32 * vae_scale_factor, 3) - expected_slice = np.array( - [0.47653976, 0.4843403, 0.46522307, 0.39793792, 0.454136, 0.4749748, 0.37724984, 0.4025603, 0.47651842] - ) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_controlnet_ddim_two_prompts(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - - vae_scale_factor = 8 - components = self.get_dummy_components_for_controlnet() - sd_pipe = StableDiffusionControlNetPipeline(**components) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs_for_controlnet(device, num_of_prompts=2) - output = sd_pipe(**inputs) - image = output.images - - image_slice0 = image[0, -3:, -3:, -1] - image_slice1 = image[1, -3:, -3:, -1] - - # print("image_slice0", image_slice0) - # print("image_slice1", image_slice1) - - assert image.shape == (2, 32 * vae_scale_factor, 32 * vae_scale_factor, 3) - - expected_slice0 = np.array( - [0.4394728, 0.46073985, 0.49796283, 0.52271855, 0.51414967, 0.5314792, 0.47262335, 0.47206822, 0.48990324] - ) - expected_slice1 = np.array( - [0.5315275, 0.4819456, 0.4750305, 0.4453807, 0.44164768, 0.47079763, 0.40049344, 0.39453578, 0.47368276] - ) - - assert np.abs(image_slice0.flatten() - expected_slice0).max() < 1e-2 - assert np.abs(image_slice1.flatten() - expected_slice1).max() < 1e-2 - + # TODO(will) - remove def test_stable_diffusion_controlnet_ddim_two_images_per_prompt(self): device = "cpu" # ensure determinism for the device-dependent torch.Generator vae_scale_factor = 8 - components = self.get_dummy_components_for_controlnet() + components = self.get_dummy_components() sd_pipe = StableDiffusionControlNetPipeline(**components) sd_pipe = sd_pipe.to(torch_device) sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs_for_controlnet(device, num_images_per_prompt=2) + inputs = self.get_dummy_inputs(device) + inputs["num_images_per_prompt"] = 2 output = sd_pipe(**inputs) image = output.images image_slice0 = image[0, -3:, -3:, -1] image_slice1 = image[1, -3:, -3:, -1] - # print("image_slice0", image_slice0) - # print("image_slice1", image_slice1) - assert image.shape == (2, 32 * vae_scale_factor, 32 * vae_scale_factor, 3) expected_slice0 = np.array( From 855580db326cdb99f12a21f5786b013b24c009ee Mon Sep 17 00:00:00 2001 From: William Berman Date: Fri, 24 Feb 2023 16:06:51 -0800 Subject: [PATCH 059/122] remove manual num_images_per_prompt test --- .../test_stable_diffusion_controlnet.py | 30 ------------------- 1 file changed, 30 deletions(-) diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py index a9fe9985adaa..a31b6fbbb47e 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py @@ -144,36 +144,6 @@ def test_xformers_attention_forwardGenerator_pass(self): def test_inference_batch_single_identical(self): self._test_inference_batch_single_identical(expected_max_diff=2e-3) - # TODO(will) - remove - 
def test_stable_diffusion_controlnet_ddim_two_images_per_prompt(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - - vae_scale_factor = 8 - components = self.get_dummy_components() - sd_pipe = StableDiffusionControlNetPipeline(**components) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - inputs["num_images_per_prompt"] = 2 - output = sd_pipe(**inputs) - image = output.images - - image_slice0 = image[0, -3:, -3:, -1] - image_slice1 = image[1, -3:, -3:, -1] - - assert image.shape == (2, 32 * vae_scale_factor, 32 * vae_scale_factor, 3) - - expected_slice0 = np.array( - [0.44349974, 0.46209368, 0.4967181, 0.5238648, 0.5147134, 0.5299364, 0.47317895, 0.47206104, 0.48903918] - ) - expected_slice1 = np.array( - [0.5333272, 0.48134372, 0.47437134, 0.44782317, 0.44065917, 0.4701641, 0.40167314, 0.39400867, 0.47319612] - ) - - assert np.abs(image_slice0.flatten() - expected_slice0).max() < 1e-2 - assert np.abs(image_slice1.flatten() - expected_slice1).max() < 1e-2 - @slow @require_torch_gpu From 788e03df9607830318c52699905fc198093afc58 Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Sun, 26 Feb 2023 00:58:41 +0900 Subject: [PATCH 060/122] add document --- .../stable_diffusion/control_net.mdx | 56 ++++++++++++++++++- 1 file changed, 53 insertions(+), 3 deletions(-) diff --git a/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx b/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx index f49f2a47d33a..4718f4ae8749 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx +++ b/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx @@ -19,11 +19,61 @@ ControlNet by [@lllyasviel](https://huggingface.co/lllyasviel) is a neural netwo It has integration with Stable Diffusion and 8 pre-trained models that conditions the models on different attributes (such as edge detection, scribbles, depth maps, semantic segmentations and more) -The original codebase can be found here: +The original codebase/paper can be found here: - [lllyasviel/ControlNet](https://github.com/lllyasviel/ControlNet) +- [Paper](https://arxiv.org/abs/2302.05543) -Available Checkpoints are: -- TODO: fill here. 
+ +## Available converted checkpoints +- *ControlNet+SD1.5 using canny edge detection:* [takuma104/control_sd15_canny](https://huggingface.co/takuma104/control_sd15_canny) +- *ControlNet+SD1.5 using Midas depth estimation:* [takuma104/control_sd15_depth](https://huggingface.co/takuma104/control_sd15_depth) +- *ControlNet+SD1.5 using HED edge detection (soft edge):* [takuma104/control_sd15_hed](https://huggingface.co/takuma104/control_sd15_hed) +- *ControlNet+SD1.5 using M-LSD line detection:* [takuma104/control_sd15_mlsd](https://huggingface.co/takuma104/control_sd15_mlsd) +- *ControlNet+SD1.5 using normal map:* [takuma104/control_sd15_normal](https://huggingface.co/takuma104/control_sd15_normal) +- *ControlNet+SD1.5 using OpenPose pose detection:* [takuma104/control_sd15_openpose](https://huggingface.co/takuma104/control_sd15_openpose) +- *ControlNet+SD1.5 using human scribbles:* [takuma104/control_sd15_scribble](https://huggingface.co/takuma104/control_sd15_scribble) +- *ControlNet+SD1.5 using semantic segmentation:* [takuma104/control_sd15_seg](https://huggingface.co/takuma104/control_sd15_seg) + +## Resources +- [Colab Notebook Example](https://colab.research.google.com/drive/1AiR7Q-sBqO88NCyswpfiuwXZc7DfMyKA?usp=sharing) +- [controlnet_hinter](https://github.com/takuma104/controlnet_hinter): Image Preprocess Library for ControlNet + +## Usage example + +- Basic Example (Canny Edge) +```python +from diffusers import StableDiffusionControlNetPipeline +from diffusers.utils import load_image + +# Canny edged image for control +canny_edged_image = load_image( + "https://huggingface.co/takuma104/controlnet_dev/resolve/main/vermeer_canny_edged.png" +) +pipe = StableDiffusionControlNetPipeline.from_pretrained("takuma104/control_sd15_canny").to("cuda") +image = pipe(prompt="best quality, extremely detailed", image=canny_edged_image).images[0] +image.save("generated.png") +``` + +- Using SD1.x variant model to control +```py +from diffusers import StableDiffusionControlNetPipeline, AutoencoderKL, UNet2DConditionModel +from diffusers.utils import load_image + +# Canny edged image for control +canny_edged_image = load_image( + "https://huggingface.co/takuma104/controlnet_dev/resolve/main/vermeer_canny_edged.png" +) + +base_model_id = 'prompthero/openjourney' # an example: openjourney model +vae = AutoencoderKL.from_pretrained(base_model_id, subfolder="vae").to("cuda") +unet = UNet2DConditionModel.from_pretrained(base_model_id, subfolder="unet").to("cuda") + +pipe = StableDiffusionControlNetPipeline.from_pretrained("takuma104/control_sd15_canny", + unet=unet, vae=vae).to("cuda") +image = pipe(prompt="best quality, extremely detailed", image=canny_edged_image, + width=512, height=512).images[0] +image.save("generated.png") +``` [[autodoc]] StableDiffusionControlNetPipeline - all From 42ebc458d0e474da226779957ce875641998b6db Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Sun, 26 Feb 2023 01:12:32 +0900 Subject: [PATCH 061/122] add `image` argument docstring --- .../stable_diffusion/pipeline_stable_diffusion_controlnet.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py index 923e3a32d029..ded847dbcd3f 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py @@ -502,6 +502,10 @@ def __call__( prompt 
(`str` or `List[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. + image (`torch.FloatTensor`, `PIL.Image.Image`, `List[torch.FloatTensor]` or `List[PIL.Image.Image]`): + The ControlNet input condition. ControlNet uses this input condition to generate guidance to Unet. If + the type is is specified as `Torch.FloatTensor`, it is passed to ControlNet as is. PIL.Image.Image` can + also be accepted as an image. The control image is automatically resized to fit the output image. height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): The height in pixels of the generated image. width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): From 8bb964db278e783fd3eb99e2d5f15a838722a062 Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Sun, 26 Feb 2023 01:12:46 +0900 Subject: [PATCH 062/122] make style --- .../pipelines/stable_diffusion/control_net.mdx | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx b/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx index 4718f4ae8749..f15a3170b865 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx +++ b/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx @@ -46,9 +46,7 @@ from diffusers import StableDiffusionControlNetPipeline from diffusers.utils import load_image # Canny edged image for control -canny_edged_image = load_image( - "https://huggingface.co/takuma104/controlnet_dev/resolve/main/vermeer_canny_edged.png" -) +canny_edged_image = load_image("https://huggingface.co/takuma104/controlnet_dev/resolve/main/vermeer_canny_edged.png") pipe = StableDiffusionControlNetPipeline.from_pretrained("takuma104/control_sd15_canny").to("cuda") image = pipe(prompt="best quality, extremely detailed", image=canny_edged_image).images[0] image.save("generated.png") @@ -60,18 +58,14 @@ from diffusers import StableDiffusionControlNetPipeline, AutoencoderKL, UNet2DCo from diffusers.utils import load_image # Canny edged image for control -canny_edged_image = load_image( - "https://huggingface.co/takuma104/controlnet_dev/resolve/main/vermeer_canny_edged.png" -) +canny_edged_image = load_image("https://huggingface.co/takuma104/controlnet_dev/resolve/main/vermeer_canny_edged.png") -base_model_id = 'prompthero/openjourney' # an example: openjourney model +base_model_id = "prompthero/openjourney" # an example: openjourney model vae = AutoencoderKL.from_pretrained(base_model_id, subfolder="vae").to("cuda") unet = UNet2DConditionModel.from_pretrained(base_model_id, subfolder="unet").to("cuda") -pipe = StableDiffusionControlNetPipeline.from_pretrained("takuma104/control_sd15_canny", - unet=unet, vae=vae).to("cuda") -image = pipe(prompt="best quality, extremely detailed", image=canny_edged_image, - width=512, height=512).images[0] +pipe = StableDiffusionControlNetPipeline.from_pretrained("takuma104/control_sd15_canny", unet=unet, vae=vae).to("cuda") +image = pipe(prompt="best quality, extremely detailed", image=canny_edged_image, width=512, height=512).images[0] image.save("generated.png") ``` From 49024f6664b913ead093100317ada0e1fa9368c1 Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Sun, 26 Feb 2023 02:06:26 +0900 Subject: [PATCH 063/122] Add line to correct conversion --- docs/source/en/api/pipelines/stable_diffusion/control_net.mdx | 2 ++ 1 file changed, 2 insertions(+) diff --git 
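The usage examples above start from a ready-made canny image hosted on the Hub. When preparing a control image yourself, an edge map in the expected three-channel PIL format can be built with OpenCV; the input file name and the thresholds below are illustrative assumptions, and the `controlnet_hinter` library linked in the docs provides the same kind of preprocessing:

```python
import cv2
import numpy as np
from PIL import Image

source = np.array(Image.open("input.png").convert("RGB").resize((512, 512)))  # any RGB photo
edges = cv2.Canny(source, 100, 200)      # uint8 edge map of shape (H, W)
edges = np.stack([edges] * 3, axis=-1)   # replicate to three channels
control_image = Image.fromarray(edges)   # pass this as the `image=` argument of the pipeline
```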
a/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx b/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx index f15a3170b865..d3b592f96587 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx +++ b/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx @@ -25,6 +25,7 @@ The original codebase/paper can be found here: ## Available converted checkpoints + - *ControlNet+SD1.5 using canny edge detection:* [takuma104/control_sd15_canny](https://huggingface.co/takuma104/control_sd15_canny) - *ControlNet+SD1.5 using Midas depth estimation:* [takuma104/control_sd15_depth](https://huggingface.co/takuma104/control_sd15_depth) - *ControlNet+SD1.5 using HED edge detection (soft edge):* [takuma104/control_sd15_hed](https://huggingface.co/takuma104/control_sd15_hed) @@ -35,6 +36,7 @@ The original codebase/paper can be found here: - *ControlNet+SD1.5 using semantic segmentation:* [takuma104/control_sd15_seg](https://huggingface.co/takuma104/control_sd15_seg) ## Resources + - [Colab Notebook Example](https://colab.research.google.com/drive/1AiR7Q-sBqO88NCyswpfiuwXZc7DfMyKA?usp=sharing) - [controlnet_hinter](https://github.com/takuma104/controlnet_hinter): Image Preprocess Library for ControlNet From 30f7570c1d351e11b78ca0b7ec99a39ef860d1a0 Mon Sep 17 00:00:00 2001 From: William Berman Date: Sat, 25 Feb 2023 13:43:57 -0800 Subject: [PATCH 064/122] add controlnet_conditioning_scale (aka control_scales strength) --- .../pipeline_stable_diffusion_controlnet.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py index ded847dbcd3f..aecdc46f8ec8 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py @@ -494,6 +494,7 @@ def __call__( callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, + controlnet_conditioning_scale: float = 1.0, ): r""" Function invoked when calling the pipeline for generation. @@ -558,6 +559,9 @@ def __call__( A kwargs dictionary that if specified is passed along to the `AttnProcessor` as defined under `self.processor` in [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). + controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): + The outputs of the controlnet are multiplied by `controlnet_conditioning_scale` before they are added + to the residual in the original unet. 
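As the docstring says, the scale is a single multiplier applied to every ControlNet residual before it is added into the UNet, so values below 1.0 loosen the conditioning and values above 1.0 strengthen it. A hedged usage sketch, reusing the checkpoint and control image from the documentation earlier in this series:

```python
from diffusers import StableDiffusionControlNetPipeline
from diffusers.utils import load_image

canny_image = load_image("https://huggingface.co/takuma104/controlnet_dev/resolve/main/vermeer_canny_edged.png")
pipe = StableDiffusionControlNetPipeline.from_pretrained("takuma104/control_sd15_canny").to("cuda")

# 1.0 is the default; 0.5 halves every residual and weakens the edge guidance
image = pipe(
    "best quality, extremely detailed",
    image=canny_image,
    controlnet_conditioning_scale=0.5,
).images[0]
image.save("generated_scaled.png")
```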
Examples: @@ -646,6 +650,12 @@ def __call__( return_dict=False, ) + down_block_res_samples = [ + down_block_res_sample * controlnet_conditioning_scale + for down_block_res_sample in down_block_res_samples + ] + mid_block_res_sample *= controlnet_conditioning_scale + # predict the noise residual noise_pred = self.unet( latent_model_input, From e169f3229e8f3ffac8632c9e7391a244bc22ff5e Mon Sep 17 00:00:00 2001 From: William Berman Date: Sat, 25 Feb 2023 14:04:05 -0800 Subject: [PATCH 065/122] rgb channel ordering by default --- .../stable_diffusion/pipeline_stable_diffusion_controlnet.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py index aecdc46f8ec8..b5217ad97f17 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py @@ -49,9 +49,8 @@ def preprocess(image, width, height): image = [np.array(i.resize((width, height), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image] image = np.concatenate(image, axis=0) image = np.array(image).astype(np.float32) / 255.0 - image = image[:, :, :, ::-1] # RGB -> BGR image = image.transpose(0, 3, 1, 2) - image = torch.from_numpy(image.copy()) # copy: ::-1 workaround + image = torch.from_numpy(image) elif isinstance(image[0], torch.Tensor): image = torch.cat(image, dim=0) return image From e1b8b49efd2d4ea3a36606e00bed28ff65ffa099 Mon Sep 17 00:00:00 2001 From: William Berman Date: Sat, 25 Feb 2023 15:53:41 -0800 Subject: [PATCH 066/122] image batching logic --- .../pipeline_stable_diffusion_controlnet.py | 94 ++++++++++++++----- 1 file changed, 72 insertions(+), 22 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py index b5217ad97f17..27422f50f60f 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py @@ -39,23 +39,6 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name -def preprocess(image, width, height): - if isinstance(image, torch.Tensor): - return image - elif isinstance(image, PIL.Image.Image): - image = [image] - - if isinstance(image[0], PIL.Image.Image): - image = [np.array(i.resize((width, height), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image] - image = np.concatenate(image, axis=0) - image = np.array(image).astype(np.float32) / 255.0 - image = image.transpose(0, 3, 1, 2) - image = torch.from_numpy(image) - elif isinstance(image[0], torch.Tensor): - image = torch.cat(image, dim=0) - return image - - EXAMPLE_DOC_STRING = """ Examples: ```py @@ -405,10 +388,10 @@ def prepare_extra_step_kwargs(self, generator, eta): extra_step_kwargs["generator"] = generator return extra_step_kwargs - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs def check_inputs( self, prompt, + image, height, width, callback_steps, @@ -453,6 +436,67 @@ def check_inputs( f" {negative_prompt_embeds.shape}." 
) + image_is_pil = isinstance(image, PIL.Image.Image) + image_is_tensor = isinstance(image, torch.Tensor) + image_is_pil_list = isinstance(image, list) and isinstance(image[0], PIL.Image.Image) + image_is_tensor_list = isinstance(image, list) and isinstance(image[0], torch.Tensor) + + if not image_is_pil and not image_is_tensor and not image_is_pil_list and not image_is_tensor_list: + raise TypeError( + "image must be one of PIL image, torch tensor, list of PIL images, or list of torch tensors" + ) + + if image_is_pil: + image_batch_size = 1 + elif image_is_tensor: + image_batch_size = image.shape[0] + elif image_is_pil_list: + image_batch_size = len(image) + elif image_is_tensor_list: + image_batch_size = len(image) + + if prompt is not None and isinstance(prompt, str): + prompt_batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + prompt_batch_size = len(prompt) + elif prompt_embeds is not None: + prompt_batch_size = prompt_embeds.shape[0] + + if image_batch_size != 1 and image_batch_size != prompt_batch_size: + raise ValueError( + f"If image batch size is not 1, image batch size must be same as prompt batch size. image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}" + ) + + def prepare_image(self, image, width, height, batch_size, num_images_per_prompt, device, dtype): + if not isinstance(image, torch.Tensor): + if isinstance(image, PIL.Image.Image): + image = [image] + + if isinstance(image[0], PIL.Image.Image): + image = [ + np.array(i.resize((width, height), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image + ] + image = np.concatenate(image, axis=0) + image = np.array(image).astype(np.float32) / 255.0 + image = image.transpose(0, 3, 1, 2) + image = torch.from_numpy(image) + elif isinstance(image[0], torch.Tensor): + image = torch.cat(image, dim=0) + + image_batch_size = image.shape[0] + + if image_batch_size == 1: + repeat_by = batch_size + else: + # image batch size is the same as prompt batch size + repeat_by = num_images_per_prompt + + image = image.repeat_interleave(repeat_by, dim=0) + + image = image.to(device=device, dtype=dtype) + + return image + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) @@ -577,7 +621,7 @@ def __call__( # 1. Check inputs. Raise error if not correct self.check_inputs( - prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds + prompt, image, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds ) # 2. Define call parameters @@ -606,9 +650,15 @@ def __call__( ) # 4. 
Prepare image - image = preprocess(image, width, height) - - image = image.to(device=device, dtype=self.controlnet.dtype) + image = self.prepare_image( + image, + width, + height, + batch_size * num_images_per_prompt, + num_images_per_prompt, + device, + self.controlnet.dtype, + ) if do_classifier_free_guidance: image = torch.cat([image] * 2) From f5cd24a2af71dd7b0c039fd1f543ed7a82462fd7 Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Tue, 28 Feb 2023 01:46:42 +0900 Subject: [PATCH 067/122] Add control image descriptions for each checkpoint --- .../stable_diffusion/control_net.mdx | 32 +++++++++++++------ 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx b/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx index d3b592f96587..a6caec310285 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx +++ b/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx @@ -20,20 +20,32 @@ It has integration with Stable Diffusion and 8 pre-trained models that condition (such as edge detection, scribbles, depth maps, semantic segmentations and more) The original codebase/paper can be found here: -- [lllyasviel/ControlNet](https://github.com/lllyasviel/ControlNet) +- [Code](https://github.com/lllyasviel/ControlNet) - [Paper](https://arxiv.org/abs/2302.05543) -## Available converted checkpoints +## Available checkpoints + +ControlNet requires a *control image* in addition to the text-to-image *prompt*. +Each pretrained model is trained using a different method. +For this reason, the image content passed to the `image` argument of `pipe()` is different. +See the overview and image examples. + +All checkpoints are converted from [lllyasviel/ControlNet](https://huggingface.co/lllyasviel/ControlNet). + +### ControlNet + SD1.5 + +| Model Name | Control Image Overview| Control Image Example +|---|---|---| +|[takuma104/control_sd15_canny](https://huggingface.co/takuma104/control_sd15_canny)
*Trained with canny edge detection* | A monochrome image with white edges on a black background. | | +|[takuma104/control_sd15_depth](https://huggingface.co/takuma104/control_sd15_depth)
*Trained with Midas depth estimation* |A grayscale image with black representing deep areas and white representing shallow areas.|| +|[takuma104/control_sd15_hed](https://huggingface.co/takuma104/control_sd15_hed)
*Trained with HED edge detection (soft edge)* |A monochrome image with white soft edges on a black background.|| +|[takuma104/control_sd15_mlsd](https://huggingface.co/takuma104/control_sd15_mlsd)
*Trained with M-LSD line detection* |A monochrome image composed only of white straight lines on a black background.|| +|[takuma104/control_sd15_normal](https://huggingface.co/takuma104/control_sd15_normal)
*Trained with normal map* |A [normal mapped](https://en.wikipedia.org/wiki/Normal_mapping) image.|| +|[takuma104/control_sd15_openpose](https://huggingface.co/takuma104/control_sd15_openpose)
*Trained with OpenPose bone image* |A [OpenPose bone](https://github.com/CMU-Perceptual-Computing-Lab/openpose) image.|| +|[takuma104/control_sd15_scribble](https://huggingface.co/takuma104/control_sd15_scribble)
*Trained with human scribbles* |A hand-drawn monochrome image with white outlines on a black background.|| +|[takuma104/control_sd15_seg](https://huggingface.co/takuma104/control_sd15_seg)
*Trained with semantic segmentation* |An [ADE20K](https://groups.csail.mit.edu/vision/datasets/ADE20K/)'s segmentation protocol image.|| -- *ControlNet+SD1.5 using canny edge detection:* [takuma104/control_sd15_canny](https://huggingface.co/takuma104/control_sd15_canny) -- *ControlNet+SD1.5 using Midas depth estimation:* [takuma104/control_sd15_depth](https://huggingface.co/takuma104/control_sd15_depth) -- *ControlNet+SD1.5 using HED edge detection (soft edge):* [takuma104/control_sd15_hed](https://huggingface.co/takuma104/control_sd15_hed) -- *ControlNet+SD1.5 using M-LSD line detection:* [takuma104/control_sd15_mlsd](https://huggingface.co/takuma104/control_sd15_mlsd) -- *ControlNet+SD1.5 using normal map:* [takuma104/control_sd15_normal](https://huggingface.co/takuma104/control_sd15_normal) -- *ControlNet+SD1.5 using OpenPose pose detection:* [takuma104/control_sd15_openpose](https://huggingface.co/takuma104/control_sd15_openpose) -- *ControlNet+SD1.5 using human scribbles:* [takuma104/control_sd15_scribble](https://huggingface.co/takuma104/control_sd15_scribble) -- *ControlNet+SD1.5 using semantic segmentation:* [takuma104/control_sd15_seg](https://huggingface.co/takuma104/control_sd15_seg) ## Resources From 8f01ca12a7b3b8b6d4710005bb7c972c0289ad44 Mon Sep 17 00:00:00 2001 From: William Berman Date: Mon, 27 Feb 2023 16:15:55 -0800 Subject: [PATCH 068/122] Only save controlnet model in conversion script --- scripts/convert_original_stable_diffusion_to_diffusers.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/scripts/convert_original_stable_diffusion_to_diffusers.py b/scripts/convert_original_stable_diffusion_to_diffusers.py index d0f4b8efa93e..6f6488e7b26d 100644 --- a/scripts/convert_original_stable_diffusion_to_diffusers.py +++ b/scripts/convert_original_stable_diffusion_to_diffusers.py @@ -142,4 +142,9 @@ clip_stats_path=args.clip_stats_path, controlnet=args.controlnet, ) - pipe.save_pretrained(args.dump_path, safe_serialization=args.to_safetensors) + + if args.controlnet: + # only save the controlnet model + pipe.controlnet.save_pretrained(args.dump_path, safe_serialization=args.to_safetensors) + else: + pipe.save_pretrained(args.dump_path, safe_serialization=args.to_safetensors) From ca4378e1495c79f6a82c94277ddff7bd9bb971cc Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Tue, 28 Feb 2023 22:03:41 +0900 Subject: [PATCH 069/122] Update src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py typo Co-authored-by: Pedro Cuenca --- .../stable_diffusion/pipeline_stable_diffusion_controlnet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py index 27422f50f60f..2de3148b6c5a 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py @@ -548,7 +548,7 @@ def __call__( instead. image (`torch.FloatTensor`, `PIL.Image.Image`, `List[torch.FloatTensor]` or `List[PIL.Image.Image]`): The ControlNet input condition. ControlNet uses this input condition to generate guidance to Unet. If - the type is is specified as `Torch.FloatTensor`, it is passed to ControlNet as is. PIL.Image.Image` can + the type is specified as `Torch.FloatTensor`, it is passed to ControlNet as is. PIL.Image.Image` can also be accepted as an image. 
The control image is automatically resized to fit the output image. height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): The height in pixels of the generated image. From 1799d830e3cffc6f55bc47d1d84c11b125606fbb Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Tue, 28 Feb 2023 22:04:28 +0900 Subject: [PATCH 070/122] Update docs/source/en/api/pipelines/stable_diffusion/control_net.mdx Co-authored-by: Pedro Cuenca --- docs/source/en/api/pipelines/stable_diffusion/control_net.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx b/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx index a6caec310285..95b4f7cf9153 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx +++ b/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx @@ -16,7 +16,7 @@ specific language governing permissions and limitations under the License. ControlNet by [@lllyasviel](https://huggingface.co/lllyasviel) is a neural network structure to control diffusion models by adding extra conditions. -It has integration with Stable Diffusion and 8 pre-trained models that conditions the models on different attributes +There are 8 pre-trained ControlNet models that were trained to condition the original Stable Diffusion model on different inputs, (such as edge detection, scribbles, depth maps, semantic segmentations and more) The original codebase/paper can be found here: From d7b95cf23b5f5417be417e7a2e78f428df1cb0a1 Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Tue, 28 Feb 2023 22:04:36 +0900 Subject: [PATCH 071/122] Update docs/source/en/api/pipelines/stable_diffusion/control_net.mdx Co-authored-by: Pedro Cuenca --- docs/source/en/api/pipelines/stable_diffusion/control_net.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx b/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx index 95b4f7cf9153..451f8ecc8d31 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx +++ b/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx @@ -10,7 +10,7 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o specific language governing permissions and limitations under the License. --> -# Text-to-Image Generation with ControlNet guidance +# Text-to-Image Generation with ControlNet Conditioning ## StableDiffusionControlNetPipeline From 2e86e1f2e93ea435d78a78c2bdef54978c747966 Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Tue, 28 Feb 2023 22:05:17 +0900 Subject: [PATCH 072/122] Update docs/source/en/api/pipelines/stable_diffusion/control_net.mdx Co-authored-by: Pedro Cuenca --- docs/source/en/api/pipelines/stable_diffusion/control_net.mdx | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx b/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx index 451f8ecc8d31..f6ea05e5c060 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx +++ b/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx @@ -17,7 +17,9 @@ specific language governing permissions and limitations under the License. ControlNet by [@lllyasviel](https://huggingface.co/lllyasviel) is a neural network structure to control diffusion models by adding extra conditions. 
There are 8 pre-trained ControlNet models that were trained to condition the original Stable Diffusion model on different inputs, -(such as edge detection, scribbles, depth maps, semantic segmentations and more) +such as edge detection, scribbles, depth maps, semantic segmentations and more. + +Using the pretrained models we can provide control images (for example, a depth map) to control Stable Diffusion text-to-image generation so that it follows the structure of the depth image and fills in the details. The original codebase/paper can be found here: - [Code](https://github.com/lllyasviel/ControlNet) From bb030693da8d7a5141306c89f37411b8d2934b4b Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Tue, 28 Feb 2023 22:05:53 +0900 Subject: [PATCH 073/122] Update docs/source/en/api/pipelines/stable_diffusion/control_net.mdx Co-authored-by: Pedro Cuenca --- docs/source/en/api/pipelines/stable_diffusion/control_net.mdx | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx b/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx index f6ea05e5c060..0a44d3f2715d 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx +++ b/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx @@ -68,7 +68,9 @@ image = pipe(prompt="best quality, extremely detailed", image=canny_edged_image) image.save("generated.png") ``` -- Using SD1.x variant model to control +- Controlling custom Stable Diffusion 1.5 models + +In the following example we use PromptHero's [Openjourney model](https://huggingface.co/prompthero/openjourney), which was fine-tuned from the base Stable Diffusion v1.5 model on images from Midjourney. This model has the same structure as Stable Diffusion 1.5 but is capable of producing a different output style. ```py from diffusers import StableDiffusionControlNetPipeline, AutoencoderKL, UNet2DConditionModel from diffusers.utils import load_image From 9a14567b81bd00de505074b2f65d70b2c3e26783 Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Tue, 28 Feb 2023 22:06:14 +0900 Subject: [PATCH 074/122] Update docs/source/en/api/pipelines/stable_diffusion/control_net.mdx Co-authored-by: Pedro Cuenca --- docs/source/en/api/pipelines/stable_diffusion/control_net.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx b/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx index 0a44d3f2715d..0a067f3a2cb5 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx +++ b/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx @@ -29,7 +29,7 @@ The original codebase/paper can be found here: ## Available checkpoints ControlNet requires a *control image* in addition to the text-to-image *prompt*. -Each pretrained model is trained using a different method. +Each pretrained model is trained using a different conditioning method that requires different conditioning images. For example, Canny edge conditioning requires the control image to be the output of a Canny filter, while depth conditioning requires the control image to be a depth mask. For this reason, the image content passed to the `image` argument of `pipe()` is different. See the overview and image examples. 
From 71d0a96453f40f3c8cf18555ef26f2dd66a92485 Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Tue, 28 Feb 2023 22:06:37 +0900 Subject: [PATCH 075/122] Update docs/source/en/api/pipelines/stable_diffusion/control_net.mdx Co-authored-by: Pedro Cuenca --- docs/source/en/api/pipelines/stable_diffusion/control_net.mdx | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx b/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx index 0a067f3a2cb5..3a51ee490f5a 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx +++ b/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx @@ -68,6 +68,7 @@ image = pipe(prompt="best quality, extremely detailed", image=canny_edged_image) image.save("generated.png") ``` +Note that the text prompt does not make any reference to the structure or contents of the image we are generating. Stable Diffusion interprets the control image as an additional input that controls what to generate. - Controlling custom Stable Diffusion 1.5 models In the following example we use PromptHero's [Openjourney model](https://huggingface.co/prompthero/openjourney), which was fine-tuned from the base Stable Diffusion v1.5 model on images from Midjourney. This model has the same structure as Stable Diffusion 1.5 but is capable of producing a different output style. From 16efb00b714ab845bd496d2d65e6c7915f8c2ebf Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Tue, 28 Feb 2023 22:06:55 +0900 Subject: [PATCH 076/122] Update docs/source/en/api/pipelines/stable_diffusion/control_net.mdx Co-authored-by: Pedro Cuenca --- .../source/en/api/pipelines/stable_diffusion/control_net.mdx | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx b/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx index 3a51ee490f5a..12a02801d5ec 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx +++ b/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx @@ -57,6 +57,11 @@ All checkpoints are converted from [lllyasviel/ControlNet](https://huggingface.c ## Usage example - Basic Example (Canny Edge) + +The conditioning image is an outline of the image edges, as detected by a Canny filter. This is the example we'll use to control the generation: + +![White on black edges detected on Vermeer's Girl with a Pearl Earring portrait](https://huggingface.co/takuma104/controlnet_dev/resolve/main/vermeer_canny_edged.png) + ```python from diffusers import StableDiffusionControlNetPipeline from diffusers.utils import load_image From 1b0af7d44b2fd07f1b5e736fd477aa326cdf9ad9 Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Tue, 28 Feb 2023 22:07:27 +0900 Subject: [PATCH 077/122] Update docs/source/en/api/pipelines/stable_diffusion/control_net.mdx Co-authored-by: Pedro Cuenca --- docs/source/en/api/pipelines/stable_diffusion/control_net.mdx | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx b/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx index 12a02801d5ec..7cfd0b3d385e 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx +++ b/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx @@ -30,7 +30,6 @@ The original codebase/paper can be found here: ControlNet requires a *control image* in addition to the text-to-image *prompt*. Each pretrained model is trained using a different conditioning method that requires different conditioning images. 
For example, Canny edge conditioning requires the control image to be the output of a Canny filter, while depth conditioning requires the control image to be a depth mask. -For this reason, the image content passed to the `image` argument of `pipe()` is different. See the overview and image examples. All checkpoints are converted from [lllyasviel/ControlNet](https://huggingface.co/lllyasviel/ControlNet). From 53f4523b0ea8136de6d8d3d62284ce80434712f0 Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Tue, 28 Feb 2023 22:07:46 +0900 Subject: [PATCH 078/122] Update docs/source/en/api/pipelines/stable_diffusion/control_net.mdx Co-authored-by: Pedro Cuenca --- docs/source/en/api/pipelines/stable_diffusion/control_net.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx b/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx index 7cfd0b3d385e..f65256728ac0 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx +++ b/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx @@ -34,7 +34,7 @@ See the overview and image examples. All checkpoints are converted from [lllyasviel/ControlNet](https://huggingface.co/lllyasviel/ControlNet). -### ControlNet + SD1.5 +### ControlNet + Stable Diffusion 1.5 | Model Name | Control Image Overview| Control Image Example |---|---|---| From 161aac28dc6b1c638e9cd95396ed6e6cbbf4eea4 Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Tue, 28 Feb 2023 23:33:57 +0900 Subject: [PATCH 079/122] add gerated image example --- .../stable_diffusion/control_net.mdx | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx b/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx index f65256728ac0..0b3d0cd1985c 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx +++ b/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx @@ -36,16 +36,16 @@ All checkpoints are converted from [lllyasviel/ControlNet](https://huggingface.c ### ControlNet + Stable Diffusion 1.5 -| Model Name | Control Image Overview| Control Image Example -|---|---|---| -|[takuma104/control_sd15_canny](https://huggingface.co/takuma104/control_sd15_canny)
*Trained with canny edge detection* | A monochrome image with white edges on a black background. | | -|[takuma104/control_sd15_depth](https://huggingface.co/takuma104/control_sd15_depth)
*Trained with Midas depth estimation* |A grayscale image with black representing deep areas and white representing shallow areas.|| -|[takuma104/control_sd15_hed](https://huggingface.co/takuma104/control_sd15_hed)
*Trained with HED edge detection (soft edge)* |A monochrome image with white soft edges on a black background.|| -|[takuma104/control_sd15_mlsd](https://huggingface.co/takuma104/control_sd15_mlsd)
*Trained with M-LSD line detection* |A monochrome image composed only of white straight lines on a black background.|| -|[takuma104/control_sd15_normal](https://huggingface.co/takuma104/control_sd15_normal)
*Trained with normal map* |A [normal mapped](https://en.wikipedia.org/wiki/Normal_mapping) image.|| -|[takuma104/control_sd15_openpose](https://huggingface.co/takuma104/control_sd15_openpose)
*Trained with OpenPose bone image* |A [OpenPose bone](https://github.com/CMU-Perceptual-Computing-Lab/openpose) image.|| -|[takuma104/control_sd15_scribble](https://huggingface.co/takuma104/control_sd15_scribble)
*Trained with human scribbles* |A hand-drawn monochrome image with white outlines on a black background.|| -|[takuma104/control_sd15_seg](https://huggingface.co/takuma104/control_sd15_seg)
*Trained with semantic segmentation* |An [ADE20K](https://groups.csail.mit.edu/vision/datasets/ADE20K/)'s segmentation protocol image.|| +| Model Name | Control Image Overview| Control Image Example | Generated Image Example | +|---|---|---|---| +|[takuma104/control_sd15_canny](https://huggingface.co/takuma104/control_sd15_canny)
*Trained with canny edge detection* | A monochrome image with white edges on a black background.||| +|[takuma104/control_sd15_depth](https://huggingface.co/takuma104/control_sd15_depth)
*Trained with Midas depth estimation* |A grayscale image with black representing deep areas and white representing shallow areas.||| +|[takuma104/control_sd15_hed](https://huggingface.co/takuma104/control_sd15_hed)
*Trained with HED edge detection (soft edge)* |A monochrome image with white soft edges on a black background.|| | +|[takuma104/control_sd15_mlsd](https://huggingface.co/takuma104/control_sd15_mlsd)
*Trained with M-LSD line detection* |A monochrome image composed only of white straight lines on a black background.||| +|[takuma104/control_sd15_normal](https://huggingface.co/takuma104/control_sd15_normal)
*Trained with normal map* |A [normal mapped](https://en.wikipedia.org/wiki/Normal_mapping) image.||| +|[takuma104/control_sd15_openpose](https://huggingface.co/takuma104/control_sd15_openpose)
*Trained with OpenPose bone image* |A [OpenPose bone](https://github.com/CMU-Perceptual-Computing-Lab/openpose) image.||| +|[takuma104/control_sd15_scribble](https://huggingface.co/takuma104/control_sd15_scribble)
*Trained with human scribbles* |A hand-drawn monochrome image with white outlines on a black background.|| | +|[takuma104/control_sd15_seg](https://huggingface.co/takuma104/control_sd15_seg)
*Trained with semantic segmentation* |An [ADE20K](https://groups.csail.mit.edu/vision/datasets/ADE20K/)'s segmentation protocol image.|| | ## Resources From 349f3bf0422eaa9191b57509956d1130c1fa1ac1 Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Tue, 28 Feb 2023 23:36:50 +0900 Subject: [PATCH 080/122] a depth mask -> a depth map --- docs/source/en/api/pipelines/stable_diffusion/control_net.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx b/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx index 0b3d0cd1985c..de168a26c4e6 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx +++ b/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx @@ -29,7 +29,7 @@ The original codebase/paper can be found here: ## Available checkpoints ControlNet requires a *control image* in addition to the text-to-image *prompt*. -Each pretrained model is trained using a different conditioning method that requires different conditioning images. For example, Canny edge conditioning requires the control image to be the output of a Canny filter, while depth conditioning requires the control image to be a depth mask. +Each pretrained model is trained using a different conditioning method that requires different conditioning images. For example, Canny edge conditioning requires the control image to be the output of a Canny filter, while depth conditioning requires the control image to be a depth map. See the overview and image examples. All checkpoints are converted from [lllyasviel/ControlNet](https://huggingface.co/lllyasviel/ControlNet). From 3f6e8f7aef1adc44883044e77454d7f64820bc01 Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Tue, 28 Feb 2023 23:41:07 +0900 Subject: [PATCH 081/122] rename control_net.mdx to controlnet.mdx --- docs/source/en/_toctree.yml | 2 +- .../stable_diffusion/{control_net.mdx => controlnet.mdx} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename docs/source/en/api/pipelines/stable_diffusion/{control_net.mdx => controlnet.mdx} (100%) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 8dba589ec56f..2cdfca6cd5fc 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -165,7 +165,7 @@ title: Self-Attention Guidance - local: api/pipelines/stable_diffusion/panorama title: MultiDiffusion Panorama - - local: api/pipelines/stable_diffusion/control_net + - local: api/pipelines/stable_diffusion/controlnet title: Text-to-Image with ControlNet guidance title: Stable Diffusion - local: api/pipelines/stable_diffusion_2 diff --git a/docs/source/en/api/pipelines/stable_diffusion/control_net.mdx b/docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx similarity index 100% rename from docs/source/en/api/pipelines/stable_diffusion/control_net.mdx rename to docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx From ebabcbec6c0f8ad6a08237de0be96627d2666e52 Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Wed, 1 Mar 2023 01:42:04 +0900 Subject: [PATCH 082/122] fix toc title --- docs/source/en/_toctree.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 2cdfca6cd5fc..40ee6c7a4960 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -166,7 +166,7 @@ - local: api/pipelines/stable_diffusion/panorama title: MultiDiffusion Panorama - local: api/pipelines/stable_diffusion/controlnet - title: Text-to-Image with ControlNet guidance 
+ title: Text-to-Image Generation with ControlNet Conditioning title: Stable Diffusion - local: api/pipelines/stable_diffusion_2 title: Stable Diffusion 2 From 30e2bde64f0636848b02d60001b780f3876eb50a Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Wed, 1 Mar 2023 02:36:20 +0900 Subject: [PATCH 083/122] add ControlNet abstruct and link --- .../en/using-diffusers/controlling_generation.mdx | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/docs/source/en/using-diffusers/controlling_generation.mdx b/docs/source/en/using-diffusers/controlling_generation.mdx index 173f51c786d0..598fc3346052 100644 --- a/docs/source/en/using-diffusers/controlling_generation.mdx +++ b/docs/source/en/using-diffusers/controlling_generation.mdx @@ -35,6 +35,8 @@ Unless otherwise mentioned, these are techniques that work with existing models 7. [DreamBooth](#dreambooth) 8. [Textual Inversion](#textual-inversion) 10. [MultiDiffusion Panorama](#panorama) +11. [ControlNet](#controlnet) + ## Instruct pix2pix @@ -132,3 +134,15 @@ MultiDiffusion defines a new generation process over a pre-trained diffusion mod [MultiDiffusion Panorama](../api/pipelines/stable_diffusion/panorama) allows to generate high-quality images at arbitrary aspect ratios (e.g., panoramas). See [here](../api/pipelines/stable_diffusion/panorama) for more information on how to use it to generate panoramic images. + +## ControlNet + +[Paper](https://arxiv.org/abs/2302.05543) + +[ControlNet](../api/pipelines/stable_diffusion/controlnet) is a neural network structure to control diffusion models by adding extra conditions. +There are 8 pre-trained ControlNet models that were trained to condition the original Stable Diffusion model on different inputs, +such as edge detection, scribbles, depth maps, semantic segmentations and more. + +Using the pretrained models we can provide control images (for example, a depth map) to control Stable Diffusion text-to-image generation so that it follows the structure of the depth image and fills in the details. + +See [here](../api/pipelines/stable_diffusion/controlnet) for more information on how to use it. From f099be501a1c110da4ad3c42e86a30d96edfa757 Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Wed, 1 Mar 2023 15:29:44 +0900 Subject: [PATCH 084/122] Update src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py Co-authored-by: dqueue --- .../stable_diffusion/pipeline_stable_diffusion_controlnet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py index 2de3148b6c5a..c2c9c4f4f723 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py @@ -50,7 +50,7 @@ ... "https://huggingface.co/takuma104/controlnet_dev/resolve/main/vermeer_canny_edged.png" ... 
) >>> pipe = StableDiffusionControlNetPipeline.from_pretrained("takuma104/control_sd15_canny").to("cuda") - >>> image = pipe(prompt="best quality, extremely detailed", controlnet_hint=canny_edged_image).images[0] + >>> image = pipe(prompt="best quality, extremely detailed", image=canny_edged_image).images[0] ``` """ From 2b553d941018c49bf23009e3eae5e245ab835038 Mon Sep 17 00:00:00 2001 From: William Berman Date: Tue, 28 Feb 2023 22:45:49 -0800 Subject: [PATCH 085/122] remove controlnet constructor arguments re: @patrickvonplaten --- src/diffusers/models/controlnet.py | 100 ++++-------------- .../stable_diffusion/convert_from_ckpt.py | 2 + .../test_stable_diffusion_controlnet.py | 2 - 3 files changed, 24 insertions(+), 80 deletions(-) diff --git a/src/diffusers/models/controlnet.py b/src/diffusers/models/controlnet.py index 7afa1eccf65d..e034910724cc 100644 --- a/src/diffusers/models/controlnet.py +++ b/src/diffusers/models/controlnet.py @@ -20,13 +20,12 @@ from ..configuration_utils import ConfigMixin, register_to_config from ..utils import BaseOutput, logging from .cross_attention import AttnProcessor -from .embeddings import GaussianFourierProjection, TimestepEmbedding, Timesteps +from .embeddings import TimestepEmbedding, Timesteps from .modeling_utils import ModelMixin from .unet_2d_blocks import ( CrossAttnDownBlock2D, DownBlock2D, UNetMidBlock2DCrossAttn, - UNetMidBlock2DSimpleCrossAttn, get_down_block, ) @@ -50,7 +49,7 @@ class ControlNetConditioningDefaultEmbedding(nn.Module): feature maps ..." """ - def __init__(self, conditioning_channels: int, conditioning_embedding_channels: int): + def __init__(self, conditioning_embedding_channels: int, conditioning_channels: int = 3): super().__init__() self.conditioning_embedder = nn.Sequential( @@ -82,9 +81,7 @@ class ControlNetModel(ModelMixin, ConfigMixin): @register_to_config def __init__( self, - sample_size: Optional[int] = None, in_channels: int = 4, - center_input_sample: bool = False, flip_sin_to_cos: bool = True, freq_shift: int = 0, down_block_types: Tuple[str] = ( @@ -93,7 +90,6 @@ def __init__( "CrossAttnDownBlock2D", "DownBlock2D", ), - mid_block_type: Optional[str] = "UNetMidBlock2DCrossAttn", only_cross_attention: Union[bool, Tuple[bool]] = False, block_out_channels: Tuple[int] = (320, 640, 1280, 1280), layers_per_block: int = 2, @@ -104,24 +100,15 @@ def __init__( norm_eps: float = 1e-5, cross_attention_dim: int = 1280, attention_head_dim: Union[int, Tuple[int]] = 8, - dual_cross_attention: bool = False, use_linear_projection: bool = False, class_embed_type: Optional[str] = None, num_class_embeds: Optional[int] = None, upcast_attention: bool = False, resnet_time_scale_shift: str = "default", - time_embedding_type: str = "positional", - timestep_post_act: Optional[str] = None, - time_cond_proj_dim: Optional[int] = None, - conv_in_kernel: int = 3, projection_class_embeddings_input_dim: Optional[int] = None, - controlnet_conditioning_embedding_type: str = "default", - controlnet_conditioning_channels: int = 3, ): super().__init__() - self.sample_size = sample_size - # Check inputs if len(block_out_channels) != len(down_block_types): raise ValueError( @@ -139,36 +126,22 @@ def __init__( ) # input + conv_in_kernel = 3 conv_in_padding = (conv_in_kernel - 1) // 2 self.conv_in = nn.Conv2d( in_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding ) # time - if time_embedding_type == "fourier": - time_embed_dim = block_out_channels[0] * 2 - if time_embed_dim % 2 != 0: - raise 
ValueError(f"`time_embed_dim` should be divisible by 2, but is {time_embed_dim}.") - self.time_proj = GaussianFourierProjection( - time_embed_dim // 2, set_W_to_weight=False, log=False, flip_sin_to_cos=flip_sin_to_cos - ) - timestep_input_dim = time_embed_dim - elif time_embedding_type == "positional": - time_embed_dim = block_out_channels[0] * 4 + time_embed_dim = block_out_channels[0] * 4 - self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift) - timestep_input_dim = block_out_channels[0] - else: - raise ValueError( - f"{time_embedding_type} does not exist. Please make sure to use one of `fourier` or `positional`." - ) + self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift) + timestep_input_dim = block_out_channels[0] self.time_embedding = TimestepEmbedding( timestep_input_dim, time_embed_dim, act_fn=act_fn, - post_act_fn=timestep_post_act, - cond_proj_dim=time_cond_proj_dim, ) # class embedding @@ -195,15 +168,9 @@ def __init__( self.class_embedding = None # control net conditioning embedding - if controlnet_conditioning_embedding_type == "default": - self.controlnet_cond_embedding = ControlNetConditioningDefaultEmbedding( - conditioning_channels=controlnet_conditioning_channels, - conditioning_embedding_channels=block_out_channels[0], - ) - else: - raise ValueError( - f"unknown `controlnet_conditioning_embedding_type`: {controlnet_conditioning_embedding_type}. Options are 'default'" - ) + self.controlnet_cond_embedding = ControlNetConditioningDefaultEmbedding( + conditioning_embedding_channels=block_out_channels[0], + ) self.down_blocks = nn.ModuleList([]) self.controlnet_down_blocks = nn.ModuleList([]) @@ -239,7 +206,6 @@ def __init__( cross_attention_dim=cross_attention_dim, attn_num_head_channels=attention_head_dim[i], downsample_padding=downsample_padding, - dual_cross_attention=dual_cross_attention, use_linear_projection=use_linear_projection, only_cross_attention=only_cross_attention[i], upcast_attention=upcast_attention, @@ -264,37 +230,19 @@ def __init__( controlnet_block = zero_module(controlnet_block) self.controlnet_mid_block = controlnet_block - if mid_block_type == "UNetMidBlock2DCrossAttn": - self.mid_block = UNetMidBlock2DCrossAttn( - in_channels=mid_block_channel, - temb_channels=time_embed_dim, - resnet_eps=norm_eps, - resnet_act_fn=act_fn, - output_scale_factor=mid_block_scale_factor, - resnet_time_scale_shift=resnet_time_scale_shift, - cross_attention_dim=cross_attention_dim, - attn_num_head_channels=attention_head_dim[-1], - resnet_groups=norm_num_groups, - dual_cross_attention=dual_cross_attention, - use_linear_projection=use_linear_projection, - upcast_attention=upcast_attention, - ) - elif mid_block_type == "UNetMidBlock2DSimpleCrossAttn": - self.mid_block = UNetMidBlock2DSimpleCrossAttn( - in_channels=block_out_channels[-1], - temb_channels=time_embed_dim, - resnet_eps=norm_eps, - resnet_act_fn=act_fn, - output_scale_factor=mid_block_scale_factor, - cross_attention_dim=cross_attention_dim, - attn_num_head_channels=attention_head_dim[-1], - resnet_groups=norm_num_groups, - resnet_time_scale_shift=resnet_time_scale_shift, - ) - elif mid_block_type is None: - self.mid_block = None - else: - raise ValueError(f"unknown mid_block_type : {mid_block_type}") + self.mid_block = UNetMidBlock2DCrossAttn( + in_channels=mid_block_channel, + temb_channels=time_embed_dim, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + output_scale_factor=mid_block_scale_factor, + resnet_time_scale_shift=resnet_time_scale_shift, + 
cross_attention_dim=cross_attention_dim, + attn_num_head_channels=attention_head_dim[-1], + resnet_groups=norm_num_groups, + use_linear_projection=use_linear_projection, + upcast_attention=upcast_attention, + ) @property # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.attn_processors @@ -439,10 +387,6 @@ def forward( attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0 attention_mask = attention_mask.unsqueeze(1) - # 0. center input if necessary - if self.config.center_input_sample: - sample = 2 * sample - 1.0 - # 1. time timesteps = timestep if not torch.is_tensor(timesteps): diff --git a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py index b3e5d3278594..481c69f6749e 100644 --- a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +++ b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py @@ -1226,6 +1226,8 @@ def load_pipeline_from_original_stable_diffusion_ckpt( ctrlnet_config = create_unet_diffusers_config(original_config, image_size=image_size, controlnet=True) ctrlnet_config["upcast_attention"] = upcast_attention + ctrlnet_config.pop("sample_size") + controlnet_model = ControlNetModel(**ctrlnet_config) converted_ctrl_checkpoint = convert_ldm_unet_checkpoint( diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py index a31b6fbbb47e..3eaf2ab24eb5 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py @@ -57,11 +57,9 @@ def get_dummy_components(self): controlnet = ControlNetModel( block_out_channels=(32, 64), layers_per_block=2, - sample_size=32, in_channels=4, down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), cross_attention_dim=32, - controlnet_conditioning_channels=3, ) torch.manual_seed(0) scheduler = DDIMScheduler( From d49296ca7cfa573abb555604dceba5532bea2dad Mon Sep 17 00:00:00 2001 From: Will Berman Date: Wed, 1 Mar 2023 08:07:15 +0000 Subject: [PATCH 086/122] [integration tests] test canny --- .../test_stable_diffusion_controlnet.py | 309 ++---------------- 1 file changed, 31 insertions(+), 278 deletions(-) diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py index 3eaf2ab24eb5..ff717d2574df 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py @@ -14,8 +14,6 @@ # limitations under the License. 
import gc -import tempfile -import time import unittest import numpy as np @@ -26,12 +24,10 @@ AutoencoderKL, ControlNetModel, DDIMScheduler, - DPMSolverMultistepScheduler, - LMSDiscreteScheduler, StableDiffusionControlNetPipeline, UNet2DConditionModel, ) -from diffusers.utils import randn_tensor, slow, torch_device +from diffusers.utils import load_image, load_numpy, randn_tensor, slow, torch_device from diffusers.utils.import_utils import is_xformers_available from diffusers.utils.testing_utils import require_torch_gpu @@ -146,300 +142,57 @@ def test_inference_batch_single_identical(self): @slow @require_torch_gpu class StableDiffusionControlNetPipelineSlowTests(unittest.TestCase): - model_id = "takuma104/control_sd15_canny" - controlnet_memsize = 1451078656 # in float32, https://gist.github.com/takuma104/ce954bde6511a1f0b031a87a646b1f7d - def tearDown(self): super().tearDown() gc.collect() torch.cuda.empty_cache() - def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): - generator = torch.Generator(device=generator_device).manual_seed(seed) - latents = torch.randn((1, 4, 64, 64), generator=generator, dtype=dtype) - vae_scale_factor = 8 - image = torch.randn((1, 3, 64 * vae_scale_factor, 64 * vae_scale_factor), generator=generator, dtype=dtype) - inputs = { - "prompt": "a photograph of an astronaut riding a horse", - "latents": latents, - "generator": generator, - "num_inference_steps": 50, - "guidance_scale": 7.5, - "output_type": "numpy", - "image": image, - } - return inputs - - def test_stable_diffusion_controlnet_ddim(self): - sd_pipe = StableDiffusionControlNetPipeline.from_pretrained(self.model_id, safety_checker=None) - sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - - # print(image_slice) - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array( - [1.0, 0.9598756, 0.8430315, 0.9999685, 0.9130426, 0.8025453, 0.87997377, 0.8080752, 0.7180274] + def test_canny(self): + controlnet = ControlNetModel.from_pretrained( + "fusing/stable-diffusion-v1-5-controlnet-seg", torch_dtype=torch.float16 ) - assert np.abs(image_slice - expected_slice).max() < 1e-4 - - def test_stable_diffusion_controlnet_lms(self): - sd_pipe = StableDiffusionControlNetPipeline.from_pretrained(self.model_id, safety_checker=None) - sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - # print(image_slice) - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array( - [1.0, 0.9631732, 0.84487236, 1.0, 0.914418, 0.8033508, 0.88200307, 0.809505, 0.7186936] - ) - assert np.abs(image_slice - expected_slice).max() < 1e-4 - - def test_stable_diffusion_controlnet_dpm(self): - sd_pipe = StableDiffusionControlNetPipeline.from_pretrained(self.model_id, safety_checker=None) - sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - - # print(image_slice) - - assert 
image.shape == (1, 512, 512, 3) - expected_slice = np.array( - [1.0, 0.9627134, 0.8445909, 1.0, 0.9132767, 0.8025819, 0.88159156, 0.8089917, 0.71824443] + pipe = StableDiffusionControlNetPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, safety_checker=None, controlnet=controlnet ) - assert np.abs(image_slice - expected_slice).max() < 1e-4 - - def test_stable_diffusion_controlnet_attention_slicing(self): - torch.cuda.reset_peak_memory_stats() - pipe = StableDiffusionControlNetPipeline.from_pretrained(self.model_id, torch_dtype=torch.float16) pipe = pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) - # enable attention slicing - pipe.enable_attention_slicing() - inputs = self.get_inputs(torch_device, dtype=torch.float16) - image_sliced = pipe(**inputs).images - - mem_bytes = torch.cuda.max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - # make sure that less than 3.75 GB is allocated - assert mem_bytes < 3.75 * 10**9 + self.controlnet_memsize / 2 - - # disable slicing - pipe.disable_attention_slicing() - inputs = self.get_inputs(torch_device, dtype=torch.float16) - image = pipe(**inputs).images - - # make sure that more than 3.75 GB is allocated - mem_bytes = torch.cuda.max_memory_allocated() - assert mem_bytes > 3.75 * 10**9 + self.controlnet_memsize / 2 - assert np.abs(image_sliced - image).max() < 1e-3 - - def test_stable_diffusion_vae_slicing(self): - torch.cuda.reset_peak_memory_stats() - pipe = StableDiffusionControlNetPipeline.from_pretrained(self.model_id, torch_dtype=torch.float16) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - # enable vae slicing - pipe.enable_vae_slicing() - inputs = self.get_inputs(torch_device, dtype=torch.float16) - inputs["prompt"] = [inputs["prompt"]] * 4 - inputs["latents"] = torch.cat([inputs["latents"]] * 4) - image_sliced = pipe(**inputs).images - - mem_bytes = torch.cuda.max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - # make sure that less than 4 GB is allocated - assert mem_bytes < 4e9 + self.controlnet_memsize / 2 - - # disable vae slicing - pipe.disable_vae_slicing() - inputs = self.get_inputs(torch_device, dtype=torch.float16) - inputs["prompt"] = [inputs["prompt"]] * 4 - inputs["latents"] = torch.cat([inputs["latents"]] * 4) - image = pipe(**inputs).images - - # make sure that more than 4 GB is allocated - mem_bytes = torch.cuda.max_memory_allocated() - assert mem_bytes > 4e9 + self.controlnet_memsize / 2 - # There is a small discrepancy at the image borders vs. a fully batched version. - assert np.abs(image_sliced - image).max() < 1e-2 - - def test_stable_diffusion_fp16_vs_autocast(self): - # this test makes sure that the original model with autocast - # and the new model with fp16 yield the same result - pipe = StableDiffusionControlNetPipeline.from_pretrained(self.model_id, torch_dtype=torch.float16) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device, dtype=torch.float16) - image_fp16 = pipe(**inputs).images - - with torch.autocast(torch_device): - inputs = self.get_inputs(torch_device) - image_autocast = pipe(**inputs).images - - # Make sure results are close enough - diff = np.abs(image_fp16.flatten() - image_autocast.flatten()) - # They ARE different since ops are not run always at the same precision - # however, they should be extremely close. 
- assert diff.mean() < 2e-2 - - def test_stable_diffusion_controlnet_intermediate_state(self): - number_of_steps = 0 - - def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: - callback_fn.has_been_called = True - nonlocal number_of_steps - number_of_steps += 1 - if step == 1: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 64, 64) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([-1.981, 1.052, -1.0625, -0.01709, -1.138, -0.592, -0.372, 0.332, 0.845]) - # print(latents_slice.flatten()) - assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2 - elif step == 2: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 64, 64) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([-2.043, 1.113, -1.138, 0.062, -1.133, -0.614, -0.3901, 0.352, 0.8667]) - # print(latents_slice.flatten()) - assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2 - - callback_fn.has_been_called = False - - pipe = StableDiffusionControlNetPipeline.from_pretrained(self.model_id, torch_dtype=torch.float16) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs(torch_device, dtype=torch.float16) - pipe(**inputs, callback=callback_fn, callback_steps=1) - assert callback_fn.has_been_called - assert number_of_steps == inputs["num_inference_steps"] - - def test_stable_diffusion_low_cpu_mem_usage(self): - pipeline_id = self.model_id - - start_time = time.time() - pipeline_low_cpu_mem_usage = StableDiffusionControlNetPipeline.from_pretrained( - pipeline_id, torch_dtype=torch.float16 + generator = torch.Generator(device="cpu").manual_seed(0) + prompt = "pig with skyscrapers in background" + image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/pig_canny.png" ) - pipeline_low_cpu_mem_usage.to(torch_device) - low_cpu_mem_usage_time = time.time() - start_time - - start_time = time.time() - _ = StableDiffusionControlNetPipeline.from_pretrained( - pipeline_id, torch_dtype=torch.float16, low_cpu_mem_usage=False - ) - normal_load_time = time.time() - start_time - - assert 2 * low_cpu_mem_usage_time < normal_load_time - - def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self): - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - - pipe = StableDiffusionControlNetPipeline.from_pretrained(self.model_id, torch_dtype=torch.float16) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing(1) - pipe.enable_sequential_cpu_offload() - inputs = self.get_inputs(torch_device, dtype=torch.float16) - _ = pipe(**inputs) + output = pipe(prompt, image, generator=generator, output_type="np") - mem_bytes = torch.cuda.max_memory_allocated() - # make sure that less than 2.8 GB is allocated - assert mem_bytes < 2.8 * 10**9 + self.controlnet_memsize / 2 + image = output.images[0] - def test_stable_diffusion_pipeline_with_model_offloading(self): - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() + assert image.shape == (512, 512, 3) - inputs = self.get_inputs(torch_device, dtype=torch.float16) + expected_image = load_numpy( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/pig_canny_out.npy" + ) - # Normal inference + assert np.abs(expected_image 
- image).max() < 1e-4 - pipe = StableDiffusionControlNetPipeline.from_pretrained( - self.model_id, - torch_dtype=torch.float16, - ) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - outputs = pipe(**inputs) - mem_bytes = torch.cuda.max_memory_allocated() + def test_depth(self): + ... - # With model offloading + def test_hed(self): + ... - # Reload but don't move to cuda - pipe = StableDiffusionControlNetPipeline.from_pretrained( - self.model_id, - torch_dtype=torch.float16, - ) + def test_mlsd(self): + ... - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() + def test_normal(self): + ... - pipe.enable_model_cpu_offload() - pipe.set_progress_bar_config(disable=None) - outputs_offloaded = pipe(**inputs) - mem_bytes_offloaded = torch.cuda.max_memory_allocated() + def test_openpose(self): + ... - assert np.abs(outputs.images - outputs_offloaded.images).max() < 1e-3 - assert mem_bytes_offloaded < mem_bytes - assert mem_bytes_offloaded < 3.5 * 10**9 + self.controlnet_memsize / 2 - for module in pipe.text_encoder, pipe.unet, pipe.vae, pipe.safety_checker: - assert module.device == torch.device("cpu") + def test_scribble(self): + ... - # With attention slicing - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - - pipe.enable_attention_slicing() - _ = pipe(**inputs) - mem_bytes_slicing = torch.cuda.max_memory_allocated() - - assert mem_bytes_slicing < mem_bytes_offloaded - assert mem_bytes_slicing < 3 * 10**9 + self.controlnet_memsize / 2 - - def test_stable_diffusion_no_safety_checker(self): - pipe = StableDiffusionControlNetPipeline.from_pretrained(self.model_id, safety_checker=None) - assert isinstance(pipe, StableDiffusionControlNetPipeline) - assert isinstance(pipe.scheduler, DDIMScheduler) - assert pipe.safety_checker is None - - image = pipe("example prompt", num_inference_steps=2).images[0] - assert image is not None - - # check that there's no error when saving a pipeline with one of the models being None - with tempfile.TemporaryDirectory() as tmpdirname: - pipe.save_pretrained(tmpdirname) - pipe = StableDiffusionControlNetPipeline.from_pretrained(tmpdirname) - - # sanity check that the pipeline still works - assert pipe.safety_checker is None - image = pipe("example prompt", num_inference_steps=2).images[0] - assert image is not None + def test_seg(self): + ... 
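Note on usage: the integration test added above also documents the loading pattern used from this point in the series on — the ControlNet weights are loaded as a standalone `ControlNetModel` and handed to `StableDiffusionControlNetPipeline` as a component. A minimal sketch of that pattern, reusing only identifiers that appear in the test (the `fusing/stable-diffusion-v1-5-controlnet-canny` checkpoint, the `pig_canny.png` fixture and the "pig in barn" prompt); moving to `"cuda"` and the output filename are assumptions for illustration, not part of the patch:

```python
import torch

from diffusers import ControlNetModel, StableDiffusionControlNetPipeline
from diffusers.utils import load_image

# Load the ControlNet weights on their own, then pass them to the pipeline as a component.
controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-canny")
pipe = StableDiffusionControlNetPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", controlnet=controlnet, safety_checker=None
)
pipe = pipe.to("cuda")  # assumed; the test uses the detected torch_device

# The control image is the canny-edge fixture used by test_canny.
control_image = load_image(
    "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/pig_canny.png"
)

generator = torch.Generator(device="cpu").manual_seed(0)
image = pipe("pig in barn", control_image, generator=generator).images[0]
image.save("pig_in_barn.png")  # hypothetical output path
```
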
From d1cd65a6ead5a68f089ace4285b9e14c47640bf3 Mon Sep 17 00:00:00 2001 From: Will Berman Date: Wed, 1 Mar 2023 08:16:28 +0000 Subject: [PATCH 087/122] test_canny fixes --- .../stable_diffusion/test_stable_diffusion_controlnet.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py index ff717d2574df..22a813c36144 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py @@ -148,18 +148,16 @@ def tearDown(self): torch.cuda.empty_cache() def test_canny(self): - controlnet = ControlNetModel.from_pretrained( - "fusing/stable-diffusion-v1-5-controlnet-seg", torch_dtype=torch.float16 - ) + controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-canny") pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, safety_checker=None, controlnet=controlnet + "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet ) pipe = pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) generator = torch.Generator(device="cpu").manual_seed(0) - prompt = "pig with skyscrapers in background" + prompt = "pig in barn" image = load_image( "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/pig_canny.png" ) From 7eb43f1b736c07295c94daf254656ecad5201c6c Mon Sep 17 00:00:00 2001 From: Will Berman Date: Wed, 1 Mar 2023 08:51:42 +0000 Subject: [PATCH 088/122] [integration tests] test_depth --- .../test_stable_diffusion_controlnet.py | 26 ++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py index 22a813c36144..095a4fcd2f11 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py @@ -175,7 +175,31 @@ def test_canny(self): assert np.abs(expected_image - image).max() < 1e-4 def test_depth(self): - ... + controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-depth") + + pipe = StableDiffusionControlNetPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet, revision="non-ema" + ) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + + generator = torch.Generator(device="cpu").manual_seed(5) + prompt = "Stormtrooper's lecture" + image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/stormtrooper_depth.png" + ) + + output = pipe(prompt, image, generator=generator, output_type="pt") + + image = output.images[0] + + assert image.shape == (512, 512, 3) + + expected_image = load_numpy( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/stormtrooper_depth_out.npy" + ) + + assert np.abs(expected_image - image).max() < 1e-4 def test_hed(self): ... 
From 032d5e0999bfe03b03a29dbae531a58e954de4f5 Mon Sep 17 00:00:00 2001 From: Will Berman Date: Wed, 1 Mar 2023 09:14:09 +0000 Subject: [PATCH 089/122] [integration tests] test_hed --- .../test_stable_diffusion_controlnet.py | 34 ++++++++++++++++--- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py index 095a4fcd2f11..99e96d270c0b 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py @@ -178,18 +178,18 @@ def test_depth(self): controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-depth") pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet, revision="non-ema" + "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet ) pipe = pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) - generator = torch.Generator(device="cpu").manual_seed(5) + generator = torch.Generator(device="cpu").manual_seed(0) prompt = "Stormtrooper's lecture" image = load_image( "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/stormtrooper_depth.png" ) - output = pipe(prompt, image, generator=generator, output_type="pt") + output = pipe(prompt, image, generator=generator, output_type="np") image = output.images[0] @@ -199,10 +199,34 @@ def test_depth(self): "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/stormtrooper_depth_out.npy" ) - assert np.abs(expected_image - image).max() < 1e-4 + assert np.abs(expected_image - image).max() < 0e-4 def test_hed(self): - ... + controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-hed") + + pipe = StableDiffusionControlNetPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet + ) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + + generator = torch.Generator(device="cpu").manual_seed(0) + prompt = "oil painting of handsome old man, masterpiece" + image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/man_hed.png" + ) + + output = pipe(prompt, image, generator=generator, height=768, output_type="np") + + image = output.images[0] + + assert image.shape == (768, 512, 3) + + expected_image = load_numpy( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/man_hed_out.npy" + ) + + assert np.abs(expected_image - image).max() < 1e-4 def test_mlsd(self): ... 
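Each of these slow tests ends with the same check: the generated array is compared element-wise against a reference `.npy` stored on the Hub. A self-contained sketch of just that final step — the copied array stands in for a real generation so the snippet runs on its own:

```python
import numpy as np
from diffusers.utils import load_numpy  # the same helper the tests use

expected_image = load_numpy(
    "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/man_hed_out.npy"
)

# Stand-in for output.images[0] from a pipeline call with output_type="np":
# a float array of shape (height, width, 3) with values in [0, 1]. Swap in a real
# generation to exercise the check end to end.
image = expected_image.copy()

assert image.shape == expected_image.shape
assert np.abs(expected_image - image).max() < 1e-4  # per-pixel tolerance used throughout this file
```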
From 5c7dbb3a28ca02aaaadb432474519eec55864319 Mon Sep 17 00:00:00 2001 From: Will Berman Date: Wed, 1 Mar 2023 09:38:35 +0000 Subject: [PATCH 090/122] [integration tests] test_mlsd --- .../test_stable_diffusion_controlnet.py | 26 ++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py index 99e96d270c0b..3ff8850045a5 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py @@ -229,7 +229,31 @@ def test_hed(self): assert np.abs(expected_image - image).max() < 1e-4 def test_mlsd(self): - ... + controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-mlsd") + + pipe = StableDiffusionControlNetPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet + ) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + + generator = torch.Generator(device="cpu").manual_seed(0) + prompt = "room" + image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/room_mlsd.png" + ) + + output = pipe(prompt, image, generator=generator, height=768, output_type="np") + + image = output.images[0] + + assert image.shape == (768, 512, 3) + + expected_image = load_numpy( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/room_mlsd_out.npy" + ) + + assert np.abs(expected_image - image).max() < 1e-4 def test_normal(self): ... From cdbc7c449f92e3fa9b19b75647e661a75cb4d6e2 Mon Sep 17 00:00:00 2001 From: Will Berman Date: Wed, 1 Mar 2023 10:01:26 +0000 Subject: [PATCH 091/122] add channel order config to controlnet --- src/diffusers/models/controlnet.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/diffusers/models/controlnet.py b/src/diffusers/models/controlnet.py index e034910724cc..6dffbf867ce1 100644 --- a/src/diffusers/models/controlnet.py +++ b/src/diffusers/models/controlnet.py @@ -106,6 +106,7 @@ def __init__( upcast_attention: bool = False, resnet_time_scale_shift: str = "default", projection_class_embeddings_input_dim: Optional[int] = None, + controlnet_conditioning_channel_order: str = "rgb", ): super().__init__() @@ -382,6 +383,17 @@ def forward( cross_attention_kwargs: Optional[Dict[str, Any]] = None, return_dict: bool = True, ) -> Union[ControlNetOutput, Tuple]: + # check channel order + channel_order = self.config.controlnet_conditioning_channel_order + + if channel_order == "rgb": + # in rgb order by default + ... 
+ elif channel_order == "bgr": + controlnet_cond = torch.flip(controlnet_cond, dims=[1]) + else: + raise ValueError(f"unknown `controlnet_conditioning_channel_order`: {channel_order}") + # prepare attention_mask if attention_mask is not None: attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0 From 86c16843b0effe8cf4320116d4fdfd386778be9e Mon Sep 17 00:00:00 2001 From: Will Berman Date: Wed, 1 Mar 2023 10:01:41 +0000 Subject: [PATCH 092/122] [integration tests] test normal --- .../test_stable_diffusion_controlnet.py | 26 ++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py index 3ff8850045a5..52e9d952736f 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py @@ -256,7 +256,31 @@ def test_mlsd(self): assert np.abs(expected_image - image).max() < 1e-4 def test_normal(self): - ... + controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-normal") + + pipe = StableDiffusionControlNetPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet + ) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + + generator = torch.Generator(device="cpu").manual_seed(0) + prompt = "cute toy" + image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/cute_toy_normal.png" + ) + + output = pipe(prompt, image, generator=generator, output_type="np") + + image = output.images[0] + + assert image.shape == (512, 512, 3) + + expected_image = load_numpy( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/cute_toy_normal_out.npy" + ) + + assert np.abs(expected_image - image).max() < 1e-4 def test_openpose(self): ... From a18fc70ce489e7557cc034052cd153452261cda2 Mon Sep 17 00:00:00 2001 From: Will Berman Date: Wed, 1 Mar 2023 10:35:21 +0000 Subject: [PATCH 093/122] [integration tests] test_openpose test_scribble --- .../test_stable_diffusion_controlnet.py | 52 ++++++++++++++++++- 1 file changed, 50 insertions(+), 2 deletions(-) diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py index 52e9d952736f..996608a18739 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py @@ -283,10 +283,58 @@ def test_normal(self): assert np.abs(expected_image - image).max() < 1e-4 def test_openpose(self): - ... 
+ controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-openpose") + + pipe = StableDiffusionControlNetPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet + ) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + + generator = torch.Generator(device="cpu").manual_seed(0) + prompt = "Chef in the kitchen" + image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/pose.png" + ) + + output = pipe(prompt, image, generator=generator, height=768, output_type="np") + + image = output.images[0] + + assert image.shape == (768, 512, 3) + + expected_image = load_numpy( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/chef_pose_out.npy" + ) + + assert np.abs(expected_image - image).max() < 1e-4 def test_scribble(self): - ... + controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-scribble") + + pipe = StableDiffusionControlNetPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet + ) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + + generator = torch.Generator(device="cpu").manual_seed(5) + prompt = "bag" + image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bag_scribble.png" + ) + + output = pipe(prompt, image, generator=generator, height=640, output_type="np") + + image = output.images[0] + + assert image.shape == (640, 512, 3) + + expected_image = load_numpy( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bag_scribble_out.npy" + ) + + assert np.abs(expected_image - image).max() < 1e-4 def test_seg(self): ... 
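Patch 091 above threads a `controlnet_conditioning_channel_order` option into `ControlNetModel.forward`: `"rgb"` leaves the conditioning tensor untouched, while `"bgr"` reverses the channel axis with `torch.flip`, presumably so a checkpoint trained on BGR conditioning receives the layout it expects. A small sketch of what that flip does to a synthetic conditioning batch:

```python
import torch

# Toy conditioning batch: shape (batch, channels, height, width), channels ordered R, G, B.
controlnet_cond = torch.zeros(1, 3, 4, 4)
controlnet_cond[:, 0] = 1.0  # put all intensity into the red channel

# What the forward pass does when controlnet_conditioning_channel_order == "bgr":
flipped = torch.flip(controlnet_cond, dims=[1])

print(controlnet_cond[0, :, 0, 0])  # tensor([1., 0., 0.]) -> red first (RGB)
print(flipped[0, :, 0, 0])          # tensor([0., 0., 1.]) -> red last (BGR)
```

With the default `"rgb"` order the forward pass leaves the tensor as-is.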
From 9ec6ad4146e87e41198dc2ad52da70bb3354abcc Mon Sep 17 00:00:00 2001 From: Will Berman Date: Wed, 1 Mar 2023 11:09:33 +0000 Subject: [PATCH 094/122] change height and width to default to conditioning image --- .../pipeline_stable_diffusion_controlnet.py | 27 ++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py index c2c9c4f4f723..d26ac434f897 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py @@ -515,6 +515,28 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype latents = latents * self.scheduler.init_noise_sigma return latents + def _default_height_width(self, height, width, image): + if isinstance(image, list): + image = image[0] + + if height is None: + if isinstance(image, PIL.Image.Image): + height = image.height + elif isinstance(image, torch.Tensor): + height = image.shape[3] + + height = (height + 7) & (-8) # round to nearest multiple of 8 + + if width is None: + if isinstance(image, PIL.Image.Image): + width = image.width + elif isinstance(image, torch.Tensor): + width = image.shape[2] + + width = (width + 7) & (-8) # round to nearest multiple of 8 + + return height, width + @torch.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( @@ -615,9 +637,8 @@ def __call__( list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw) content, according to the `safety_checker`. """ - # 0. Default height and width to unet - height = height or self.unet.config.sample_size * self.vae_scale_factor - width = width or self.unet.config.sample_size * self.vae_scale_factor + # 0. Default height and width to image + height, width = self._default_height_width(height, width, image) # 1. Check inputs. 
Raise error if not correct self.check_inputs( From 8fd8e427547e81cf082ed895818afc7c117f8a0d Mon Sep 17 00:00:00 2001 From: Will Berman Date: Wed, 1 Mar 2023 11:09:55 +0000 Subject: [PATCH 095/122] [integration tests] test seg --- .../test_stable_diffusion_controlnet.py | 34 ++++++++++++++++--- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py index 996608a18739..b54cf0b0a25f 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py @@ -216,7 +216,7 @@ def test_hed(self): "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/man_hed.png" ) - output = pipe(prompt, image, generator=generator, height=768, output_type="np") + output = pipe(prompt, image, generator=generator, output_type="np") image = output.images[0] @@ -243,7 +243,7 @@ def test_mlsd(self): "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/room_mlsd.png" ) - output = pipe(prompt, image, generator=generator, height=768, output_type="np") + output = pipe(prompt, image, generator=generator, output_type="np") image = output.images[0] @@ -297,7 +297,7 @@ def test_openpose(self): "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/pose.png" ) - output = pipe(prompt, image, generator=generator, height=768, output_type="np") + output = pipe(prompt, image, generator=generator, output_type="np") image = output.images[0] @@ -324,7 +324,7 @@ def test_scribble(self): "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bag_scribble.png" ) - output = pipe(prompt, image, generator=generator, height=640, output_type="np") + output = pipe(prompt, image, generator=generator, output_type="np") image = output.images[0] @@ -337,4 +337,28 @@ def test_scribble(self): assert np.abs(expected_image - image).max() < 1e-4 def test_seg(self): - ... 
+ controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-seg") + + pipe = StableDiffusionControlNetPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet + ) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + + generator = torch.Generator(device="cpu").manual_seed(5) + prompt = "house" + image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/house_seg.png" + ) + + output = pipe(prompt, image, generator=generator, output_type="np") + + image = output.images[0] + + assert image.shape == (512, 512, 3) + + expected_image = load_numpy( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/house_seg_out.npy" + ) + + assert np.abs(expected_image - image).max() < 1e-4 From 7c35fc77d518eeb1b1eda3a4dae0a4282cac617e Mon Sep 17 00:00:00 2001 From: Will Berman Date: Wed, 1 Mar 2023 11:10:17 +0000 Subject: [PATCH 096/122] style --- .../stable_diffusion/pipeline_stable_diffusion_controlnet.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py index d26ac434f897..c56b0b2cf063 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py @@ -525,7 +525,7 @@ def _default_height_width(self, height, width, image): elif isinstance(image, torch.Tensor): height = image.shape[3] - height = (height + 7) & (-8) # round to nearest multiple of 8 + height = (height + 7) & (-8) # round to nearest multiple of 8 if width is None: if isinstance(image, PIL.Image.Image): @@ -533,7 +533,7 @@ def _default_height_width(self, height, width, image): elif isinstance(image, torch.Tensor): width = image.shape[2] - width = (width + 7) & (-8) # round to nearest multiple of 8 + width = (width + 7) & (-8) # round to nearest multiple of 8 return height, width From e20079710b5a8acf51950c604d26b0197ad451f3 Mon Sep 17 00:00:00 2001 From: Will Berman Date: Wed, 1 Mar 2023 11:15:28 +0000 Subject: [PATCH 097/122] test_depth fix --- .../stable_diffusion/test_stable_diffusion_controlnet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py index b54cf0b0a25f..d8ec54a1e9c7 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py @@ -199,7 +199,7 @@ def test_depth(self): "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/stormtrooper_depth_out.npy" ) - assert np.abs(expected_image - image).max() < 0e-4 + assert np.abs(expected_image - image).max() < 1e-4 def test_hed(self): controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-hed") From e6973eb78f739e4b617ad42192aa6ae3d5c14a5e Mon Sep 17 00:00:00 2001 From: Will Berman Date: Wed, 1 Mar 2023 11:43:14 +0000 Subject: [PATCH 098/122] [integration tests] size fixes --- .../pipeline_stable_diffusion_controlnet.py | 4 ++-- .../test_stable_diffusion_controlnet.py | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git 
a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py index c56b0b2cf063..cc3c856824d7 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py @@ -525,7 +525,7 @@ def _default_height_width(self, height, width, image): elif isinstance(image, torch.Tensor): height = image.shape[3] - height = (height + 7) & (-8) # round to nearest multiple of 8 + height = (height // 8) * 8 # round down to nearest multiple of 8 if width is None: if isinstance(image, PIL.Image.Image): @@ -533,7 +533,7 @@ def _default_height_width(self, height, width, image): elif isinstance(image, torch.Tensor): width = image.shape[2] - width = (width + 7) & (-8) # round to nearest multiple of 8 + width = (width // 8) * 8 # round down to nearest multiple of 8 return height, width diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py index d8ec54a1e9c7..900c07b6b465 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py @@ -157,19 +157,19 @@ def test_canny(self): pipe.set_progress_bar_config(disable=None) generator = torch.Generator(device="cpu").manual_seed(0) - prompt = "pig in barn" + prompt = "bird" image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/pig_canny.png" + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png" ) output = pipe(prompt, image, generator=generator, output_type="np") image = output.images[0] - assert image.shape == (512, 512, 3) + assert image.shape == (768, 512, 3) expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/pig_canny_out.npy" + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny_out.npy" ) assert np.abs(expected_image - image).max() < 1e-4 @@ -220,7 +220,7 @@ def test_hed(self): image = output.images[0] - assert image.shape == (768, 512, 3) + assert image.shape == (704, 512, 3) expected_image = load_numpy( "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/man_hed_out.npy" @@ -247,7 +247,7 @@ def test_mlsd(self): image = output.images[0] - assert image.shape == (768, 512, 3) + assert image.shape == (704, 512, 3) expected_image = load_numpy( "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/room_mlsd_out.npy" From 0ba19dac4765a8c07dc104c2948f79ec846d70ec Mon Sep 17 00:00:00 2001 From: Will Berman Date: Wed, 1 Mar 2023 11:49:20 +0000 Subject: [PATCH 099/122] [integration tests] cpu offloading --- .../test_stable_diffusion_controlnet.py | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py index 900c07b6b465..1ff288bcc377 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py @@ -362,3 +362,34 @@ def test_seg(self): ) assert np.abs(expected_image - image).max() < 1e-4 
+ + def test_sequential_cpu_offloading(self): + torch.cuda.empty_cache() + torch.cuda.reset_max_memory_allocated() + torch.cuda.reset_peak_memory_stats() + + controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-seg") + + pipe = StableDiffusionControlNetPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet + ) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + pipe.enable_attention_slicing() + pipe.enable_sequential_cpu_offload() + + prompt = "house" + image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/house_seg.png" + ) + + _ = pipe( + prompt, + image, + num_inference_steps=2, + output_type="np", + ) + + mem_bytes = torch.cuda.max_memory_allocated() + # make sure that less than 7 GB is allocated + assert mem_bytes < 7 * 10**9 \ No newline at end of file From 1a803d1d80fa127d2a36b600ce264335c85860a0 Mon Sep 17 00:00:00 2001 From: Will Berman Date: Wed, 1 Mar 2023 11:49:45 +0000 Subject: [PATCH 100/122] style --- .../stable_diffusion/test_stable_diffusion_controlnet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py index 1ff288bcc377..6da8585fc0d3 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py @@ -392,4 +392,4 @@ def test_sequential_cpu_offloading(self): mem_bytes = torch.cuda.max_memory_allocated() # make sure that less than 7 GB is allocated - assert mem_bytes < 7 * 10**9 \ No newline at end of file + assert mem_bytes < 7 * 10**9 From 8dea9c70a4328203b12783ea4d0ba1187322e654 Mon Sep 17 00:00:00 2001 From: Will Berman Date: Wed, 1 Mar 2023 12:30:28 +0000 Subject: [PATCH 101/122] generalize controlnet embedding --- src/diffusers/models/controlnet.py | 52 ++++++++++++------- .../stable_diffusion/convert_from_ckpt.py | 13 +++-- .../test_stable_diffusion_controlnet.py | 3 +- 3 files changed, 43 insertions(+), 25 deletions(-) diff --git a/src/diffusers/models/controlnet.py b/src/diffusers/models/controlnet.py index 6dffbf867ce1..b7a0406b0c1d 100644 --- a/src/diffusers/models/controlnet.py +++ b/src/diffusers/models/controlnet.py @@ -16,6 +16,7 @@ import torch from torch import nn +from torch.nn import functional as F from ..configuration_utils import ConfigMixin, register_to_config from ..utils import BaseOutput, logging @@ -39,7 +40,7 @@ class ControlNetOutput(BaseOutput): mid_block_res_sample: torch.Tensor -class ControlNetConditioningDefaultEmbedding(nn.Module): +class ControlNetConditioningEmbedding(nn.Module): """ "Stable Diffusion uses a pre-processing method similar to VQ-GAN [11] to convert the entire dataset of 512 × 512 images into smaller 64 × 64 “latent images” for stabilized training. This requires ControlNets to convert @@ -49,29 +50,38 @@ class ControlNetConditioningDefaultEmbedding(nn.Module): feature maps ..." 
""" - def __init__(self, conditioning_embedding_channels: int, conditioning_channels: int = 3): + def __init__( + self, + conditioning_embedding_channels: int, + conditioning_channels: int = 3, + block_out_channels: Tuple[int] = (16, 32, 96, 256), + ): super().__init__() - self.conditioning_embedder = nn.Sequential( - nn.Conv2d(conditioning_channels, 16, kernel_size=3, padding=1), - nn.SiLU(), - nn.Conv2d(16, 16, kernel_size=3, padding=1), - nn.SiLU(), - nn.Conv2d(16, 32, kernel_size=3, padding=1, stride=2), - nn.SiLU(), - nn.Conv2d(32, 32, kernel_size=3, padding=1), - nn.SiLU(), - nn.Conv2d(32, 96, kernel_size=3, padding=1, stride=2), - nn.SiLU(), - nn.Conv2d(96, 96, kernel_size=3, padding=1), - nn.SiLU(), - nn.Conv2d(96, 256, kernel_size=3, padding=1, stride=2), - nn.SiLU(), - zero_module(nn.Conv2d(256, conditioning_embedding_channels, kernel_size=3, padding=1)), + self.conv_in = nn.Conv2d(conditioning_channels, block_out_channels[0], kernel_size=3, padding=1) + + self.blocks = nn.ModuleList([]) + + for i in range(len(block_out_channels) - 1): + channel_in = block_out_channels[i] + channel_out = block_out_channels[i + 1] + self.blocks.append(nn.Conv2d(channel_in, channel_in, kernel_size=3, padding=1)) + self.blocks.append(nn.Conv2d(channel_in, channel_out, kernel_size=3, padding=1, stride=2)) + + self.conv_out = zero_module( + nn.Conv2d(block_out_channels[-1], conditioning_embedding_channels, kernel_size=3, padding=1) ) def forward(self, conditioning): - embedding = self.conditioning_embedder(conditioning) + embedding = self.conv_in(conditioning) + embedding = F.silu(embedding) + + for block in self.blocks: + embedding = block(embedding) + embedding = F.silu(embedding) + + embedding = self.conv_out(embedding) + return embedding @@ -107,6 +117,7 @@ def __init__( resnet_time_scale_shift: str = "default", projection_class_embeddings_input_dim: Optional[int] = None, controlnet_conditioning_channel_order: str = "rgb", + conditioning_embedding_out_channels: Optional[Tuple[int]] = (16, 32, 96, 256), ): super().__init__() @@ -169,8 +180,9 @@ def __init__( self.class_embedding = None # control net conditioning embedding - self.controlnet_cond_embedding = ControlNetConditioningDefaultEmbedding( + self.controlnet_cond_embedding = ControlNetConditioningEmbedding( conditioning_embedding_channels=block_out_channels[0], + block_out_channels=conditioning_embedding_out_channels, ) self.down_blocks = nn.ModuleList([]) diff --git a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py index 481c69f6749e..837db66929a2 100644 --- a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +++ b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py @@ -527,12 +527,17 @@ def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False if controlnet: # conditioning embedding - # NOTE: `8` is hardcoded based off of the number of blocks in `ControlNetConditioningDefaultEmbedding` - for i in range(8): - new_checkpoint[f"controlnet_cond_embedding.conditioning_embedder.{i*2}.weight"] = unet_state_dict.pop( + new_checkpoint["controlnet_cond_embedding.conv_in.weight"] = unet_state_dict.pop("input_hint_block.0.weight") + new_checkpoint["controlnet_cond_embedding.conv_in.bias"] = unet_state_dict.pop("input_hint_block.0.bias") + + new_checkpoint["controlnet_cond_embedding.conv_out.weight"] = unet_state_dict.pop("input_hint_block.7.weight") + new_checkpoint["controlnet_cond_embedding.conv_out.bias"] = 
unet_state_dict.pop("input_hint_block.7.bias") + + for i in range(1, 7): + new_checkpoint[f"controlnet_cond_embedding.blocks.{i*2}.weight"] = unet_state_dict.pop( f"input_hint_block.{i*2}.weight" ) - new_checkpoint[f"controlnet_cond_embedding.conditioning_embedder.{i*2}.bias"] = unet_state_dict.pop( + new_checkpoint[f"controlnet_cond_embedding.blocks.{i*2}.bias"] = unet_state_dict.pop( f"input_hint_block.{i*2}.bias" ) diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py index 6da8585fc0d3..fa039ebc5a5a 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py @@ -56,6 +56,7 @@ def get_dummy_components(self): in_channels=4, down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), cross_attention_dim=32, + conditioning_embedding_out_channels=(16, 32), ) torch.manual_seed(0) scheduler = DDIMScheduler( @@ -107,7 +108,7 @@ def get_dummy_inputs(self, device, seed=0): else: generator = torch.Generator(device=device).manual_seed(seed) - controlnet_embedder_scale_factor = 8 + controlnet_embedder_scale_factor = 2 image = randn_tensor( (1, 3, 32 * controlnet_embedder_scale_factor, 32 * controlnet_embedder_scale_factor), generator=generator, From 60e36350cabb220ce3315803aa53aed9895e1e35 Mon Sep 17 00:00:00 2001 From: Will Berman Date: Wed, 1 Mar 2023 12:55:42 +0000 Subject: [PATCH 102/122] fix conversion script --- .../stable_diffusion/convert_from_ckpt.py | 35 ++++++++++++++----- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py index 837db66929a2..a46121f0c9cd 100644 --- a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +++ b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py @@ -527,19 +527,36 @@ def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False if controlnet: # conditioning embedding - new_checkpoint["controlnet_cond_embedding.conv_in.weight"] = unet_state_dict.pop("input_hint_block.0.weight") - new_checkpoint["controlnet_cond_embedding.conv_in.bias"] = unet_state_dict.pop("input_hint_block.0.bias") - new_checkpoint["controlnet_cond_embedding.conv_out.weight"] = unet_state_dict.pop("input_hint_block.7.weight") - new_checkpoint["controlnet_cond_embedding.conv_out.bias"] = unet_state_dict.pop("input_hint_block.7.bias") + orig_index = 0 - for i in range(1, 7): - new_checkpoint[f"controlnet_cond_embedding.blocks.{i*2}.weight"] = unet_state_dict.pop( - f"input_hint_block.{i*2}.weight" + new_checkpoint["controlnet_cond_embedding.conv_in.weight"] = unet_state_dict.pop( + f"input_hint_block.{orig_index}.weight" + ) + new_checkpoint["controlnet_cond_embedding.conv_in.bias"] = unet_state_dict.pop( + f"input_hint_block.{orig_index}.bias" + ) + + orig_index += 2 + + diffusers_index = 0 + + while diffusers_index < 6: + new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_index}.weight"] = unet_state_dict.pop( + f"input_hint_block.{orig_index}.weight" ) - new_checkpoint[f"controlnet_cond_embedding.blocks.{i*2}.bias"] = unet_state_dict.pop( - f"input_hint_block.{i*2}.bias" + new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_index}.bias"] = unet_state_dict.pop( + f"input_hint_block.{orig_index}.bias" ) + diffusers_index += 1 + orig_index += 2 + + new_checkpoint["controlnet_cond_embedding.conv_out.weight"] = 
unet_state_dict.pop( + f"input_hint_block.{orig_index}.weight" + ) + new_checkpoint["controlnet_cond_embedding.conv_out.bias"] = unet_state_dict.pop( + f"input_hint_block.{orig_index}.bias" + ) # down blocks for i in range(num_input_blocks): From b15fca9cfd6d9cb15fe8777d7af657341bcce06c Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Wed, 1 Mar 2023 22:59:12 +0900 Subject: [PATCH 103/122] Update docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx Co-authored-by: Sayak Paul --- docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx b/docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx index de168a26c4e6..27cdd56f0969 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx +++ b/docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx @@ -29,6 +29,7 @@ The original codebase/paper can be found here: ## Available checkpoints ControlNet requires a *control image* in addition to the text-to-image *prompt*. +Each pretrained model is trained using a different conditioning method that requires different images for conditioning the generated outputs. For example, Canny edge conditioning requires the control image to be the output of a Canny filter, while depth conditioning requires the control image to be a depth map. See the overview and image examples below to know more. Each pretrained model is trained using a different conditioning method that requires different conditioning images. For example, Canny edge conditioning requires the control image to be the output of a Canny filter, while depth conditioning requires the control image to be a depth map. See the overview and image examples. From d7ed0b11504c4fc1af4af429ec9a6f09fd380298 Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Wed, 1 Mar 2023 23:00:11 +0900 Subject: [PATCH 104/122] Update docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx Co-authored-by: Sayak Paul --- docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx b/docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx index 27cdd56f0969..a23bbbc96538 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx +++ b/docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx @@ -60,7 +60,7 @@ All checkpoints are converted from [lllyasviel/ControlNet](https://huggingface.c The conditioning image is an outline of the image edges, as detected by a Canny filter. 
This is the example we'll use to control the generation: -![White on black edges detected on Vermeer's Girl with a Pearl Earring portrait](https://huggingface.co/takuma104/controlnet_dev/resolve/main/vermeer_canny_edged.png) +![White on black edges detected on Vermeer's Girl with a Pearl Earring portrait](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/vermeer_canny_edged.png) ```python from diffusers import StableDiffusionControlNetPipeline From 0ed0581bd74bd67a463b4daea68ca6892643b7da Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Wed, 1 Mar 2023 23:00:25 +0900 Subject: [PATCH 105/122] Update docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx Co-authored-by: Sayak Paul --- docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx b/docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx index a23bbbc96538..c71b08fd39d7 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx +++ b/docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx @@ -67,7 +67,7 @@ from diffusers import StableDiffusionControlNetPipeline from diffusers.utils import load_image # Canny edged image for control -canny_edged_image = load_image("https://huggingface.co/takuma104/controlnet_dev/resolve/main/vermeer_canny_edged.png") +canny_edged_image = load_image("https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/vermeer_canny_edged.png") pipe = StableDiffusionControlNetPipeline.from_pretrained("takuma104/control_sd15_canny").to("cuda") image = pipe(prompt="best quality, extremely detailed", image=canny_edged_image).images[0] image.save("generated.png") From 5e16d13a43f9c5b6bee5ed454d6497b1f1c226ab Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Wed, 1 Mar 2023 23:00:59 +0900 Subject: [PATCH 106/122] Update docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx Co-authored-by: Sayak Paul --- docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx b/docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx index c71b08fd39d7..64e6e57476d1 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx +++ b/docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx @@ -76,7 +76,8 @@ image.save("generated.png") Note that the text prompt does not make any reference to the structure or contents of the image we are generating. Stable Diffusion interprets the control image as an additional input that controls what to generate. - Controlling custom Stable Diffusion 1.5 models -In the following example we use PromptHero's [Openjourney model](https://huggingface.co/prompthero/openjourney), which was fine-tuned from the base Stable Diffusion v1.5 model on images from Midjourney. This model has the same structure as Stable Diffusion 1.5 but is capable of producing a different output style. +In the following example we use PromptHero's [Openjourney model](https://huggingface.co/prompthero/openjourney), which was fine-tuned from the base Stable Diffusion v1.5 model on images from Midjourney. This model has the same structure as Stable Diffusion 1.5 but is capable of producing outputs in a different style. 
+ ```py from diffusers import StableDiffusionControlNetPipeline, AutoencoderKL, UNet2DConditionModel from diffusers.utils import load_image From 06bb1db313d1e3b87d82d662ae02245a6894f30f Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Thu, 2 Mar 2023 00:13:22 +0900 Subject: [PATCH 107/122] Style adapted to the documentation of pix2pix --- .../pipelines/stable_diffusion/controlnet.mdx | 45 +++++++++---------- 1 file changed, 20 insertions(+), 25 deletions(-) diff --git a/docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx b/docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx index 64e6e57476d1..32fc2478b87f 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx +++ b/docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx @@ -12,26 +12,32 @@ specific language governing permissions and limitations under the License. # Text-to-Image Generation with ControlNet Conditioning -## StableDiffusionControlNetPipeline +## Overview -ControlNet by [@lllyasviel](https://huggingface.co/lllyasviel) is a neural network structure to control diffusion models by adding extra conditions. - -There are 8 pre-trained ControlNet models that were trained to condition the original Stable Diffusion model on different inputs, -such as edge detection, scribbles, depth maps, semantic segmentations and more. +[Adding Conditional Control to Text-to-Image Diffusion Models](https://arxiv.org/abs/2302.05543) by Lvmin Zhang and Maneesh Agrawala. Using the pretrained models we can provide control images (for example, a depth map) to control Stable Diffusion text-to-image generation so that it follows the structure of the depth image and fills in the details. -The original codebase/paper can be found here: -- [Code](https://github.com/lllyasviel/ControlNet) -- [Paper](https://arxiv.org/abs/2302.05543) +The abstract of the paper is the following: + +*We present a neural network structure, ControlNet, to control pretrained large diffusion models to support additional input conditions. The ControlNet learns task-specific conditions in an end-to-end way, and the learning is robust even when the training dataset is small (< 50k). Moreover, training a ControlNet is as fast as fine-tuning a diffusion model, and the model can be trained on a personal devices. Alternatively, if powerful computation clusters are available, the model can scale to large amounts (millions to billions) of data. We report that large diffusion models like Stable Diffusion can be augmented with ControlNets to enable conditional inputs like edge maps, segmentation maps, keypoints, etc. This may enrich the methods to control large diffusion models and further facilitate related applications.* + +Resources: + +* [Paper](https://arxiv.org/abs/2302.05543) +* [Original Code](https://github.com/lllyasviel/ControlNet) + +## Available Pipelines: +| Pipeline | Tasks | Demo +|---|---|:---:| +| [StableDiffusionControlNetPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py) | *Text-to-Image Generation with ControlNet Conditioning* | [Colab Example](https://colab.research.google.com/drive/1AiR7Q-sBqO88NCyswpfiuwXZc7DfMyKA?usp=sharing) | + ## Available checkpoints ControlNet requires a *control image* in addition to the text-to-image *prompt*. Each pretrained model is trained using a different conditioning method that requires different images for conditioning the generated outputs. 
For example, Canny edge conditioning requires the control image to be the output of a Canny filter, while depth conditioning requires the control image to be a depth map. See the overview and image examples below to know more. -Each pretrained model is trained using a different conditioning method that requires different conditioning images. For example, Canny edge conditioning requires the control image to be the output of a Canny filter, while depth conditioning requires the control image to be a depth map. -See the overview and image examples. All checkpoints are converted from [lllyasviel/ControlNet](https://huggingface.co/lllyasviel/ControlNet). @@ -48,12 +54,6 @@ All checkpoints are converted from [lllyasviel/ControlNet](https://huggingface.c |[takuma104/control_sd15_scribble](https://huggingface.co/takuma104/control_sd15_scribble)
*Trained with human scribbles* |A hand-drawn monochrome image with white outlines on a black background.|| | |[takuma104/control_sd15_seg](https://huggingface.co/takuma104/control_sd15_seg)
*Trained with semantic segmentation* |An [ADE20K](https://groups.csail.mit.edu/vision/datasets/ADE20K/)'s segmentation protocol image.|| | - -## Resources - -- [Colab Notebook Example](https://colab.research.google.com/drive/1AiR7Q-sBqO88NCyswpfiuwXZc7DfMyKA?usp=sharing) -- [controlnet_hinter](https://github.com/takuma104/controlnet_hinter): Image Preprocess Library for ControlNet - ## Usage example - Basic Example (Canny Edge) @@ -62,6 +62,8 @@ The conditioning image is an outline of the image edges, as detected by a Canny ![White on black edges detected on Vermeer's Girl with a Pearl Earring portrait](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/vermeer_canny_edged.png) +In the following example, note that the text prompt does not make any reference to the structure or contents of the image we are generating. Stable Diffusion interprets the control image as an additional input that controls what to generate. + ```python from diffusers import StableDiffusionControlNetPipeline from diffusers.utils import load_image @@ -73,7 +75,6 @@ image = pipe(prompt="best quality, extremely detailed", image=canny_edged_image) image.save("generated.png") ``` -Note that the text prompt does not make any reference to the structure or contents of the image we are generating. Stable Diffusion interprets the control image as an additional input that controls what to generate. - Controlling custom Stable Diffusion 1.5 models In the following example we use PromptHero's [Openjourney model](https://huggingface.co/prompthero/openjourney), which was fine-tuned from the base Stable Diffusion v1.5 model on images from Midjourney. This model has the same structure as Stable Diffusion 1.5 but is capable of producing outputs in a different style. @@ -83,7 +84,7 @@ from diffusers import StableDiffusionControlNetPipeline, AutoencoderKL, UNet2DCo from diffusers.utils import load_image # Canny edged image for control -canny_edged_image = load_image("https://huggingface.co/takuma104/controlnet_dev/resolve/main/vermeer_canny_edged.png") +canny_edged_image = load_image("https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/vermeer_canny_edged.png") base_model_id = "prompthero/openjourney" # an example: openjourney model vae = AutoencoderKL.from_pretrained(base_model_id, subfolder="vae").to("cuda") @@ -96,10 +97,4 @@ image.save("generated.png") [[autodoc]] StableDiffusionControlNetPipeline - all - - __call__ - - enable_attention_slicing - - disable_attention_slicing - - enable_vae_slicing - - disable_vae_slicing - - enable_xformers_memory_efficient_attention - - disable_xformers_memory_efficient_attention \ No newline at end of file + - __call__ \ No newline at end of file From 3981459a6a98951f56a414c0247ff4488ce40659 Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Thu, 2 Mar 2023 00:26:08 +0900 Subject: [PATCH 108/122] merge main by hand --- .../controlling_generation.mdx | 27 +++++++------------ 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/docs/source/en/using-diffusers/controlling_generation.mdx b/docs/source/en/using-diffusers/controlling_generation.mdx index 576afbd1c220..94d579905ebb 100644 --- a/docs/source/en/using-diffusers/controlling_generation.mdx +++ b/docs/source/en/using-diffusers/controlling_generation.mdx @@ -27,15 +27,15 @@ Depending on the use case, one should choose a technique accordingly. 
In many ca Unless otherwise mentioned, these are techniques that work with existing models and don't require their own weights. 1. [Instruct Pix2Pix](#instruct-pix2pix) -2. [Pix2Pix 0](#pix2pixzero) -3. [Attend and excite](#attend-and-excite) -4. [Semantic guidance](#semantic-guidance) -5. [Self attention guidance](#self-attention-guidance) -6. [Depth2image](#depth2image) -7. [DreamBooth](#dreambooth) -8. [Textual Inversion](#textual-inversion) -10. [MultiDiffusion Panorama](#panorama) -11. [ControlNet](#controlnet) +2. [Pix2Pix Zero](#pix2pixzero) +3. [Attend and Excite](#attend-and-excite) +4. [Semantic Guidance](#semantic-guidance) +5. [Self-attention Guidance](#self-attention-guidance) +6. [Depth2Image](#depth2image) +7. [MultiDiffusion Panorama](#multidiffusion-panorama) +8. [DreamBooth](#dreambooth) +9. [Textual Inversion](#textual-inversion) +10. [ControlNet](#controlnet) ## Instruct Pix2Pix @@ -148,15 +148,6 @@ See [here](../training/dreambooth) for more information on how to use it. See [here](../training/text_inversion) for more information on how to use it. -## MultiDiffusion Panorama - -[Paper](https://multidiffusion.github.io/) -[Demo](https://huggingface.co/spaces/weizmannscience/MultiDiffusion) -MultiDiffusion defines a new generation process over a pre-trained diffusion model. This process binds together multiple diffusion generation processes can be readily applied to generate high quality and diverse images that adhere to user-provided controls, such as desired aspect ratio (e.g., panorama), and spatial guiding signals, ranging from tight segmentation masks to bounding boxes. -[MultiDiffusion Panorama](../api/pipelines/stable_diffusion/panorama) allows to generate high-quality images at arbitrary aspect ratios (e.g., panoramas). - -See [here](../api/pipelines/stable_diffusion/panorama) for more information on how to use it to generate panoramic images. 
- ## ControlNet [Paper](https://arxiv.org/abs/2302.05543) From 0810e4ca162e2c7eb66555b2e22b91f1e899a257 Mon Sep 17 00:00:00 2001 From: William Berman Date: Wed, 1 Mar 2023 16:01:28 -0800 Subject: [PATCH 109/122] style --- .../en/api/pipelines/stable_diffusion/controlnet.mdx | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx b/docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx index 32fc2478b87f..55f1e40913eb 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx +++ b/docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx @@ -69,7 +69,9 @@ from diffusers import StableDiffusionControlNetPipeline from diffusers.utils import load_image # Canny edged image for control -canny_edged_image = load_image("https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/vermeer_canny_edged.png") +canny_edged_image = load_image( + "https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/vermeer_canny_edged.png" +) pipe = StableDiffusionControlNetPipeline.from_pretrained("takuma104/control_sd15_canny").to("cuda") image = pipe(prompt="best quality, extremely detailed", image=canny_edged_image).images[0] image.save("generated.png") @@ -84,7 +86,9 @@ from diffusers import StableDiffusionControlNetPipeline, AutoencoderKL, UNet2DCo from diffusers.utils import load_image # Canny edged image for control -canny_edged_image = load_image("https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/vermeer_canny_edged.png") +canny_edged_image = load_image( + "https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/vermeer_canny_edged.png" +) base_model_id = "prompthero/openjourney" # an example: openjourney model vae = AutoencoderKL.from_pretrained(base_model_id, subfolder="vae").to("cuda") From 10dedd9eaf91af83765d87e53371f2a2c4c6c878 Mon Sep 17 00:00:00 2001 From: William Berman Date: Wed, 1 Mar 2023 16:09:57 -0800 Subject: [PATCH 110/122] [docs] controlling generation doc nits --- docs/source/en/using-diffusers/controlling_generation.mdx | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/docs/source/en/using-diffusers/controlling_generation.mdx b/docs/source/en/using-diffusers/controlling_generation.mdx index 94d579905ebb..74da955abcf2 100644 --- a/docs/source/en/using-diffusers/controlling_generation.mdx +++ b/docs/source/en/using-diffusers/controlling_generation.mdx @@ -152,11 +152,9 @@ See [here](../training/text_inversion) for more information on how to use it. [Paper](https://arxiv.org/abs/2302.05543) -[ControlNet](../api/pipelines/stable_diffusion/controlnet) is a neural network structure to control diffusion models by adding extra conditions. -There are 8 pre-trained ControlNet models that were trained to condition the original Stable Diffusion model on different inputs, -such as edge detection, scribbles, depth maps, semantic segmentations and more. - -Using the pretrained models we can provide control images (for example, a depth map) to control Stable Diffusion text-to-image generation so that it follows the structure of the depth image and fills in the details. +[ControlNet](../api/pipelines/stable_diffusion/controlnet) is an auxiliary network which adds an extra condition. +There are 8 canonical pre-trained ControlNets trained on different conditionings such as edge detection, scribbles, +depth maps, and semantic segmentations. 
See [here](../api/pipelines/stable_diffusion/controlnet) for more information on how to use it. From 042c75eb846f5d71b0164b4efd2a5165837f774a Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 2 Mar 2023 11:49:08 +0000 Subject: [PATCH 111/122] correct some things --- .../pipelines/stable_diffusion/controlnet.mdx | 125 +++++++++++++++--- .../pipeline_stable_diffusion_controlnet.py | 13 +- .../test_stable_diffusion_controlnet.py | 53 ++++---- 3 files changed, 143 insertions(+), 48 deletions(-) diff --git a/docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx b/docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx index 55f1e40913eb..e38fe97f3a91 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx +++ b/docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx @@ -22,6 +22,8 @@ The abstract of the paper is the following: *We present a neural network structure, ControlNet, to control pretrained large diffusion models to support additional input conditions. The ControlNet learns task-specific conditions in an end-to-end way, and the learning is robust even when the training dataset is small (< 50k). Moreover, training a ControlNet is as fast as fine-tuning a diffusion model, and the model can be trained on a personal devices. Alternatively, if powerful computation clusters are available, the model can scale to large amounts (millions to billions) of data. We report that large diffusion models like Stable Diffusion can be augmented with ControlNets to enable conditional inputs like edge maps, segmentation maps, keypoints, etc. This may enrich the methods to control large diffusion models and further facilitate related applications.* +This model was contributed by the amazing community contributor [takuma104](https://huggingface.co/takuma104) ❤️ . + Resources: * [Paper](https://arxiv.org/abs/2302.05543) @@ -33,32 +35,93 @@ Resources: |---|---|:---:| | [StableDiffusionControlNetPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py) | *Text-to-Image Generation with ControlNet Conditioning* | [Colab Example](https://colab.research.google.com/drive/1AiR7Q-sBqO88NCyswpfiuwXZc7DfMyKA?usp=sharing) | - -## Available checkpoints +## Usage example -ControlNet requires a *control image* in addition to the text-to-image *prompt*. -Each pretrained model is trained using a different conditioning method that requires different images for conditioning the generated outputs. For example, Canny edge conditioning requires the control image to be the output of a Canny filter, while depth conditioning requires the control image to be a depth map. See the overview and image examples below to know more. +In the following we give a simple example of how to use a *ControlNet* checkpoint with Diffusers for inference. +The inference pipeline is the same for all pipelines: -All checkpoints are converted from [lllyasviel/ControlNet](https://huggingface.co/lllyasviel/ControlNet). +* 1. Take an image and run it through a pre-conditioning processor. +* 2. Run the pre-processed image through the [`StableDiffusionControlNetPipeline`]. -### ControlNet + Stable Diffusion 1.5 +Let's have a look at a simple example using the [Canny Edge ControlNet](https://huggingface.co/fusing/sd-controlnet-canny). -| Model Name | Control Image Overview| Control Image Example | Generated Image Example | -|---|---|---|---| -|[takuma104/control_sd15_canny](https://huggingface.co/takuma104/control_sd15_canny)
*Trained with canny edge detection* | A monochrome image with white edges on a black background.||| -|[takuma104/control_sd15_depth](https://huggingface.co/takuma104/control_sd15_depth)
*Trained with Midas depth estimation* |A grayscale image with black representing deep areas and white representing shallow areas.||| -|[takuma104/control_sd15_hed](https://huggingface.co/takuma104/control_sd15_hed)
*Trained with HED edge detection (soft edge)* |A monochrome image with white soft edges on a black background.|| | -|[takuma104/control_sd15_mlsd](https://huggingface.co/takuma104/control_sd15_mlsd)
*Trained with M-LSD line detection* |A monochrome image composed only of white straight lines on a black background.||| -|[takuma104/control_sd15_normal](https://huggingface.co/takuma104/control_sd15_normal)
*Trained with normal map* |A [normal mapped](https://en.wikipedia.org/wiki/Normal_mapping) image.||| -|[takuma104/control_sd15_openpose](https://huggingface.co/takuma104/control_sd15_openpose)
*Trained with OpenPose bone image* |A [OpenPose bone](https://github.com/CMU-Perceptual-Computing-Lab/openpose) image.||| -|[takuma104/control_sd15_scribble](https://huggingface.co/takuma104/control_sd15_scribble)
*Trained with human scribbles* |A hand-drawn monochrome image with white outlines on a black background.|| | -|[takuma104/control_sd15_seg](https://huggingface.co/takuma104/control_sd15_seg)
*Trained with semantic segmentation* |An [ADE20K](https://groups.csail.mit.edu/vision/datasets/ADE20K/)'s segmentation protocol image.|| | +```python +from diffusers import StableDiffusionControlNetPipeline +from diffusers.utils import load_image -## Usage example +# Let's load the popular vermeer image +image = load_image( + "https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png" +) +``` + +![img](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png) -- Basic Example (Canny Edge) +Next, we process the image to get the canny image. This is step *1.* - running the pre-conditioning processor. The pre-conditioning processor is different for every ControlNet. Please see the model cards of the [official checkpoints](#controlnet-with-stable-diffusion-1.5) for more information about other models. -The conditioning image is an outline of the image edges, as detected by a Canny filter. This is the example we'll use to control the generation: +First, we need to install opencv: + +``` +pip install opencv-contrib-python +``` + +Then we can retrieve the canny edges of the image. + +```python +import cv2 +from PIL import Image +import numpy as np + +image = np.array(image) + +low_threshold = 100 +high_threshold = 200 + +image = cv2.Canny(image, low_threshold, high_threshold) +image = image[:, :, None] +image = np.concatenate([image, image, image], axis=2) +canny_image = Image.fromarray(image) +``` + +Let's take a look at the processed image. + +![img](https://huggingface.co/datasets/huggingface/documentation-images/blob/main/diffusers/vermeer_canny_edged.png) + +Now, we load the official [Stable Diffusion 1.5 Model](runwayml/stable-diffusion-v1-5) as well as the ControlNet for canny edges. + +```py +from diffusers import StableDiffusionControlNetPipeline, ControlNetModel +import torch + +controlnet = ControlNetModel.from_pretrained("fusing/sd-controlnet-canny", torch_dtype=torch.float16) +pipe = StableDiffusionControlNetPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16 +) +``` + +To speed-up things and reduce memory, let's enable model offloading and use the fast [`UniPCMultistepScheduler`]. + +```py +from diffusers import UniPCMultistepScheduler + +pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config) + +# this command loads the individual model components on GPU on-demand. +pipe.enable_model_cpu_offload() +``` + +Finally, we can run the pipeline: + +```py +generator = torch.manual_seed(0) + +out_image = pipe("colorful painting of woman", num_inference_steps=20, generator=generator).images[0] +``` + +This should take only around 3-4 seconds on GPU (depending on hardware). The output image then looks as follows: + + +The conditioning image is an outline of the image edges, as detected by a Canny filter. 
This is the example we'll use to control the generation ![White on black edges detected on Vermeer's Girl with a Pearl Earring portrait](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/vermeer_canny_edged.png) @@ -72,6 +135,7 @@ from diffusers.utils import load_image canny_edged_image = load_image( "https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/vermeer_canny_edged.png" ) + pipe = StableDiffusionControlNetPipeline.from_pretrained("takuma104/control_sd15_canny").to("cuda") image = pipe(prompt="best quality, extremely detailed", image=canny_edged_image).images[0] image.save("generated.png") @@ -99,6 +163,27 @@ image = pipe(prompt="best quality, extremely detailed", image=canny_edged_image, image.save("generated.png") ``` + +## Available checkpoints + +ControlNet requires a *control image* in addition to the text-to-image *prompt*. +Each pretrained model is trained using a different conditioning method that requires different images for conditioning the generated outputs. For example, Canny edge conditioning requires the control image to be the output of a Canny filter, while depth conditioning requires the control image to be a depth map. See the overview and image examples below to know more. + +All checkpoints can be found under the authors' namespace [lllyasviel](https://huggingface.co/lllyasviel). + +### ControlNet with Stable Diffusion 1.5 + +| Model Name | Control Image Overview| Control Image Example | Generated Image Example | +|---|---|---|---| +|[fusing/sd-controlnet-canny](https://huggingface.co/fusing/sd-controlnet-canny)
*Trained with canny edge detection* | A monochrome image with white edges on a black background.||| +|[fusing/sd-controlnet-depth](https://huggingface.co/fusing/sd-controlnet-depth)
*Trained with Midas depth estimation* |A grayscale image with black representing deep areas and white representing shallow areas.||| +|[fusing/sd-controlnet-hed](https://huggingface.co/fusing/sd-controlnet-hed)
*Trained with HED edge detection (soft edge)* |A monochrome image with white soft edges on a black background.|| | +|[fusing/sd-controlnet-mlsd](https://huggingface.co/fusing/sd-controlnet-mlsd)
*Trained with M-LSD line detection* |A monochrome image composed only of white straight lines on a black background.||| +|[fusing/sd-controlnet-normal](https://huggingface.co/fusing/sd-controlnet-normal)
*Trained with normal map* |A [normal mapped](https://en.wikipedia.org/wiki/Normal_mapping) image.||| +|[fusing/sd-controlnet_openpose](https://huggingface.co/fusing/sd-controlnet_openpose)
*Trained with OpenPose bone image* |A [OpenPose bone](https://github.com/CMU-Perceptual-Computing-Lab/openpose) image.||| +|[fusing/sd-controlnet_scribble](https://huggingface.co/fusing/sd-controlnet_scribble)
*Trained with human scribbles* |A hand-drawn monochrome image with white outlines on a black background.|| | +|[fusing/sd-controlnet_seg](https://huggingface.co/fusing/sd-controlnet_seg)
*Trained with semantic segmentation* |An [ADE20K](https://groups.csail.mit.edu/vision/datasets/ADE20K/)'s segmentation protocol image.|| | + [[autodoc]] StableDiffusionControlNetPipeline - all - - __call__ \ No newline at end of file + - __call__ diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py index cc3c856824d7..cb1f82db0d63 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py @@ -183,12 +183,16 @@ def enable_model_cpu_offload(self, gpu_id=0): device = torch.device(f"cuda:{gpu_id}") hook = None - for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae, self.controlnet]: + for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]: _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) if self.safety_checker is not None: + # the safety checker can offload the vae again _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) + # control net hook has be manually offloaded as it alternates with unet + cpu_offload_with_hook(self.controlnet, device) + # We'll offload the last model manually. self.final_offload_hook = hook @@ -750,6 +754,13 @@ def __call__( if callback is not None and i % callback_steps == 0: callback(i, t, latents) + # If we do sequential model offloading, let's offload unet and controlnet + # manually for max memory savings + if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: + self.unet.to("cpu") + self.controlnet.to("cpu") + torch.cuda.empty_cache() + if output_type == "latent": image = latents has_nsfw_concept = None diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py index fa039ebc5a5a..077e2bfe25ed 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py @@ -149,12 +149,12 @@ def tearDown(self): torch.cuda.empty_cache() def test_canny(self): - controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-canny") + controlnet = ControlNetModel.from_pretrained("fusing/sd-controlnet-canny") pipe = StableDiffusionControlNetPipeline.from_pretrained( "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet ) - pipe = pipe.to(torch_device) + pipe.enable_model_cpu_offload() pipe.set_progress_bar_config(disable=None) generator = torch.Generator(device="cpu").manual_seed(0) @@ -173,15 +173,15 @@ def test_canny(self): "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny_out.npy" ) - assert np.abs(expected_image - image).max() < 1e-4 + assert np.abs(expected_image - image).max() < 5e-3 def test_depth(self): - controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-depth") + controlnet = ControlNetModel.from_pretrained("fusing/sd-controlnet-depth") pipe = StableDiffusionControlNetPipeline.from_pretrained( "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet ) - pipe = pipe.to(torch_device) + pipe.enable_model_cpu_offload() pipe.set_progress_bar_config(disable=None) generator = torch.Generator(device="cpu").manual_seed(0) @@ -200,15 +200,15 @@ def test_depth(self): 
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/stormtrooper_depth_out.npy" ) - assert np.abs(expected_image - image).max() < 1e-4 + assert np.abs(expected_image - image).max() < 5e-3 def test_hed(self): - controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-hed") + controlnet = ControlNetModel.from_pretrained("fusing/sd-controlnet-hed") pipe = StableDiffusionControlNetPipeline.from_pretrained( "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet ) - pipe = pipe.to(torch_device) + pipe.enable_model_cpu_offload() pipe.set_progress_bar_config(disable=None) generator = torch.Generator(device="cpu").manual_seed(0) @@ -227,15 +227,15 @@ def test_hed(self): "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/man_hed_out.npy" ) - assert np.abs(expected_image - image).max() < 1e-4 + assert np.abs(expected_image - image).max() < 5e-3 def test_mlsd(self): - controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-mlsd") + controlnet = ControlNetModel.from_pretrained("fusing/sd-controlnet-mlsd") pipe = StableDiffusionControlNetPipeline.from_pretrained( "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet ) - pipe = pipe.to(torch_device) + pipe.enable_model_cpu_offload() pipe.set_progress_bar_config(disable=None) generator = torch.Generator(device="cpu").manual_seed(0) @@ -254,15 +254,15 @@ def test_mlsd(self): "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/room_mlsd_out.npy" ) - assert np.abs(expected_image - image).max() < 1e-4 + assert np.abs(expected_image - image).max() < 5e-3 def test_normal(self): - controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-normal") + controlnet = ControlNetModel.from_pretrained("fusing/sd-controlnet-normal") pipe = StableDiffusionControlNetPipeline.from_pretrained( "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet ) - pipe = pipe.to(torch_device) + pipe.enable_model_cpu_offload() pipe.set_progress_bar_config(disable=None) generator = torch.Generator(device="cpu").manual_seed(0) @@ -281,15 +281,15 @@ def test_normal(self): "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/cute_toy_normal_out.npy" ) - assert np.abs(expected_image - image).max() < 1e-4 + assert np.abs(expected_image - image).max() < 5e-3 def test_openpose(self): - controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-openpose") + controlnet = ControlNetModel.from_pretrained("fusing/sd-controlnet-openpose") pipe = StableDiffusionControlNetPipeline.from_pretrained( "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet ) - pipe = pipe.to(torch_device) + pipe.enable_model_cpu_offload() pipe.set_progress_bar_config(disable=None) generator = torch.Generator(device="cpu").manual_seed(0) @@ -308,15 +308,15 @@ def test_openpose(self): "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/chef_pose_out.npy" ) - assert np.abs(expected_image - image).max() < 1e-4 + assert np.abs(expected_image - image).max() < 5e-3 def test_scribble(self): - controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-scribble") + controlnet = ControlNetModel.from_pretrained("fusing/sd-controlnet-scribble") pipe = 
StableDiffusionControlNetPipeline.from_pretrained( "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet ) - pipe = pipe.to(torch_device) + pipe.enable_model_cpu_offload() pipe.set_progress_bar_config(disable=None) generator = torch.Generator(device="cpu").manual_seed(5) @@ -335,15 +335,15 @@ def test_scribble(self): "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bag_scribble_out.npy" ) - assert np.abs(expected_image - image).max() < 1e-4 + assert np.abs(expected_image - image).max() < 5e-3 def test_seg(self): - controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-seg") + controlnet = ControlNetModel.from_pretrained("fusing/sd-controlnet-seg") pipe = StableDiffusionControlNetPipeline.from_pretrained( "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet ) - pipe = pipe.to(torch_device) + pipe.enable_model_cpu_offload() pipe.set_progress_bar_config(disable=None) generator = torch.Generator(device="cpu").manual_seed(5) @@ -362,19 +362,18 @@ def test_seg(self): "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/house_seg_out.npy" ) - assert np.abs(expected_image - image).max() < 1e-4 + assert np.abs(expected_image - image).max() < 5e-3 def test_sequential_cpu_offloading(self): torch.cuda.empty_cache() torch.cuda.reset_max_memory_allocated() torch.cuda.reset_peak_memory_stats() - controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-seg") + controlnet = ControlNetModel.from_pretrained("fusing/sd-controlnet-seg") pipe = StableDiffusionControlNetPipeline.from_pretrained( "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet ) - pipe = pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() pipe.enable_sequential_cpu_offload() @@ -393,4 +392,4 @@ def test_sequential_cpu_offloading(self): mem_bytes = torch.cuda.max_memory_allocated() # make sure that less than 7 GB is allocated - assert mem_bytes < 7 * 10**9 + assert mem_bytes < 4 * 10**9 From ff2e69165d424911c87832d2a46fc9eff77b683b Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Thu, 2 Mar 2023 17:36:00 +0530 Subject: [PATCH 112/122] add: controlnetmodel to autodoc. 
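This change surfaces `ControlNetModel` (and its `ControlNetOutput`) in the model API docs. The model consumes the current latents, timestep, text embeddings, and a preprocessed conditioning image, and returns per-block residuals that the UNet adds to its skip connections. Below is a minimal sketch of that interaction, assuming the `lllyasviel/sd-controlnet-canny` weights used elsewhere in this series and dummy tensors shaped for Stable Diffusion 1.5:

```py
# Minimal sketch: how ControlNetModel plugs into the UNet (dummy tensors, SD 1.5 shapes assumed).
import torch
from diffusers import ControlNetModel, UNet2DConditionModel

controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny")
unet = UNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="unet")

sample = torch.randn(1, 4, 64, 64)  # noisy latents for a 512x512 image
timestep = torch.tensor([999])
encoder_hidden_states = torch.randn(1, 77, 768)  # CLIP text embeddings
controlnet_cond = torch.rand(1, 3, 512, 512)  # preprocessed control image (e.g. canny edges), values in [0, 1]

with torch.no_grad():
    # ControlNet returns one residual per down block plus one for the mid block
    down_block_res_samples, mid_block_res_sample = controlnet(
        sample,
        timestep,
        encoder_hidden_states=encoder_hidden_states,
        controlnet_cond=controlnet_cond,
        return_dict=False,
    )

    # The UNet adds these residuals to its skip connections; this is what
    # StableDiffusionControlNetPipeline does inside its denoising loop.
    noise_pred = unet(
        sample,
        timestep,
        encoder_hidden_states=encoder_hidden_states,
        down_block_additional_residuals=down_block_res_samples,
        mid_block_additional_residual=mid_block_res_sample,
    ).sample
```

`StableDiffusionControlNetPipeline` performs this wiring internally during denoising, which is why the pipeline examples only pass `controlnet=controlnet` at construction time.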
--- docs/source/en/api/models.mdx | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/source/en/api/models.mdx b/docs/source/en/api/models.mdx index 6db2eba32220..dc425e98628c 100644 --- a/docs/source/en/api/models.mdx +++ b/docs/source/en/api/models.mdx @@ -64,6 +64,12 @@ The models are built on the base class ['ModelMixin'] that is a `torch.nn.module ## PriorTransformerOutput [[autodoc]] models.prior_transformer.PriorTransformerOutput +## ControlNetOutput +[[autodoc]] models.controlnet.ControlNetOutput + +## ControlNetModel +[[autodoc]] ControlNetModel + ## FlaxModelMixin [[autodoc]] FlaxModelMixin From 9cb8816370556673e432e203680965f4bfc2ebdf Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 2 Mar 2023 12:45:38 +0000 Subject: [PATCH 113/122] finish docs --- .../pipelines/stable_diffusion/controlnet.mdx | 53 +++++-------------- .../pipeline_stable_diffusion_controlnet.py | 7 ++- .../test_stable_diffusion_controlnet.py | 3 ++ 3 files changed, 20 insertions(+), 43 deletions(-) diff --git a/docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx b/docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx index e38fe97f3a91..5b26d80d62c3 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx +++ b/docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx @@ -115,55 +115,20 @@ Finally, we can run the pipeline: ```py generator = torch.manual_seed(0) -out_image = pipe("colorful painting of woman", num_inference_steps=20, generator=generator).images[0] +out_image = pipe( + "disco dancer with colorful lights", num_inference_steps=20, generator=generator, image=canny_image +).images[0] ``` This should take only around 3-4 seconds on GPU (depending on hardware). The output image then looks as follows: +![img](https://huggingface.co/datasets/huggingface/documentation-images/blob/main/diffusers/vermeer_disco_dancing.png) -The conditioning image is an outline of the image edges, as detected by a Canny filter. This is the example we'll use to control the generation -![White on black edges detected on Vermeer's Girl with a Pearl Earring portrait](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/vermeer_canny_edged.png) - -In the following example, note that the text prompt does not make any reference to the structure or contents of the image we are generating. Stable Diffusion interprets the control image as an additional input that controls what to generate. - -```python -from diffusers import StableDiffusionControlNetPipeline -from diffusers.utils import load_image - -# Canny edged image for control -canny_edged_image = load_image( - "https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/vermeer_canny_edged.png" -) - -pipe = StableDiffusionControlNetPipeline.from_pretrained("takuma104/control_sd15_canny").to("cuda") -image = pipe(prompt="best quality, extremely detailed", image=canny_edged_image).images[0] -image.save("generated.png") -``` - -- Controlling custom Stable Diffusion 1.5 models - -In the following example we use PromptHero's [Openjourney model](https://huggingface.co/prompthero/openjourney), which was fine-tuned from the base Stable Diffusion v1.5 model on images from Midjourney. This model has the same structure as Stable Diffusion 1.5 but is capable of producing outputs in a different style. 
- -```py -from diffusers import StableDiffusionControlNetPipeline, AutoencoderKL, UNet2DConditionModel -from diffusers.utils import load_image - -# Canny edged image for control -canny_edged_image = load_image( - "https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/vermeer_canny_edged.png" -) - -base_model_id = "prompthero/openjourney" # an example: openjourney model -vae = AutoencoderKL.from_pretrained(base_model_id, subfolder="vae").to("cuda") -unet = UNet2DConditionModel.from_pretrained(base_model_id, subfolder="unet").to("cuda") - -pipe = StableDiffusionControlNetPipeline.from_pretrained("takuma104/control_sd15_canny", unet=unet, vae=vae).to("cuda") -image = pipe(prompt="best quality, extremely detailed", image=canny_edged_image, width=512, height=512).images[0] -image.save("generated.png") -``` +**Note**: To see how to run all other ControlNet checkpoints, please have a look at [ControlNet with Stable Diffusion 1.5](#controlnet-with-stable-diffusion-1.5) + ## Available checkpoints ControlNet requires a *control image* in addition to the text-to-image *prompt*. @@ -187,3 +152,9 @@ All checkpoints can be found under the authors' namespace [lllyasviel](https://h [[autodoc]] StableDiffusionControlNetPipeline - all - __call__ + - enable_attention_slicing + - disable_attention_slicing + - enable_vae_slicing + - disable_vae_slicing + - enable_xformers_memory_efficient_attention + - disable_xformers_memory_efficient_attention diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py index cb1f82db0d63..d48a67e34e34 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py @@ -447,7 +447,7 @@ def check_inputs( if not image_is_pil and not image_is_tensor and not image_is_pil_list and not image_is_tensor_list: raise TypeError( - "image must be one of PIL image, torch tensor, list of PIL images, or list of torch tensors" + "image must be passed and be one of PIL image, torch tensor, list of PIL images, or list of torch tensors" ) if image_is_pil: @@ -641,7 +641,10 @@ def __call__( list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw) content, according to the `safety_checker`. """ - # 0. Default height and width to image + # 0. Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + height, width = self._default_height_width(height, width, image) # 1. Check inputs. 
Raise error if not correct diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py index 077e2bfe25ed..b5c7f8d3c06a 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py @@ -31,11 +31,14 @@ from diffusers.utils.import_utils import is_xformers_available from diffusers.utils.testing_utils import require_torch_gpu +from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS from ...test_pipelines_common import PipelineTesterMixin class StableDiffusionControlNetPipelineFastTests(PipelineTesterMixin, unittest.TestCase): pipeline_class = StableDiffusionControlNetPipeline + params = TEXT_TO_IMAGE_PARAMS + batch_params = TEXT_TO_IMAGE_BATCH_PARAMS def get_dummy_components(self): torch.manual_seed(0) From 3bbc356d4179c05d37c07afea48642f2ec101bdb Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 2 Mar 2023 13:01:03 +0000 Subject: [PATCH 114/122] finish --- .../pipelines/stable_diffusion/controlnet.mdx | 20 +++++++++---------- .../pipeline_stable_diffusion_controlnet.py | 3 --- .../test_stable_diffusion_controlnet.py | 18 ++++++++--------- 3 files changed, 19 insertions(+), 22 deletions(-) diff --git a/docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx b/docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx index 5b26d80d62c3..916ddd621ae3 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx +++ b/docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx @@ -43,7 +43,7 @@ The inference pipeline is the same for all pipelines: * 1. Take an image and run it through a pre-conditioning processor. * 2. Run the pre-processed image through the [`StableDiffusionControlNetPipeline`]. -Let's have a look at a simple example using the [Canny Edge ControlNet](https://huggingface.co/fusing/sd-controlnet-canny). +Let's have a look at a simple example using the [Canny Edge ControlNet](https://huggingface.co/lllyasviel/sd-controlnet-canny). ```python from diffusers import StableDiffusionControlNetPipeline @@ -93,7 +93,7 @@ Now, we load the official [Stable Diffusion 1.5 Model](runwayml/stable-diffusion from diffusers import StableDiffusionControlNetPipeline, ControlNetModel import torch -controlnet = ControlNetModel.from_pretrained("fusing/sd-controlnet-canny", torch_dtype=torch.float16) +controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16) pipe = StableDiffusionControlNetPipeline.from_pretrained( "runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16 ) @@ -140,14 +140,14 @@ All checkpoints can be found under the authors' namespace [lllyasviel](https://h | Model Name | Control Image Overview| Control Image Example | Generated Image Example | |---|---|---|---| -|[fusing/sd-controlnet-canny](https://huggingface.co/fusing/sd-controlnet-canny)
*Trained with canny edge detection* | A monochrome image with white edges on a black background.||| -|[fusing/sd-controlnet-depth](https://huggingface.co/fusing/sd-controlnet-depth)
*Trained with Midas depth estimation* |A grayscale image with black representing deep areas and white representing shallow areas.||| -|[fusing/sd-controlnet-hed](https://huggingface.co/fusing/sd-controlnet-hed)
*Trained with HED edge detection (soft edge)* |A monochrome image with white soft edges on a black background.|| | -|[fusing/sd-controlnet-mlsd](https://huggingface.co/fusing/sd-controlnet-mlsd)
*Trained with M-LSD line detection* |A monochrome image composed only of white straight lines on a black background.||| -|[fusing/sd-controlnet-normal](https://huggingface.co/fusing/sd-controlnet-normal)
*Trained with normal map* |A [normal mapped](https://en.wikipedia.org/wiki/Normal_mapping) image.||| -|[fusing/sd-controlnet_openpose](https://huggingface.co/fusing/sd-controlnet_openpose)
*Trained with OpenPose bone image* |A [OpenPose bone](https://github.com/CMU-Perceptual-Computing-Lab/openpose) image.||| -|[fusing/sd-controlnet_scribble](https://huggingface.co/fusing/sd-controlnet_scribble)
*Trained with human scribbles* |A hand-drawn monochrome image with white outlines on a black background.|| | -|[fusing/sd-controlnet_seg](https://huggingface.co/fusing/sd-controlnet_seg)
*Trained with semantic segmentation* |An [ADE20K](https://groups.csail.mit.edu/vision/datasets/ADE20K/)'s segmentation protocol image.|| | +|[lllyasviel/sd-controlnet-canny](https://huggingface.co/lllyasviel/sd-controlnet-canny)
*Trained with canny edge detection* |A monochrome image with white edges on a black background.||| +|[lllyasviel/sd-controlnet-depth](https://huggingface.co/lllyasviel/sd-controlnet-depth)
*Trained with Midas depth estimation* |A grayscale image with black representing deep areas and white representing shallow areas.||| +|[lllyasviel/sd-controlnet-hed](https://huggingface.co/lllyasviel/sd-controlnet-hed)
*Trained with HED edge detection (soft edge)* |A monochrome image with white soft edges on a black background.|| | +|[lllyasviel/sd-controlnet-mlsd](https://huggingface.co/lllyasviel/sd-controlnet-mlsd)
*Trained with M-LSD line detection* |A monochrome image composed only of white straight lines on a black background.||| +|[lllyasviel/sd-controlnet-normal](https://huggingface.co/lllyasviel/sd-controlnet-normal)
*Trained with normal map* |A [normal mapped](https://en.wikipedia.org/wiki/Normal_mapping) image.||| +|[lllyasviel/sd-controlnet-openpose](https://huggingface.co/lllyasviel/sd-controlnet-openpose)
*Trained with OpenPose bone image* |An [OpenPose bone](https://github.com/CMU-Perceptual-Computing-Lab/openpose) image.||| +|[lllyasviel/sd-controlnet-scribble](https://huggingface.co/lllyasviel/sd-controlnet-scribble)
*Trained with human scribbles* |A hand-drawn monochrome image with white outlines on a black background.||| +|[lllyasviel/sd-controlnet-seg](https://huggingface.co/lllyasviel/sd-controlnet-seg)
*Trained with semantic segmentation* |An [ADE20K](https://groups.csail.mit.edu/vision/datasets/ADE20K/)'s segmentation protocol image.|| | [[autodoc]] StableDiffusionControlNetPipeline - all diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py index d48a67e34e34..309caaaa1a19 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py @@ -642,9 +642,6 @@ def __call__( (nsfw) content, according to the `safety_checker`. """ # 0. Default height and width to unet - height = height or self.unet.config.sample_size * self.vae_scale_factor - width = width or self.unet.config.sample_size * self.vae_scale_factor - height, width = self._default_height_width(height, width, image) # 1. Check inputs. Raise error if not correct diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py index b5c7f8d3c06a..b32c395c5537 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py @@ -152,7 +152,7 @@ def tearDown(self): torch.cuda.empty_cache() def test_canny(self): - controlnet = ControlNetModel.from_pretrained("fusing/sd-controlnet-canny") + controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny") pipe = StableDiffusionControlNetPipeline.from_pretrained( "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet @@ -179,7 +179,7 @@ def test_canny(self): assert np.abs(expected_image - image).max() < 5e-3 def test_depth(self): - controlnet = ControlNetModel.from_pretrained("fusing/sd-controlnet-depth") + controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-depth") pipe = StableDiffusionControlNetPipeline.from_pretrained( "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet @@ -206,7 +206,7 @@ def test_depth(self): assert np.abs(expected_image - image).max() < 5e-3 def test_hed(self): - controlnet = ControlNetModel.from_pretrained("fusing/sd-controlnet-hed") + controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-hed") pipe = StableDiffusionControlNetPipeline.from_pretrained( "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet @@ -233,7 +233,7 @@ def test_hed(self): assert np.abs(expected_image - image).max() < 5e-3 def test_mlsd(self): - controlnet = ControlNetModel.from_pretrained("fusing/sd-controlnet-mlsd") + controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-mlsd") pipe = StableDiffusionControlNetPipeline.from_pretrained( "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet @@ -260,7 +260,7 @@ def test_mlsd(self): assert np.abs(expected_image - image).max() < 5e-3 def test_normal(self): - controlnet = ControlNetModel.from_pretrained("fusing/sd-controlnet-normal") + controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-normal") pipe = StableDiffusionControlNetPipeline.from_pretrained( "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet @@ -287,7 +287,7 @@ def test_normal(self): assert np.abs(expected_image - image).max() < 5e-3 def test_openpose(self): - controlnet = ControlNetModel.from_pretrained("fusing/sd-controlnet-openpose") + controlnet = 
ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-openpose") pipe = StableDiffusionControlNetPipeline.from_pretrained( "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet @@ -314,7 +314,7 @@ def test_openpose(self): assert np.abs(expected_image - image).max() < 5e-3 def test_scribble(self): - controlnet = ControlNetModel.from_pretrained("fusing/sd-controlnet-scribble") + controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-scribble") pipe = StableDiffusionControlNetPipeline.from_pretrained( "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet @@ -341,7 +341,7 @@ def test_scribble(self): assert np.abs(expected_image - image).max() < 5e-3 def test_seg(self): - controlnet = ControlNetModel.from_pretrained("fusing/sd-controlnet-seg") + controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-seg") pipe = StableDiffusionControlNetPipeline.from_pretrained( "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet @@ -372,7 +372,7 @@ def test_sequential_cpu_offloading(self): torch.cuda.reset_max_memory_allocated() torch.cuda.reset_peak_memory_stats() - controlnet = ControlNetModel.from_pretrained("fusing/sd-controlnet-seg") + controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-seg") pipe = StableDiffusionControlNetPipeline.from_pretrained( "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet From b8d190872fce54ccd57eb521f970bb80dc40824d Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 2 Mar 2023 13:41:36 +0000 Subject: [PATCH 115/122] finish 2 --- docs/source/en/api/pipelines/overview.mdx | 1 + docs/source/en/index.mdx | 1 + 2 files changed, 2 insertions(+) diff --git a/docs/source/en/api/pipelines/overview.mdx b/docs/source/en/api/pipelines/overview.mdx index bd24a170957b..6d3428997de7 100644 --- a/docs/source/en/api/pipelines/overview.mdx +++ b/docs/source/en/api/pipelines/overview.mdx @@ -46,6 +46,7 @@ available a colab notebook to directly try them out. |---|---|:---:|:---:| | [alt_diffusion](./alt_diffusion) | [**AltDiffusion**](https://arxiv.org/abs/2211.06679) | Image-to-Image Text-Guided Generation | - | [audio_diffusion](./audio_diffusion) | [**Audio Diffusion**](https://github.com/teticio/audio_diffusion.git) | Unconditional Audio Generation | +| [controlnet](./api/pipelines/controlnet) | [**ControlNet with Stable Diffusion**](https://arxiv.org/abs/2302.05543) | Image-to-Image Text-Guided Generation | | [cycle_diffusion](./cycle_diffusion) | [**Cycle Diffusion**](https://arxiv.org/abs/2210.05559) | Image-to-Image Text-Guided Generation | | [dance_diffusion](./dance_diffusion) | [**Dance Diffusion**](https://github.com/williamberman/diffusers.git) | Unconditional Audio Generation | | [ddpm](./ddpm) | [**Denoising Diffusion Probabilistic Models**](https://arxiv.org/abs/2006.11239) | Unconditional Image Generation | diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index 675278bf8924..fff2fe2da760 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -36,6 +36,7 @@ available a colab notebook to directly try them out. 
|---|---|:---:|:---:| | [alt_diffusion](./api/pipelines/alt_diffusion) | [**AltDiffusion**](https://arxiv.org/abs/2211.06679) | Image-to-Image Text-Guided Generation | | [audio_diffusion](./api/pipelines/audio_diffusion) | [**Audio Diffusion**](https://github.com/teticio/audio-diffusion.git) | Unconditional Audio Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/teticio/audio-diffusion/blob/master/notebooks/audio_diffusion_pipeline.ipynb) +| [controlnet](./api/pipelines/controlnet) | [**ControlNet with Stable Diffusion**](https://arxiv.org/abs/2302.05543) | Image-to-Image Text-Guided Generation | | [cycle_diffusion](./api/pipelines/cycle_diffusion) | [**Cycle Diffusion**](https://arxiv.org/abs/2210.05559) | Image-to-Image Text-Guided Generation | | [dance_diffusion](./api/pipelines/dance_diffusion) | [**Dance Diffusion**](https://github.com/williamberman/diffusers.git) | Unconditional Audio Generation | | [ddpm](./api/pipelines/ddpm) | [**Denoising Diffusion Probabilistic Models**](https://arxiv.org/abs/2006.11239) | Unconditional Image Generation | From 1f36d9e92c29370b51a7e901339754d8a7a30a2b Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 2 Mar 2023 14:43:22 +0100 Subject: [PATCH 116/122] correct images --- docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx b/docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx index 5b26d80d62c3..a5d7053cafc0 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx +++ b/docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx @@ -85,7 +85,7 @@ canny_image = Image.fromarray(image) Let's take a look at the processed image. -![img](https://huggingface.co/datasets/huggingface/documentation-images/blob/main/diffusers/vermeer_canny_edged.png) +![img](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/vermeer_canny_edged.png) Now, we load the official [Stable Diffusion 1.5 Model](runwayml/stable-diffusion-v1-5) as well as the ControlNet for canny edges. @@ -122,7 +122,7 @@ out_image = pipe( This should take only around 3-4 seconds on GPU (depending on hardware). The output image then looks as follows: -![img](https://huggingface.co/datasets/huggingface/documentation-images/blob/main/diffusers/vermeer_disco_dancing.png) +![img](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/vermeer_disco_dancing.png) **Note**: To see how to run all other ControlNet checkpoints, please have a look at [ControlNet with Stable Diffusion 1.5](#controlnet-with-stable-diffusion-1.5) From a610e4749bbc5d0ae57dc4c5b476e0fe60170217 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 2 Mar 2023 14:51:07 +0100 Subject: [PATCH 117/122] finish controlnet --- docs/source/en/api/pipelines/overview.mdx | 2 +- docs/source/en/index.mdx | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/en/api/pipelines/overview.mdx b/docs/source/en/api/pipelines/overview.mdx index 6d3428997de7..411f56ce5871 100644 --- a/docs/source/en/api/pipelines/overview.mdx +++ b/docs/source/en/api/pipelines/overview.mdx @@ -46,7 +46,7 @@ available a colab notebook to directly try them out. 
|---|---|:---:|:---:| | [alt_diffusion](./alt_diffusion) | [**AltDiffusion**](https://arxiv.org/abs/2211.06679) | Image-to-Image Text-Guided Generation | - | [audio_diffusion](./audio_diffusion) | [**Audio Diffusion**](https://github.com/teticio/audio_diffusion.git) | Unconditional Audio Generation | -| [controlnet](./api/pipelines/controlnet) | [**ControlNet with Stable Diffusion**](https://arxiv.org/abs/2302.05543) | Image-to-Image Text-Guided Generation | +| [controlnet](./api/pipelines/stable_diffusion/controlnet) | [**ControlNet with Stable Diffusion**](https://arxiv.org/abs/2302.05543) | Image-to-Image Text-Guided Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1AiR7Q-sBqO88NCyswpfiuwXZc7DfMyKA?usp=sharing) | [cycle_diffusion](./cycle_diffusion) | [**Cycle Diffusion**](https://arxiv.org/abs/2210.05559) | Image-to-Image Text-Guided Generation | | [dance_diffusion](./dance_diffusion) | [**Dance Diffusion**](https://github.com/williamberman/diffusers.git) | Unconditional Audio Generation | | [ddpm](./ddpm) | [**Denoising Diffusion Probabilistic Models**](https://arxiv.org/abs/2006.11239) | Unconditional Image Generation | diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index fff2fe2da760..5471d9235c7e 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -36,7 +36,7 @@ available a colab notebook to directly try them out. |---|---|:---:|:---:| | [alt_diffusion](./api/pipelines/alt_diffusion) | [**AltDiffusion**](https://arxiv.org/abs/2211.06679) | Image-to-Image Text-Guided Generation | | [audio_diffusion](./api/pipelines/audio_diffusion) | [**Audio Diffusion**](https://github.com/teticio/audio-diffusion.git) | Unconditional Audio Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/teticio/audio-diffusion/blob/master/notebooks/audio_diffusion_pipeline.ipynb) -| [controlnet](./api/pipelines/controlnet) | [**ControlNet with Stable Diffusion**](https://arxiv.org/abs/2302.05543) | Image-to-Image Text-Guided Generation | +| [controlnet](./api/pipelines/stable_diffusion/controlnet) | [**ControlNet with Stable Diffusion**](https://arxiv.org/abs/2302.05543) | Image-to-Image Text-Guided Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1AiR7Q-sBqO88NCyswpfiuwXZc7DfMyKA?usp=sharing) | [cycle_diffusion](./api/pipelines/cycle_diffusion) | [**Cycle Diffusion**](https://arxiv.org/abs/2210.05559) | Image-to-Image Text-Guided Generation | | [dance_diffusion](./api/pipelines/dance_diffusion) | [**Dance Diffusion**](https://github.com/williamberman/diffusers.git) | Unconditional Audio Generation | | [ddpm](./api/pipelines/ddpm) | [**Denoising Diffusion Probabilistic Models**](https://arxiv.org/abs/2006.11239) | Unconditional Image Generation | From 9947fcbd5f00f905dfbd4a33751dfda4aa9bb7af Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 2 Mar 2023 14:57:08 +0100 Subject: [PATCH 118/122] Apply suggestions from code review Co-authored-by: Pedro Cuenca --- src/diffusers/models/controlnet.py | 1 + .../stable_diffusion/test_stable_diffusion_controlnet.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/diffusers/models/controlnet.py b/src/diffusers/models/controlnet.py index b7a0406b0c1d..0d7691ffc4f5 100644 --- a/src/diffusers/models/controlnet.py +++ 
b/src/diffusers/models/controlnet.py @@ -42,6 +42,7 @@ class ControlNetOutput(BaseOutput): class ControlNetConditioningEmbedding(nn.Module): """ + Quoting from https://arxiv.org/abs/2302.05543: "Stable Diffusion uses a pre-processing method similar to VQ-GAN [11] to convert the entire dataset of 512 × 512 images into smaller 64 × 64 “latent images” for stabilized training. This requires ControlNets to convert image-based conditions to 64 × 64 feature space to match the convolution size. We use a tiny network E(·) of four diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py index b32c395c5537..406cbc9ad089 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2022 HuggingFace Inc. +# Copyright 2023 HuggingFace Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 592f3891348e15d6a22bbbb09e4d6c8957997145 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 2 Mar 2023 14:04:57 +0000 Subject: [PATCH 119/122] uP --- src/diffusers/models/controlnet.py | 13 ++++--- .../pipeline_stable_diffusion_controlnet.py | 35 ++++++++++++++++--- 2 files changed, 36 insertions(+), 12 deletions(-) diff --git a/src/diffusers/models/controlnet.py b/src/diffusers/models/controlnet.py index 0d7691ffc4f5..5b55da5af372 100644 --- a/src/diffusers/models/controlnet.py +++ b/src/diffusers/models/controlnet.py @@ -42,13 +42,12 @@ class ControlNetOutput(BaseOutput): class ControlNetConditioningEmbedding(nn.Module): """ - Quoting from https://arxiv.org/abs/2302.05543: - "Stable Diffusion uses a pre-processing method similar to VQ-GAN [11] to convert the entire dataset of 512 × 512 - images into smaller 64 × 64 “latent images” for stabilized training. This requires ControlNets to convert - image-based conditions to 64 × 64 feature space to match the convolution size. We use a tiny network E(·) of four - convolution layers with 4 × 4 kernels and 2 × 2 strides (activated by ReLU, channels are 16, 32, 64, 128, - initialized with Gaussian weights, trained jointly with the full model) to encode image-space conditions ... into - feature maps ..." + Quoting from https://arxiv.org/abs/2302.05543: "Stable Diffusion uses a pre-processing method similar to VQ-GAN + [11] to convert the entire dataset of 512 × 512 images into smaller 64 × 64 “latent images” for stabilized + training. This requires ControlNets to convert image-based conditions to 64 × 64 feature space to match the + convolution size. We use a tiny network E(·) of four convolution layers with 4 × 4 kernels and 2 × 2 strides + (activated by ReLU, channels are 16, 32, 64, 128, initialized with Gaussian weights, trained jointly with the full + model) to encode image-space conditions ... into feature maps ..." 
""" def __init__( diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py index 309caaaa1a19..7917f61789c1 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py @@ -42,15 +42,40 @@ EXAMPLE_DOC_STRING = """ Examples: ```py + >>> # !pip install opencv-python transformers accelerate >>> from diffusers import StableDiffusionControlNetPipeline >>> from diffusers.utils import load_image - >>> # Canny edged image for control - >>> canny_edged_image = load_image( - ... "https://huggingface.co/takuma104/controlnet_dev/resolve/main/vermeer_canny_edged.png" + >>> # download an image + >>> image = load_image( + ... "https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png" ... ) - >>> pipe = StableDiffusionControlNetPipeline.from_pretrained("takuma104/control_sd15_canny").to("cuda") - >>> image = pipe(prompt="best quality, extremely detailed", image=canny_edged_image).images[0] + >>> image = np.array(image) + + >>> # get canny image + >>> image = cv2.Canny(image, 100, 200) + >>> image = image[:, :, None] + >>> image = np.concatenate([image, image, image], axis=2) + >>> canny_image = Image.fromarray(image) + + >>> # load control net and stable diffusion v1-5 + >>> controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16) + >>> pipe = StableDiffusionControlNetPipeline.from_pretrained( + ... "runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16 + ... ) + + >>> # speed up diffusion process with faster scheduler and memory optimization + >>> pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config) + >>> # remove following line if xformers is not installed + >>> pipe.enable_xformers_memory_efficient_attention() + + >>> pipe.enable_model_cpu_offload() + + >>> # generate image + >>> generator = torch.manual_seed(0) + >>> image = pipe( + ... "futuristic-looking woman", num_inference_steps=20, generator=generator, image=canny_image + ... 
).images[0] ``` """ From ec4fc3a5424cc1e9ecc94da1e0c2320d21c4bdf7 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 2 Mar 2023 14:07:45 +0000 Subject: [PATCH 120/122] upload model --- .../stable_diffusion/pipeline_stable_diffusion_controlnet.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py index 7917f61789c1..5ff2c2660f46 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py @@ -45,6 +45,11 @@ >>> # !pip install opencv-python transformers accelerate >>> from diffusers import StableDiffusionControlNetPipeline >>> from diffusers.utils import load_image + >>> import numpy as np + >>> import torch + + >>> import cv2 + >>> from PIL import Image >>> # download an image >>> image = load_image( From b010e3c6d674a165a2068b5bbb25915c61317ea4 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 2 Mar 2023 14:13:34 +0000 Subject: [PATCH 121/122] up --- .../stable_diffusion/pipeline_stable_diffusion_controlnet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py index 5ff2c2660f46..15218b842ba0 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py @@ -43,7 +43,7 @@ Examples: ```py >>> # !pip install opencv-python transformers accelerate - >>> from diffusers import StableDiffusionControlNetPipeline + >>> from diffusers import StableDiffusionControlNetPipeline, ControlNetModel >>> from diffusers.utils import load_image >>> import numpy as np >>> import torch From 547ba02f883d9c523622037e43954582cc645bdd Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 2 Mar 2023 14:14:53 +0000 Subject: [PATCH 122/122] up --- .../stable_diffusion/pipeline_stable_diffusion_controlnet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py index 15218b842ba0..00f35ee4d021 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py @@ -43,7 +43,7 @@ Examples: ```py >>> # !pip install opencv-python transformers accelerate - >>> from diffusers import StableDiffusionControlNetPipeline, ControlNetModel + >>> from diffusers import StableDiffusionControlNetPipeline, ControlNetModel, UniPCMultistepScheduler >>> from diffusers.utils import load_image >>> import numpy as np >>> import torch