From 5a478cf3c5a4f80ba8782a3fd14833a376b7aa29 Mon Sep 17 00:00:00 2001 From: Nathaniel Herman Date: Sat, 19 Nov 2022 07:04:50 +0000 Subject: [PATCH 01/16] Repaint pipeline --- src/diffusers/pipelines/__init__.py | 1 + .../pipelines/stable_diffusion/__init__.py | 1 + .../pipeline_stable_diffusion_repaint.py | 606 ++++++++++++++++++ .../schedulers/scheduling_repaint.py | 3 +- 4 files changed, 610 insertions(+), 1 deletion(-) create mode 100644 src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_repaint.py diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index 3ca66b28b5f1..8a20c00b052a 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -23,6 +23,7 @@ StableDiffusionInpaintPipeline, StableDiffusionInpaintPipelineLegacy, StableDiffusionPipeline, + StableDiffusionRepaintPipeline, ) from .vq_diffusion import VQDiffusionPipeline diff --git a/src/diffusers/pipelines/stable_diffusion/__init__.py b/src/diffusers/pipelines/stable_diffusion/__init__.py index 6623929f8648..c2253adac008 100644 --- a/src/diffusers/pipelines/stable_diffusion/__init__.py +++ b/src/diffusers/pipelines/stable_diffusion/__init__.py @@ -32,6 +32,7 @@ class StableDiffusionPipelineOutput(BaseOutput): from .pipeline_stable_diffusion import StableDiffusionPipeline from .pipeline_stable_diffusion_img2img import StableDiffusionImg2ImgPipeline from .pipeline_stable_diffusion_inpaint import StableDiffusionInpaintPipeline + from .pipeline_stable_diffusion_repaint import StableDiffusionRepaintPipeline from .pipeline_stable_diffusion_inpaint_legacy import StableDiffusionInpaintPipelineLegacy from .safety_checker import StableDiffusionSafetyChecker diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_repaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_repaint.py new file mode 100644 index 000000000000..bbe9f9cdf363 --- /dev/null +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_repaint.py @@ -0,0 +1,606 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from typing import Callable, List, Optional, Union + +import numpy as np +import torch + +import PIL +from diffusers.utils import is_accelerate_available +from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer + +from ...configuration_utils import FrozenDict +from ...models import AutoencoderKL, UNet2DConditionModel +from ...pipeline_utils import DiffusionPipeline +from ...schedulers import ( + DDIMScheduler, + DPMSolverMultistepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, +) +from ...utils import deprecate, logging +from . 
import StableDiffusionPipelineOutput +from .safety_checker import StableDiffusionSafetyChecker + + +logger = logging.get_logger(__name__) + + +def preprocess_image(image): + w, h = image.size + w, h = map(lambda x: x - x % 32, (w, h)) # resize to integer multiple of 32 + image = image.resize((w, h), resample=PIL.Image.LANCZOS) + image = np.array(image).astype(np.float32) / 255.0 + image = image[None].transpose(0, 3, 1, 2) + image = torch.from_numpy(image) + return 2.0 * image - 1.0 + + +def preprocess_mask(mask): + mask = mask.convert("L") + w, h = mask.size + w, h = map(lambda x: x - x % 32, (w, h)) # resize to integer multiple of 32 + mask = mask.resize((w // 8, h // 8), resample=PIL.Image.NEAREST) + mask = np.array(mask).astype(np.float32) / 255.0 + mask = np.tile(mask, (4, 1, 1)) + mask = mask[None].transpose(0, 1, 2, 3) # what does this step do? + mask = 1 - mask # repaint white, keep black + mask = torch.from_numpy(mask) + return mask + + +class StableDiffusionRepaintPipeline(DiffusionPipeline): + r""" + Pipeline for text-guided image inpainting using Stable Diffusion. *This is an experimental feature*. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`CLIPTextModel`]): + Frozen text-encoder. Stable Diffusion uses the text portion of + [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically + the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. + tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + safety_checker ([`StableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offensive or harmful. + Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. + feature_extractor ([`CLIPFeatureExtractor`]): + Model that extracts features from generated images to be used as inputs for the `safety_checker`. + """ + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.__init__ + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: Union[ + DDIMScheduler, + PNDMScheduler, + LMSDiscreteScheduler, + EulerDiscreteScheduler, + EulerAncestralDiscreteScheduler, + DPMSolverMultistepScheduler, + ], + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPFeatureExtractor, + ): + super().__init__() + + if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: + deprecation_message = ( + f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" + f" should be set to 1 instead of {scheduler.config.steps_offset}. 
Please make sure " + "to update the config accordingly as leaving `steps_offset` might led to incorrect results" + " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," + " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" + " file" + ) + deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(scheduler.config) + new_config["steps_offset"] = 1 + scheduler._internal_dict = FrozenDict(new_config) + + if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True: + deprecation_message = ( + f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`." + " `clip_sample` should be set to False in the configuration file. Please make sure to update the" + " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in" + " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very" + " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file" + ) + deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(scheduler.config) + new_config["clip_sample"] = False + scheduler._internal_dict = FrozenDict(new_config) + + if safety_checker is None: + logger.warn( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + ) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_attention_slicing + def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"): + r""" + Enable sliced attention computation. + + When this option is enabled, the attention module will split the input tensor in slices, to compute attention + in several steps. This is useful to save some memory in exchange for a small speed decrease. + + Args: + slice_size (`str` or `int`, *optional*, defaults to `"auto"`): + When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If + a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case, + `attention_head_dim` must be a multiple of `slice_size`. + """ + if slice_size == "auto": + # half the attention head size is usually a good trade-off between + # speed and memory + slice_size = self.unet.config.attention_head_dim // 2 + self.unet.set_attention_slice(slice_size) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_attention_slicing + def disable_attention_slicing(self): + r""" + Disable sliced attention computation. 
If `enable_attention_slicing` was previously invoked, this method will go + back to computing attention in one step. + """ + # set slice_size = `None` to disable `attention slicing` + self.enable_attention_slicing(None) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_sequential_cpu_offload + def enable_sequential_cpu_offload(self): + r""" + Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet, + text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a + `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called. + """ + if is_accelerate_available(): + from accelerate import cpu_offload + else: + raise ImportError("Please install accelerate via `pip install accelerate`") + + device = torch.device("cuda") + + for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae, self.safety_checker]: + if cpu_offloaded_model is not None: + cpu_offload(cpu_offloaded_model, device) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_xformers_memory_efficient_attention + def enable_xformers_memory_efficient_attention(self): + r""" + Enable memory efficient attention as implemented in xformers. + + When this option is enabled, you should observe lower GPU memory usage and a potential speed up at inference + time. Speed up at training time is not guaranteed. + + Warning: When Memory Efficient Attention and Sliced attention are both enabled, the Memory Efficient Attention + is used. + """ + self.unet.set_use_memory_efficient_attention_xformers(True) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_xformers_memory_efficient_attention + def disable_xformers_memory_efficient_attention(self): + r""" + Disable memory efficient attention as implemented in xformers. + """ + self.unet.set_use_memory_efficient_attention_xformers(False) + + @property + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device + def _execution_device(self): + r""" + Returns the device on which the pipeline's models will be executed. After calling + `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module + hooks. + """ + if self.device != torch.device("meta") or not hasattr(self.unet, "_hf_hook"): + return self.device + for module in self.unet.modules(): + if ( + hasattr(module, "_hf_hook") + and hasattr(module._hf_hook, "execution_device") + and module._hf_hook.execution_device is not None + ): + return torch.device(module._hf_hook.execution_device) + return self.device + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt + def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `list(int)`): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`): + The prompt or prompts not to guide the image generation. 
Ignored when not using guidance (i.e., ignored + if `guidance_scale` is less than `1`). + """ + batch_size = len(prompt) if isinstance(prompt, list) else 1 + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="max_length", return_tensors="pt").input_ids + + if not torch.equal(text_input_ids, untruncated_ids): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + text_embeddings = self.text_encoder( + text_input_ids.to(device), + attention_mask=attention_mask, + ) + text_embeddings = text_embeddings[0] + + # duplicate text embeddings for each generation per prompt, using mps friendly method + bs_embed, seq_len, _ = text_embeddings.shape + text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1) + text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + max_length = text_input_ids.shape[-1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + uncond_embeddings = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + uncond_embeddings = uncond_embeddings[0] + + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = uncond_embeddings.shape[1] + uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1) + uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1) + + # For classifier free guidance, we need to do two forward passes. 
+ # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + text_embeddings = torch.cat([uncond_embeddings, text_embeddings]) + + return text_embeddings + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is not None: + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) + else: + has_nsfw_concept = None + return image, has_nsfw_concept + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents + def decode_latents(self, latents): + latents = 1 / 0.18215 * latents + image = self.vae.decode(latents).sample + image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + return image + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.check_inputs + def check_inputs(self, prompt, strength, callback_steps): + if not isinstance(prompt, str) and not isinstance(prompt, list): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if strength < 0 or strength > 1: + raise ValueError(f"The value of strength should in [1.0, 1.0] but is {strength}") + + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps + def get_timesteps(self, num_inference_steps, strength, device): + # get the original timestep using init_timestep + # TODO: steps_offset is usually 1, so this effectively cuts the first step out when strength=1.0, is that desired? 
(for inpaint/img2img) + offset = self.scheduler.config.get("steps_offset", 0) + + init_timestep = int(num_inference_steps * strength) + offset + init_timestep = min(init_timestep, num_inference_steps) + + t_start = max(num_inference_steps - init_timestep + offset, 0) + + timesteps = self.scheduler.timesteps[t_start:] + + return timesteps + + def prepare_latents(self, init_image, timestep, batch_size, num_images_per_prompt, dtype, device, generator): + init_image = init_image.to(device=self.device, dtype=dtype) + init_latent_dist = self.vae.encode(init_image).latent_dist + init_latents = init_latent_dist.sample(generator=generator) + init_latents = 0.18215 * init_latents + + # Expand init_latents for batch_size and num_images_per_prompt + init_latents = torch.cat([init_latents] * batch_size * num_images_per_prompt, dim=0) + init_latents_orig = init_latents + + # initialize noise in the shape of the latent space + noise = torch.randn(init_latents.shape, generator=generator, device=self.device, dtype=dtype) + latents = torch.cat([noise * self.scheduler.init_noise_sigma] * batch_size * num_images_per_prompt, dim=0) + return latents, init_latents_orig + + @torch.no_grad() + def __call__( + self, + prompt: Union[str, List[str]], + init_image: Union[torch.FloatTensor, PIL.Image.Image], + mask_image: Union[torch.FloatTensor, PIL.Image.Image], + num_inference_steps: Optional[int] = 50, + jump_length: Optional[int] = 10, + jump_n_sample: Optional[int] = 10, + guidance_scale: Optional[float] = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: Optional[float] = 0.0, + generator: Optional[torch.Generator] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: Optional[int] = 1, + **kwargs, + ): + r""" + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`): + The prompt or prompts to guide the image generation. + init_image (`torch.FloatTensor` or `PIL.Image.Image`): + `Image`, or tensor representing an image batch, that will be used as the starting point for the + process. This is the image whose masked region will be inpainted. + mask_image (`torch.FloatTensor` or `PIL.Image.Image`): + `Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be + replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a + PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should + contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`. + strength (`float`, *optional*, defaults to 0.8): + Conceptually, indicates how much to inpaint the masked area. Must be between 0 and 1. When `strength` + is 1, the denoising process will be run on the masked area for the full number of iterations specified + in `num_inference_steps`. `init_image` will be used as a reference for the masked area, adding more + noise to that region the larger the `strength`. If `strength` is 0, no inpainting will occur. + num_inference_steps (`int`, *optional*, defaults to 50): + The reference number of denoising steps. More denoising steps usually lead to a higher quality image at + the expense of slower inference. This parameter will be modulated by `strength`, as explained above. 
+ jump_length (`int`, *optional*, defaults to 10): + The number of steps taken forward in time before going backward in time for a single jump ("j" in + RePaint paper). Take a look at Figure 9 and 10 in https://arxiv.org/pdf/2201.09865.pdf. + jump_n_sample (`int`, *optional*, defaults to 10): + The number of times we will make forward time jump for a given chosen time sample. Take a look at + Figure 9 and 10 in https://arxiv.org/pdf/2201.09865.pdf. + guidance_scale (`float`, *optional*, defaults to 7.5): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored + if `guidance_scale` is less than `1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. + generator (`torch.Generator`, *optional*): + A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation + deterministic. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + callback (`Callable`, *optional*): + A function that will be called every `callback_steps` steps during inference. The function will be + called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function will be called. If not specified, the callback will be + called at every step. + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. + When returning a tuple, the first element is a list with the generated images, and the second element is a + list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" + (nsfw) content, according to the `safety_checker`. + """ + # 1. Check inputs + self.check_inputs(prompt, strength, callback_steps) + + # 2. Define call parameters + batch_size = 1 if isinstance(prompt, str) else len(prompt) + device = self._execution_device + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. Encode input prompt + text_embeddings = self._encode_prompt( + prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt + ) + + # 4. 
Preprocess image and mask + if not isinstance(init_image, torch.FloatTensor): + init_image = preprocess_image(init_image) + + if not isinstance(mask_image, torch.FloatTensor): + mask_image = preprocess_mask(mask_image) + + # 5. set timesteps + self.scheduler.set_timesteps(num_inference_steps, jump_length, jump_n_sample, self.device) + self.scheduler.eta = eta + + timesteps = self.get_timesteps(num_inference_steps, 1.0, device) + latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) + + # 6. Prepare latent variables + # encode the init image into latents and scale the latents + latents, init_latents_orig = self.prepare_latents( + init_image, latent_timestep, batch_size, num_images_per_prompt, text_embeddings.dtype, device, generator + ) + + # 7. Prepare mask latent + mask = mask_image.to(device=self.device, dtype=latents.dtype) + mask = torch.cat([mask] * batch_size * num_images_per_prompt) + + t_last = timesteps[0] + 1 + # 9. Denoising loop + for i, t in enumerate(self.progress_bar(timesteps)): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + if t >= t_last: + # compute the reverse: x_t-1 -> x_t + latents = self.scheduler.undo_step(latent_model_input, t_last, generator) + t_last = t + continue + + # predict the noise residual + noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, init_latents_orig, mask, generator).prev_sample + + # call the callback, if provided + if callback is not None and i % callback_steps == 0: + callback(i, t, latents) + + t_last = t + + # 10. Post-processing + image = self.decode_latents(latents) + + # 11. Run safety checker + image, has_nsfw_concept = self.run_safety_checker(image, device, text_embeddings.dtype) + + # 12. Convert to PIL + if output_type == "pil": + image = self.numpy_to_pil(image) + + if not return_dict: + return (image, has_nsfw_concept) + + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/src/diffusers/schedulers/scheduling_repaint.py b/src/diffusers/schedulers/scheduling_repaint.py index 55625c1bfa92..f90749d9c96f 100644 --- a/src/diffusers/schedulers/scheduling_repaint.py +++ b/src/diffusers/schedulers/scheduling_repaint.py @@ -190,6 +190,7 @@ def set_timesteps( timesteps = np.array(timesteps) * (self.config.num_train_timesteps // self.num_inference_steps) self.timesteps = torch.from_numpy(timesteps).to(device) + self.timesteps += self.config.steps_offset def _get_variance(self, t): prev_timestep = t - self.config.num_train_timesteps // self.num_inference_steps @@ -303,7 +304,7 @@ def undo_step(self, sample, timestep, generator=None): for i in range(n): beta = self.betas[timestep + i] - noise = torch.randn(sample.shape, generator=generator, device=sample.device) + noise = torch.randn(sample.shape, generator=generator, device=sample.device, dtype=sample.dtype) # 10. 
Algorithm 1 Line 10 https://arxiv.org/pdf/2201.09865.pdf sample = (1 - beta) ** 0.5 * sample + beta**0.5 * noise From 17fd219e959639ccc4fd65fb0f7a430746a683d6 Mon Sep 17 00:00:00 2001 From: Nathaniel Herman Date: Thu, 8 Dec 2022 10:46:02 -0800 Subject: [PATCH 02/16] Update src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_repaint.py Co-authored-by: Anton Lozhkov --- .../stable_diffusion/pipeline_stable_diffusion_repaint.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_repaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_repaint.py index bbe9f9cdf363..46146ee32cf1 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_repaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_repaint.py @@ -91,6 +91,7 @@ class StableDiffusionRepaintPipeline(DiffusionPipeline): feature_extractor ([`CLIPFeatureExtractor`]): Model that extracts features from generated images to be used as inputs for the `safety_checker`. """ + _optional_components = ["safety_checker", "feature_extractor"] # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.__init__ def __init__( From a696c14bac3b2dea4dd83687680d1640cdeca75b Mon Sep 17 00:00:00 2001 From: Nathaniel Herman Date: Thu, 8 Dec 2022 10:46:15 -0800 Subject: [PATCH 03/16] Update src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_repaint.py Co-authored-by: Anton Lozhkov --- .../stable_diffusion/pipeline_stable_diffusion_repaint.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_repaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_repaint.py index 46146ee32cf1..4234d1a66187 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_repaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_repaint.py @@ -51,6 +51,7 @@ def preprocess_image(image): return 2.0 * image - 1.0 +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint_legacy.preprocess_mask def preprocess_mask(mask): mask = mask.convert("L") w, h = mask.size From 80737f43c0e3b14c00e369f2781dee065f106114 Mon Sep 17 00:00:00 2001 From: Nathaniel Herman Date: Thu, 8 Dec 2022 10:47:21 -0800 Subject: [PATCH 04/16] Update src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_repaint.py Co-authored-by: Anton Lozhkov --- .../stable_diffusion/pipeline_stable_diffusion_repaint.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_repaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_repaint.py index 4234d1a66187..424e75c31997 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_repaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_repaint.py @@ -455,7 +455,6 @@ def __call__( return_dict: bool = True, callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, callback_steps: Optional[int] = 1, - **kwargs, ): r""" Function invoked when calling the pipeline for generation. 
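For reviewers who want to try the branch, here is a minimal usage sketch of the pipeline as it stands after the patches above. The checkpoint id, file names, and the explicit RePaintScheduler swap are illustrative assumptions, not part of this PR: the denoising loop calls scheduler.set_timesteps(num_inference_steps, jump_length, jump_n_sample, device) and scheduler.undo_step(...), which match the RePaint scheduler's interface rather than the schedulers listed in the __init__ type hints.

import torch
import PIL.Image

from diffusers import RePaintScheduler, StableDiffusionRepaintPipeline

# Checkpoint and file names below are placeholders for illustration only.
pipe = StableDiffusionRepaintPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None)
# Swap in the RePaint scheduler, whose set_timesteps/undo_step the pipeline relies on.
pipe.scheduler = RePaintScheduler.from_config(pipe.scheduler.config)
pipe = pipe.to("cuda")

init_image = PIL.Image.open("bench.png").convert("RGB").resize((512, 512))
mask_image = PIL.Image.open("bench_mask.png").convert("L").resize((512, 512))  # white = repaint, black = keep

generator = torch.Generator(device="cuda").manual_seed(0)
result = pipe(
    prompt="A red cat sitting on a park bench",
    init_image=init_image,  # renamed to `image` later in this series (PATCH 09/16)
    mask_image=mask_image,
    num_inference_steps=50,
    jump_length=10,    # forward jump size, "j" in the RePaint paper
    jump_n_sample=10,  # resampling passes per jump point
    guidance_scale=7.5,
    generator=generator,
)
result.images[0].save("repainted.png")
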
From e9890e9079409de29830f3bd576c9b18f4b189f9 Mon Sep 17 00:00:00 2001 From: Nathaniel Herman Date: Thu, 8 Dec 2022 10:47:31 -0800 Subject: [PATCH 05/16] Update src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_repaint.py Co-authored-by: Anton Lozhkov --- .../stable_diffusion/pipeline_stable_diffusion_repaint.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_repaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_repaint.py index 424e75c31997..c88ce10391ea 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_repaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_repaint.py @@ -41,6 +41,7 @@ logger = logging.get_logger(__name__) +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint_legacy.preprocess_image def preprocess_image(image): w, h = image.size w, h = map(lambda x: x - x % 32, (w, h)) # resize to integer multiple of 32 From a01b16a8694c523a174ef0dc63f220d27f5baa8e Mon Sep 17 00:00:00 2001 From: Nathaniel Herman Date: Thu, 8 Dec 2022 18:53:19 +0000 Subject: [PATCH 06/16] . --- .../stable_diffusion/pipeline_stable_diffusion_repaint.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_repaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_repaint.py index c88ce10391ea..27915f648fef 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_repaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_repaint.py @@ -411,14 +411,11 @@ def check_inputs(self, prompt, strength, callback_steps): # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps def get_timesteps(self, num_inference_steps, strength, device): # get the original timestep using init_timestep - # TODO: steps_offset is usually 1, so this effectively cuts the first step out when strength=1.0, is that desired? 
(for inpaint/img2img) offset = self.scheduler.config.get("steps_offset", 0) - init_timestep = int(num_inference_steps * strength) + offset init_timestep = min(init_timestep, num_inference_steps) t_start = max(num_inference_steps - init_timestep + offset, 0) - timesteps = self.scheduler.timesteps[t_start:] return timesteps From 269bcb195cbccf571c0a676a022df3143cce8e5c Mon Sep 17 00:00:00 2001 From: Nathaniel Herman Date: Thu, 8 Dec 2022 19:13:50 +0000 Subject: [PATCH 07/16] fix bug + rm strength --- .../pipeline_stable_diffusion_repaint.py | 38 +++++-------------- 1 file changed, 9 insertions(+), 29 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_repaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_repaint.py index 27915f648fef..e01245b5ee34 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_repaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_repaint.py @@ -393,13 +393,10 @@ def prepare_extra_step_kwargs(self, generator, eta): return extra_step_kwargs # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.check_inputs - def check_inputs(self, prompt, strength, callback_steps): + def check_inputs(self, prompt, callback_steps): if not isinstance(prompt, str) and not isinstance(prompt, list): raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - if strength < 0 or strength > 1: - raise ValueError(f"The value of strength should in [1.0, 1.0] but is {strength}") - if (callback_steps is None) or ( callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) ): @@ -408,18 +405,6 @@ def check_inputs(self, prompt, strength, callback_steps): f" {type(callback_steps)}." ) - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps - def get_timesteps(self, num_inference_steps, strength, device): - # get the original timestep using init_timestep - offset = self.scheduler.config.get("steps_offset", 0) - init_timestep = int(num_inference_steps * strength) + offset - init_timestep = min(init_timestep, num_inference_steps) - - t_start = max(num_inference_steps - init_timestep + offset, 0) - timesteps = self.scheduler.timesteps[t_start:] - - return timesteps - def prepare_latents(self, init_image, timestep, batch_size, num_images_per_prompt, dtype, device, generator): init_image = init_image.to(device=self.device, dtype=dtype) init_latent_dist = self.vae.encode(init_image).latent_dist @@ -468,14 +453,9 @@ def __call__( replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`. - strength (`float`, *optional*, defaults to 0.8): - Conceptually, indicates how much to inpaint the masked area. Must be between 0 and 1. When `strength` - is 1, the denoising process will be run on the masked area for the full number of iterations specified - in `num_inference_steps`. `init_image` will be used as a reference for the masked area, adding more - noise to that region the larger the `strength`. If `strength` is 0, no inpainting will occur. num_inference_steps (`int`, *optional*, defaults to 50): The reference number of denoising steps. 
More denoising steps usually lead to a higher quality image at - the expense of slower inference. This parameter will be modulated by `strength`, as explained above. + the expense of slower inference. jump_length (`int`, *optional*, defaults to 10): The number of steps taken forward in time before going backward in time for a single jump ("j" in RePaint paper). Take a look at Figure 9 and 10 in https://arxiv.org/pdf/2201.09865.pdf. @@ -520,7 +500,7 @@ def __call__( (nsfw) content, according to the `safety_checker`. """ # 1. Check inputs - self.check_inputs(prompt, strength, callback_steps) + self.check_inputs(prompt, callback_steps) # 2. Define call parameters batch_size = 1 if isinstance(prompt, str) else len(prompt) @@ -546,7 +526,7 @@ def __call__( self.scheduler.set_timesteps(num_inference_steps, jump_length, jump_n_sample, self.device) self.scheduler.eta = eta - timesteps = self.get_timesteps(num_inference_steps, 1.0, device) + timesteps = self.timesteps latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) # 6. Prepare latent variables @@ -562,16 +542,16 @@ def __call__( t_last = timesteps[0] + 1 # 9. Denoising loop for i, t in enumerate(self.progress_bar(timesteps)): - # expand the latents if we are doing classifier free guidance - latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - if t >= t_last: # compute the reverse: x_t-1 -> x_t - latents = self.scheduler.undo_step(latent_model_input, t_last, generator) + latents = self.scheduler.undo_step(latents, t_last, generator) t_last = t continue + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + # predict the noise residual noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample From 9cb5d4414a4925a229c57587de58aab3d644de61 Mon Sep 17 00:00:00 2001 From: Nathaniel Herman Date: Thu, 8 Dec 2022 19:24:27 +0000 Subject: [PATCH 08/16] run check_copies.py --- .../pipeline_stable_diffusion_repaint.py | 95 ++++++++----------- 1 file changed, 40 insertions(+), 55 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_repaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_repaint.py index e01245b5ee34..b0e844c20692 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_repaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_repaint.py @@ -45,7 +45,7 @@ def preprocess_image(image): w, h = image.size w, h = map(lambda x: x - x % 32, (w, h)) # resize to integer multiple of 32 - image = image.resize((w, h), resample=PIL.Image.LANCZOS) + image = image.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]) image = np.array(image).astype(np.float32) / 255.0 image = image[None].transpose(0, 3, 1, 2) image = torch.from_numpy(image) @@ -57,7 +57,7 @@ def preprocess_mask(mask): mask = mask.convert("L") w, h = mask.size w, h = map(lambda x: x - x % 32, (w, h)) # resize to integer multiple of 32 - mask = mask.resize((w // 8, h // 8), resample=PIL.Image.NEAREST) + mask = mask.resize((w // scale_factor, h // scale_factor), resample=PIL_INTERPOLATION["nearest"]) mask = np.array(mask).astype(np.float32) / 255.0 mask = np.tile(mask, (4, 1, 1)) mask = mask[None].transpose(0, 1, 2, 3) # what does 
this step do? @@ -112,6 +112,7 @@ def __init__( ], safety_checker: StableDiffusionSafetyChecker, feature_extractor: CLIPFeatureExtractor, + requires_safety_checker: bool = True, ): super().__init__() @@ -142,8 +143,8 @@ def __init__( new_config["clip_sample"] = False scheduler._internal_dict = FrozenDict(new_config) - if safety_checker is None: - logger.warn( + if safety_checker is None and requires_safety_checker: + logger.warning( f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" " results in services or applications open to the public. Both the diffusers team and Hugging Face" @@ -152,6 +153,33 @@ def __init__( " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." ) + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." + ) + + is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse( + version.parse(unet.config._diffusers_version).base_version + ) < version.parse("0.9.0.dev0") + is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 + if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: + deprecation_message = ( + "The configuration file of the unet has set the default `sample_size` to smaller than" + " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the" + " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-" + " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5" + " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the" + " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`" + " in the config might lead to incorrect results in future versions. If you have downloaded this" + " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" + " the `unet/config.json` file" + ) + deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(unet.config) + new_config["sample_size"] = 64 + unet._internal_dict = FrozenDict(new_config) + self.register_modules( vae=vae, text_encoder=text_encoder, @@ -161,35 +189,8 @@ def __init__( safety_checker=safety_checker, feature_extractor=feature_extractor, ) - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_attention_slicing - def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"): - r""" - Enable sliced attention computation. - - When this option is enabled, the attention module will split the input tensor in slices, to compute attention - in several steps. This is useful to save some memory in exchange for a small speed decrease. - - Args: - slice_size (`str` or `int`, *optional*, defaults to `"auto"`): - When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If - a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case, - `attention_head_dim` must be a multiple of `slice_size`. 
- """ - if slice_size == "auto": - # half the attention head size is usually a good trade-off between - # speed and memory - slice_size = self.unet.config.attention_head_dim // 2 - self.unet.set_attention_slice(slice_size) - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_attention_slicing - def disable_attention_slicing(self): - r""" - Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go - back to computing attention in one step. - """ - # set slice_size = `None` to disable `attention slicing` - self.enable_attention_slicing(None) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.register_to_config(requires_safety_checker=requires_safety_checker) # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_sequential_cpu_offload def enable_sequential_cpu_offload(self): @@ -203,31 +204,16 @@ def enable_sequential_cpu_offload(self): else: raise ImportError("Please install accelerate via `pip install accelerate`") - device = torch.device("cuda") + device = torch.device(f"cuda:{gpu_id}") - for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae, self.safety_checker]: + for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]: if cpu_offloaded_model is not None: cpu_offload(cpu_offloaded_model, device) - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_xformers_memory_efficient_attention - def enable_xformers_memory_efficient_attention(self): - r""" - Enable memory efficient attention as implemented in xformers. - - When this option is enabled, you should observe lower GPU memory usage and a potential speed up at inference - time. Speed up at training time is not guaranteed. - - Warning: When Memory Efficient Attention and Sliced attention are both enabled, the Memory Efficient Attention - is used. - """ - self.unet.set_use_memory_efficient_attention_xformers(True) - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_xformers_memory_efficient_attention - def disable_xformers_memory_efficient_attention(self): - r""" - Disable memory efficient attention as implemented in xformers. 
- """ - self.unet.set_use_memory_efficient_attention_xformers(False) + if self.safety_checker is not None: + # TODO(Patrick) - there is currently a bug with cpu offload of nn.Parameter in accelerate + # fix by only offloading self.safety_checker for now + cpu_offload(self.safety_checker.vision_model, device) @property # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device @@ -392,7 +378,6 @@ def prepare_extra_step_kwargs(self, generator, eta): extra_step_kwargs["generator"] = generator return extra_step_kwargs - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.check_inputs def check_inputs(self, prompt, callback_steps): if not isinstance(prompt, str) and not isinstance(prompt, list): raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") From 4cdec74c661b7256851e49f461e1c18730f518df Mon Sep 17 00:00:00 2001 From: Nathaniel Herman Date: Thu, 8 Dec 2022 19:51:51 +0000 Subject: [PATCH 09/16] rename init_image to image --- .../pipeline_stable_diffusion_repaint.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_repaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_repaint.py index b0e844c20692..0e05639b1fc5 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_repaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_repaint.py @@ -409,7 +409,7 @@ def prepare_latents(self, init_image, timestep, batch_size, num_images_per_promp def __call__( self, prompt: Union[str, List[str]], - init_image: Union[torch.FloatTensor, PIL.Image.Image], + image: Union[torch.FloatTensor, PIL.Image.Image], mask_image: Union[torch.FloatTensor, PIL.Image.Image], num_inference_steps: Optional[int] = 50, jump_length: Optional[int] = 10, @@ -430,11 +430,11 @@ def __call__( Args: prompt (`str` or `List[str]`): The prompt or prompts to guide the image generation. - init_image (`torch.FloatTensor` or `PIL.Image.Image`): + image (`torch.FloatTensor` or `PIL.Image.Image`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. This is the image whose masked region will be inpainted. mask_image (`torch.FloatTensor` or `PIL.Image.Image`): - `Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be + `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`. @@ -501,8 +501,8 @@ def __call__( ) # 4. Preprocess image and mask - if not isinstance(init_image, torch.FloatTensor): - init_image = preprocess_image(init_image) + if not isinstance(image, torch.FloatTensor): + image = preprocess_image(image) if not isinstance(mask_image, torch.FloatTensor): mask_image = preprocess_mask(mask_image) @@ -517,7 +517,7 @@ def __call__( # 6. 
Prepare latent variables # encode the init image into latents and scale the latents latents, init_latents_orig = self.prepare_latents( - init_image, latent_timestep, batch_size, num_images_per_prompt, text_embeddings.dtype, device, generator + image, latent_timestep, batch_size, num_images_per_prompt, text_embeddings.dtype, device, generator ) # 7. Prepare mask latent From 41833a5488c6867e1a31fa49e162d4b2c41d2664 Mon Sep 17 00:00:00 2001 From: Nathaniel Herman Date: Thu, 8 Dec 2022 19:52:22 +0000 Subject: [PATCH 10/16] add test file --- .../test_stable_diffusion_repaint.py | 485 ++++++++++++++++++ 1 file changed, 485 insertions(+) create mode 100644 tests/pipelines/stable_diffusion/test_stable_diffusion_repaint.py diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_repaint.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_repaint.py new file mode 100644 index 000000000000..48798edfe6e0 --- /dev/null +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_repaint.py @@ -0,0 +1,485 @@ +# coding=utf-8 +# Copyright 2022 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import gc +import random +import unittest + +import numpy as np +import torch + +from diffusers import ( + AutoencoderKL, + LMSDiscreteScheduler, + PNDMScheduler, + StableDiffusionInpaintPipeline, + StableDiffusionInpaintPipelineLegacy, + StableDiffusionRepaintPipeline, + UNet2DConditionModel, + UNet2DModel, + VQModel, +) +from diffusers.utils import floats_tensor, load_image, slow, torch_device +from diffusers.utils.testing_utils import load_numpy, require_torch_gpu +from PIL import Image +from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer + + +torch.backends.cuda.matmul.allow_tf32 = False + + +class StableDiffusionRepaintPipelineFastTests(unittest.TestCase): + def tearDown(self): + # clean up the VRAM after each test + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + @property + def dummy_image(self): + batch_size = 1 + num_channels = 3 + sizes = (32, 32) + + image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)).to(torch_device) + return image + + @property + def dummy_uncond_unet(self): + torch.manual_seed(0) + model = UNet2DModel( + block_out_channels=(32, 64), + layers_per_block=2, + sample_size=32, + in_channels=3, + out_channels=3, + down_block_types=("DownBlock2D", "AttnDownBlock2D"), + up_block_types=("AttnUpBlock2D", "UpBlock2D"), + ) + return model + + @property + def dummy_cond_unet(self): + torch.manual_seed(0) + model = UNet2DConditionModel( + block_out_channels=(32, 64), + layers_per_block=2, + sample_size=32, + in_channels=4, + out_channels=4, + down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), + up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), + cross_attention_dim=32, + ) + return model + + @property + def dummy_cond_unet_inpaint(self): + torch.manual_seed(0) + model = UNet2DConditionModel( + block_out_channels=(32, 64), + layers_per_block=2, + sample_size=32, + in_channels=9, + out_channels=4, + 
down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), + up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), + cross_attention_dim=32, + ) + return model + + @property + def dummy_vq_model(self): + torch.manual_seed(0) + model = VQModel( + block_out_channels=[32, 64], + in_channels=3, + out_channels=3, + down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], + up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], + latent_channels=3, + ) + return model + + @property + def dummy_vae(self): + torch.manual_seed(0) + model = AutoencoderKL( + block_out_channels=[32, 64], + in_channels=3, + out_channels=3, + down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], + up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], + latent_channels=4, + ) + return model + + @property + def dummy_text_encoder(self): + torch.manual_seed(0) + config = CLIPTextConfig( + bos_token_id=0, + eos_token_id=2, + hidden_size=32, + intermediate_size=37, + layer_norm_eps=1e-05, + num_attention_heads=4, + num_hidden_layers=5, + pad_token_id=1, + vocab_size=1000, + ) + return CLIPTextModel(config) + + @property + def dummy_extractor(self): + def extract(*args, **kwargs): + class Out: + def __init__(self): + self.pixel_values = torch.ones([0]) + + def to(self, device): + self.pixel_values.to(device) + return self + + return Out() + + return extract + + def test_stable_diffusion_inpaint_legacy(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + unet = self.dummy_cond_unet + scheduler = PNDMScheduler(skip_prk_steps=True) + vae = self.dummy_vae + bert = self.dummy_text_encoder + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") + + image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0] + init_image = Image.fromarray(np.uint8(image)).convert("RGB") + mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((32, 32)) + + # make sure here that pndm scheduler skips prk + sd_pipe = StableDiffusionInpaintPipelineLegacy( + unet=unet, + scheduler=scheduler, + vae=vae, + text_encoder=bert, + tokenizer=tokenizer, + safety_checker=None, + feature_extractor=self.dummy_extractor, + ) + sd_pipe = sd_pipe.to(device) + sd_pipe.set_progress_bar_config(disable=None) + + prompt = "A painting of a squirrel eating a burger" + generator = torch.Generator(device=device).manual_seed(0) + output = sd_pipe( + [prompt], + generator=generator, + guidance_scale=6.0, + num_inference_steps=2, + output_type="np", + image=init_image, + mask_image=mask_image, + ) + + image = output.images + + generator = torch.Generator(device=device).manual_seed(0) + image_from_tuple = sd_pipe( + [prompt], + generator=generator, + guidance_scale=6.0, + num_inference_steps=2, + output_type="np", + image=init_image, + mask_image=mask_image, + return_dict=False, + )[0] + + image_slice = image[0, -3:, -3:, -1] + image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] + + assert image.shape == (1, 32, 32, 3) + expected_slice = np.array([0.4731, 0.5346, 0.4531, 0.6251, 0.5446, 0.4057, 0.5527, 0.5896, 0.5153]) + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 + + def test_stable_diffusion_inpaint_legacy_negative_prompt(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + unet = self.dummy_cond_unet + scheduler = PNDMScheduler(skip_prk_steps=True) + vae = self.dummy_vae + bert = self.dummy_text_encoder + tokenizer = 
CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") + + image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0] + init_image = Image.fromarray(np.uint8(image)).convert("RGB") + mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((32, 32)) + + # make sure here that pndm scheduler skips prk + sd_pipe = StableDiffusionInpaintPipelineLegacy( + unet=unet, + scheduler=scheduler, + vae=vae, + text_encoder=bert, + tokenizer=tokenizer, + safety_checker=None, + feature_extractor=self.dummy_extractor, + ) + sd_pipe = sd_pipe.to(device) + sd_pipe.set_progress_bar_config(disable=None) + + prompt = "A painting of a squirrel eating a burger" + negative_prompt = "french fries" + generator = torch.Generator(device=device).manual_seed(0) + output = sd_pipe( + prompt, + negative_prompt=negative_prompt, + generator=generator, + guidance_scale=6.0, + num_inference_steps=2, + output_type="np", + image=init_image, + mask_image=mask_image, + ) + + image = output.images + image_slice = image[0, -3:, -3:, -1] + + assert image.shape == (1, 32, 32, 3) + expected_slice = np.array([0.4765, 0.5339, 0.4541, 0.6240, 0.5439, 0.4055, 0.5503, 0.5891, 0.5150]) + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + + def test_stable_diffusion_inpaint_legacy_num_images_per_prompt(self): + device = "cpu" + unet = self.dummy_cond_unet + scheduler = PNDMScheduler(skip_prk_steps=True) + vae = self.dummy_vae + bert = self.dummy_text_encoder + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") + + image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0] + init_image = Image.fromarray(np.uint8(image)).convert("RGB") + mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((32, 32)) + + # make sure here that pndm scheduler skips prk + sd_pipe = StableDiffusionInpaintPipelineLegacy( + unet=unet, + scheduler=scheduler, + vae=vae, + text_encoder=bert, + tokenizer=tokenizer, + safety_checker=None, + feature_extractor=self.dummy_extractor, + ) + sd_pipe = sd_pipe.to(device) + sd_pipe.set_progress_bar_config(disable=None) + + prompt = "A painting of a squirrel eating a burger" + + # test num_images_per_prompt=1 (default) + images = sd_pipe( + prompt, + num_inference_steps=2, + output_type="np", + image=init_image, + mask_image=mask_image, + ).images + + assert images.shape == (1, 32, 32, 3) + + # test num_images_per_prompt=1 (default) for batch of prompts + batch_size = 2 + images = sd_pipe( + [prompt] * batch_size, + num_inference_steps=2, + output_type="np", + image=init_image, + mask_image=mask_image, + ).images + + assert images.shape == (batch_size, 32, 32, 3) + + # test num_images_per_prompt for single prompt + num_images_per_prompt = 2 + images = sd_pipe( + prompt, + num_inference_steps=2, + output_type="np", + image=init_image, + mask_image=mask_image, + num_images_per_prompt=num_images_per_prompt, + ).images + + assert images.shape == (num_images_per_prompt, 32, 32, 3) + + # test num_images_per_prompt for batch of prompts + batch_size = 2 + images = sd_pipe( + [prompt] * batch_size, + num_inference_steps=2, + output_type="np", + image=init_image, + mask_image=mask_image, + num_images_per_prompt=num_images_per_prompt, + ).images + + assert images.shape == (batch_size * num_images_per_prompt, 32, 32, 3) + + +@slow +@require_torch_gpu +class StableDiffusionRepaintPipelineIntegrationTests(unittest.TestCase): + def tearDown(self): + # clean up the VRAM after each test + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def 
test_stable_diffusion_repaint_pipeline(self): + init_image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + "/in_paint/overture-creations-5sI6fQgYIuo.png" + ) + mask_image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + "/in_paint/overture-creations-5sI6fQgYIuo_mask.png" + ) + expected_image = load_numpy( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/in_paint" + "/red_cat_sitting_on_a_park_bench.npy" + ) + + model_id = "CompVis/stable-diffusion-v1-4" + pipe = StableDiffusionRepaintPipeline.from_pretrained(model_id, safety_checker=None) + pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + pipe.enable_attention_slicing() + + prompt = "A red cat sitting on a park bench" + + generator = torch.Generator(device=torch_device).manual_seed(0) + output = pipe( + prompt=prompt, + image=init_image, + mask_image=mask_image, + guidance_scale=7.5, + generator=generator, + output_type="np", + ) + image = output.images[0] + + assert image.shape == (512, 512, 3) + assert np.abs(expected_image - image).max() < 1e-3 + + def test_stable_diffusion_inpaint_legacy_pipeline_k_lms(self): + init_image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + "/in_paint/overture-creations-5sI6fQgYIuo.png" + ) + mask_image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + "/in_paint/overture-creations-5sI6fQgYIuo_mask.png" + ) + expected_image = load_numpy( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/in_paint" + "/red_cat_sitting_on_a_park_bench_k_lms.npy" + ) + + model_id = "CompVis/stable-diffusion-v1-4" + lms = LMSDiscreteScheduler.from_pretrained(model_id, subfolder="scheduler") + pipe = StableDiffusionInpaintPipeline.from_pretrained( + model_id, + scheduler=lms, + safety_checker=None, + ) + pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + pipe.enable_attention_slicing() + + prompt = "A red cat sitting on a park bench" + + generator = torch.Generator(device=torch_device).manual_seed(0) + output = pipe( + prompt=prompt, + image=init_image, + mask_image=mask_image, + strength=0.75, + guidance_scale=7.5, + generator=generator, + output_type="np", + ) + image = output.images[0] + + assert image.shape == (512, 512, 3) + assert np.abs(expected_image - image).max() < 1e-3 + + def test_stable_diffusion_inpaint_legacy_intermediate_state(self): + number_of_steps = 0 + + def test_callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: + test_callback_fn.has_been_called = True + nonlocal number_of_steps + number_of_steps += 1 + if step == 0: + latents = latents.detach().cpu().numpy() + assert latents.shape == (1, 4, 64, 64) + latents_slice = latents[0, -3:, -3:, -1] + expected_slice = np.array( + [-0.5472, 1.1218, -0.5505, -0.9390, -1.0794, 0.4063, 0.5158, 0.6429, -1.5246] + ) + assert np.abs(latents_slice.flatten() - expected_slice).max() < 1e-3 + elif step == 37: + latents = latents.detach().cpu().numpy() + assert latents.shape == (1, 4, 64, 64) + latents_slice = latents[0, -3:, -3:, -1] + expected_slice = np.array([0.4781, 1.1572, 0.6258, 0.2291, 0.2554, -0.1443, 0.7085, -0.1598, -0.5659]) + assert np.abs(latents_slice.flatten() - expected_slice).max() < 1e-3 + + test_callback_fn.has_been_called = False + + init_image = load_image( + 
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + "/in_paint/overture-creations-5sI6fQgYIuo.png" + ) + mask_image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + "/in_paint/overture-creations-5sI6fQgYIuo_mask.png" + ) + + pipe = StableDiffusionInpaintPipeline.from_pretrained( + "CompVis/stable-diffusion-v1-4", revision="fp16", torch_dtype=torch.float16 + ) + pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + pipe.enable_attention_slicing() + + prompt = "A red cat sitting on a park bench" + + generator = torch.Generator(device=torch_device).manual_seed(0) + with torch.autocast(torch_device): + pipe( + prompt=prompt, + image=init_image, + mask_image=mask_image, + strength=0.75, + num_inference_steps=50, + guidance_scale=7.5, + generator=generator, + callback=test_callback_fn, + callback_steps=1, + ) + assert test_callback_fn.has_been_called + assert number_of_steps == 37 From ce924ecbdd2033270c998303281fb3fc6a8dbcdb Mon Sep 17 00:00:00 2001 From: Nathaniel Herman Date: Fri, 9 Dec 2022 22:22:11 +0000 Subject: [PATCH 11/16] fixes --- .../pipeline_stable_diffusion_repaint.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_repaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_repaint.py index 0e05639b1fc5..f95fa6e135fe 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_repaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_repaint.py @@ -20,6 +20,7 @@ import PIL from diffusers.utils import is_accelerate_available +from packaging import version from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer from ...configuration_utils import FrozenDict @@ -33,7 +34,7 @@ LMSDiscreteScheduler, PNDMScheduler, ) -from ...utils import deprecate, logging +from ...utils import deprecate, PIL_INTERPOLATION, logging from . import StableDiffusionPipelineOutput from .safety_checker import StableDiffusionSafetyChecker @@ -53,7 +54,7 @@ def preprocess_image(image): # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint_legacy.preprocess_mask -def preprocess_mask(mask): +def preprocess_mask(mask, scale_factor=8): mask = mask.convert("L") w, h = mask.size w, h = map(lambda x: x - x % 32, (w, h)) # resize to integer multiple of 32 @@ -505,13 +506,13 @@ def __call__( image = preprocess_image(image) if not isinstance(mask_image, torch.FloatTensor): - mask_image = preprocess_mask(mask_image) + mask_image = preprocess_mask(mask_image, self.vae_scale_factor) # 5. set timesteps self.scheduler.set_timesteps(num_inference_steps, jump_length, jump_n_sample, self.device) self.scheduler.eta = eta - timesteps = self.timesteps + timesteps = self.scheduler.timesteps latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) # 6. 
Prepare latent variables From 79966881950ff00a1f738b32946dfba5fd73de14 Mon Sep 17 00:00:00 2001 From: Nathaniel Herman Date: Mon, 12 Dec 2022 22:35:10 +0000 Subject: [PATCH 12/16] add integration test --- src/diffusers/__init__.py | 1 + .../test_stable_diffusion_repaint.py | 408 +----------------- 2 files changed, 8 insertions(+), 401 deletions(-) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 48d35012e6a4..a893c2262ec4 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -105,6 +105,7 @@ StableDiffusionImg2ImgPipeline, StableDiffusionInpaintPipeline, StableDiffusionInpaintPipelineLegacy, + StableDiffusionRepaintPipeline, StableDiffusionPipeline, StableDiffusionPipelineSafe, StableDiffusionUpscalePipeline, diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_repaint.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_repaint.py index 48798edfe6e0..bc4cc7f60acf 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_repaint.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_repaint.py @@ -24,6 +24,7 @@ AutoencoderKL, LMSDiscreteScheduler, PNDMScheduler, + RePaintScheduler, StableDiffusionInpaintPipeline, StableDiffusionInpaintPipelineLegacy, StableDiffusionRepaintPipeline, @@ -40,305 +41,6 @@ torch.backends.cuda.matmul.allow_tf32 = False -class StableDiffusionRepaintPipelineFastTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - @property - def dummy_image(self): - batch_size = 1 - num_channels = 3 - sizes = (32, 32) - - image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)).to(torch_device) - return image - - @property - def dummy_uncond_unet(self): - torch.manual_seed(0) - model = UNet2DModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=3, - out_channels=3, - down_block_types=("DownBlock2D", "AttnDownBlock2D"), - up_block_types=("AttnUpBlock2D", "UpBlock2D"), - ) - return model - - @property - def dummy_cond_unet(self): - torch.manual_seed(0) - model = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - return model - - @property - def dummy_cond_unet_inpaint(self): - torch.manual_seed(0) - model = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=9, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - return model - - @property - def dummy_vq_model(self): - torch.manual_seed(0) - model = VQModel( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=3, - ) - return model - - @property - def dummy_vae(self): - torch.manual_seed(0) - model = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - return model - - @property - def dummy_text_encoder(self): - torch.manual_seed(0) - config = CLIPTextConfig( - bos_token_id=0, - 
eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - return CLIPTextModel(config) - - @property - def dummy_extractor(self): - def extract(*args, **kwargs): - class Out: - def __init__(self): - self.pixel_values = torch.ones([0]) - - def to(self, device): - self.pixel_values.to(device) - return self - - return Out() - - return extract - - def test_stable_diffusion_inpaint_legacy(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - unet = self.dummy_cond_unet - scheduler = PNDMScheduler(skip_prk_steps=True) - vae = self.dummy_vae - bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0] - init_image = Image.fromarray(np.uint8(image)).convert("RGB") - mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((32, 32)) - - # make sure here that pndm scheduler skips prk - sd_pipe = StableDiffusionInpaintPipelineLegacy( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=self.dummy_extractor, - ) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - generator = torch.Generator(device=device).manual_seed(0) - output = sd_pipe( - [prompt], - generator=generator, - guidance_scale=6.0, - num_inference_steps=2, - output_type="np", - image=init_image, - mask_image=mask_image, - ) - - image = output.images - - generator = torch.Generator(device=device).manual_seed(0) - image_from_tuple = sd_pipe( - [prompt], - generator=generator, - guidance_scale=6.0, - num_inference_steps=2, - output_type="np", - image=init_image, - mask_image=mask_image, - return_dict=False, - )[0] - - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - - assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([0.4731, 0.5346, 0.4531, 0.6251, 0.5446, 0.4057, 0.5527, 0.5896, 0.5153]) - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_inpaint_legacy_negative_prompt(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - unet = self.dummy_cond_unet - scheduler = PNDMScheduler(skip_prk_steps=True) - vae = self.dummy_vae - bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0] - init_image = Image.fromarray(np.uint8(image)).convert("RGB") - mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((32, 32)) - - # make sure here that pndm scheduler skips prk - sd_pipe = StableDiffusionInpaintPipelineLegacy( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=self.dummy_extractor, - ) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - negative_prompt = "french fries" - generator = torch.Generator(device=device).manual_seed(0) - output = sd_pipe( - prompt, - negative_prompt=negative_prompt, - generator=generator, - guidance_scale=6.0, - num_inference_steps=2, - output_type="np", - 
image=init_image, - mask_image=mask_image, - ) - - image = output.images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([0.4765, 0.5339, 0.4541, 0.6240, 0.5439, 0.4055, 0.5503, 0.5891, 0.5150]) - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_inpaint_legacy_num_images_per_prompt(self): - device = "cpu" - unet = self.dummy_cond_unet - scheduler = PNDMScheduler(skip_prk_steps=True) - vae = self.dummy_vae - bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0] - init_image = Image.fromarray(np.uint8(image)).convert("RGB") - mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((32, 32)) - - # make sure here that pndm scheduler skips prk - sd_pipe = StableDiffusionInpaintPipelineLegacy( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=self.dummy_extractor, - ) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - - # test num_images_per_prompt=1 (default) - images = sd_pipe( - prompt, - num_inference_steps=2, - output_type="np", - image=init_image, - mask_image=mask_image, - ).images - - assert images.shape == (1, 32, 32, 3) - - # test num_images_per_prompt=1 (default) for batch of prompts - batch_size = 2 - images = sd_pipe( - [prompt] * batch_size, - num_inference_steps=2, - output_type="np", - image=init_image, - mask_image=mask_image, - ).images - - assert images.shape == (batch_size, 32, 32, 3) - - # test num_images_per_prompt for single prompt - num_images_per_prompt = 2 - images = sd_pipe( - prompt, - num_inference_steps=2, - output_type="np", - image=init_image, - mask_image=mask_image, - num_images_per_prompt=num_images_per_prompt, - ).images - - assert images.shape == (num_images_per_prompt, 32, 32, 3) - - # test num_images_per_prompt for batch of prompts - batch_size = 2 - images = sd_pipe( - [prompt] * batch_size, - num_inference_steps=2, - output_type="np", - image=init_image, - mask_image=mask_image, - num_images_per_prompt=num_images_per_prompt, - ).images - - assert images.shape == (batch_size * num_images_per_prompt, 32, 32, 3) - - @slow @require_torch_gpu class StableDiffusionRepaintPipelineIntegrationTests(unittest.TestCase): @@ -358,12 +60,13 @@ def test_stable_diffusion_repaint_pipeline(self): "/in_paint/overture-creations-5sI6fQgYIuo_mask.png" ) expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/in_paint" - "/red_cat_sitting_on_a_park_bench.npy" + #"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/in_paint" + "./red_cat_sitting_on_a_park_bench_repaint.npy" ) model_id = "CompVis/stable-diffusion-v1-4" pipe = StableDiffusionRepaintPipeline.from_pretrained(model_id, safety_checker=None) + pipe.scheduler = RePaintScheduler.from_config(pipe.scheduler.config) pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() @@ -375,6 +78,9 @@ def test_stable_diffusion_repaint_pipeline(self): prompt=prompt, image=init_image, mask_image=mask_image, + jump_length=3, + jump_n_sample=3, + num_inference_steps=50, guidance_scale=7.5, generator=generator, output_type="np", @@ -383,103 +89,3 @@ def test_stable_diffusion_repaint_pipeline(self): assert 
image.shape == (512, 512, 3) assert np.abs(expected_image - image).max() < 1e-3 - - def test_stable_diffusion_inpaint_legacy_pipeline_k_lms(self): - init_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/in_paint/overture-creations-5sI6fQgYIuo.png" - ) - mask_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/in_paint/overture-creations-5sI6fQgYIuo_mask.png" - ) - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/in_paint" - "/red_cat_sitting_on_a_park_bench_k_lms.npy" - ) - - model_id = "CompVis/stable-diffusion-v1-4" - lms = LMSDiscreteScheduler.from_pretrained(model_id, subfolder="scheduler") - pipe = StableDiffusionInpaintPipeline.from_pretrained( - model_id, - scheduler=lms, - safety_checker=None, - ) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - prompt = "A red cat sitting on a park bench" - - generator = torch.Generator(device=torch_device).manual_seed(0) - output = pipe( - prompt=prompt, - image=init_image, - mask_image=mask_image, - strength=0.75, - guidance_scale=7.5, - generator=generator, - output_type="np", - ) - image = output.images[0] - - assert image.shape == (512, 512, 3) - assert np.abs(expected_image - image).max() < 1e-3 - - def test_stable_diffusion_inpaint_legacy_intermediate_state(self): - number_of_steps = 0 - - def test_callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: - test_callback_fn.has_been_called = True - nonlocal number_of_steps - number_of_steps += 1 - if step == 0: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 64, 64) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array( - [-0.5472, 1.1218, -0.5505, -0.9390, -1.0794, 0.4063, 0.5158, 0.6429, -1.5246] - ) - assert np.abs(latents_slice.flatten() - expected_slice).max() < 1e-3 - elif step == 37: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 64, 64) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([0.4781, 1.1572, 0.6258, 0.2291, 0.2554, -0.1443, 0.7085, -0.1598, -0.5659]) - assert np.abs(latents_slice.flatten() - expected_slice).max() < 1e-3 - - test_callback_fn.has_been_called = False - - init_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/in_paint/overture-creations-5sI6fQgYIuo.png" - ) - mask_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/in_paint/overture-creations-5sI6fQgYIuo_mask.png" - ) - - pipe = StableDiffusionInpaintPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", revision="fp16", torch_dtype=torch.float16 - ) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - prompt = "A red cat sitting on a park bench" - - generator = torch.Generator(device=torch_device).manual_seed(0) - with torch.autocast(torch_device): - pipe( - prompt=prompt, - image=init_image, - mask_image=mask_image, - strength=0.75, - num_inference_steps=50, - guidance_scale=7.5, - generator=generator, - callback=test_callback_fn, - callback_steps=1, - ) - assert test_callback_fn.has_been_called - assert number_of_steps == 37 From 7f728b0926ea16bfaa4aaa5a1e9bb351740a33e9 Mon Sep 17 00:00:00 2001 From: Nathaniel Herman Date: Fri, 16 Dec 2022 21:31:11 +0000 Subject: [PATCH 13/16] update 
image url --- .../stable_diffusion/test_stable_diffusion_repaint.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_repaint.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_repaint.py index bc4cc7f60acf..3e8357a8d3e7 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_repaint.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_repaint.py @@ -60,8 +60,8 @@ def test_stable_diffusion_repaint_pipeline(self): "/in_paint/overture-creations-5sI6fQgYIuo_mask.png" ) expected_image = load_numpy( - #"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/in_paint" - "./red_cat_sitting_on_a_park_bench_repaint.npy" + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/repaint" + "/red_cat_sitting_on_a_park_bench_repaint.npy" ) model_id = "CompVis/stable-diffusion-v1-4" From 33e37eb258fb11453e1fabb47381f603e402752d Mon Sep 17 00:00:00 2001 From: Nathaniel Herman Date: Fri, 16 Dec 2022 21:36:33 +0000 Subject: [PATCH 14/16] run make style and make quality --- src/diffusers/__init__.py | 2 +- .../pipelines/stable_diffusion/__init__.py | 2 +- .../pipeline_stable_diffusion_repaint.py | 6 ++++-- .../test_stable_diffusion_repaint.py | 18 ++---------------- 4 files changed, 8 insertions(+), 20 deletions(-) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index a893c2262ec4..061f7a38709e 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -105,9 +105,9 @@ StableDiffusionImg2ImgPipeline, StableDiffusionInpaintPipeline, StableDiffusionInpaintPipelineLegacy, - StableDiffusionRepaintPipeline, StableDiffusionPipeline, StableDiffusionPipelineSafe, + StableDiffusionRepaintPipeline, StableDiffusionUpscalePipeline, VersatileDiffusionDualGuidedPipeline, VersatileDiffusionImageVariationPipeline, diff --git a/src/diffusers/pipelines/stable_diffusion/__init__.py b/src/diffusers/pipelines/stable_diffusion/__init__.py index ba605ff84378..f4ca5332642a 100644 --- a/src/diffusers/pipelines/stable_diffusion/__init__.py +++ b/src/diffusers/pipelines/stable_diffusion/__init__.py @@ -41,8 +41,8 @@ class StableDiffusionPipelineOutput(BaseOutput): from .pipeline_stable_diffusion import StableDiffusionPipeline from .pipeline_stable_diffusion_img2img import StableDiffusionImg2ImgPipeline from .pipeline_stable_diffusion_inpaint import StableDiffusionInpaintPipeline - from .pipeline_stable_diffusion_repaint import StableDiffusionRepaintPipeline from .pipeline_stable_diffusion_inpaint_legacy import StableDiffusionInpaintPipelineLegacy + from .pipeline_stable_diffusion_repaint import StableDiffusionRepaintPipeline from .pipeline_stable_diffusion_upscale import StableDiffusionUpscalePipeline from .safety_checker import StableDiffusionSafetyChecker diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_repaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_repaint.py index f95fa6e135fe..054674225d90 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_repaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_repaint.py @@ -33,8 +33,9 @@ EulerDiscreteScheduler, LMSDiscreteScheduler, PNDMScheduler, + RePaintScheduler, ) -from ...utils import deprecate, PIL_INTERPOLATION, logging +from ...utils import PIL_INTERPOLATION, deprecate, logging from . 
import StableDiffusionPipelineOutput from .safety_checker import StableDiffusionSafetyChecker @@ -110,6 +111,7 @@ def __init__( EulerDiscreteScheduler, EulerAncestralDiscreteScheduler, DPMSolverMultistepScheduler, + RePaintScheduler, ], safety_checker: StableDiffusionSafetyChecker, feature_extractor: CLIPFeatureExtractor, @@ -194,7 +196,7 @@ def __init__( self.register_to_config(requires_safety_checker=requires_safety_checker) # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_sequential_cpu_offload - def enable_sequential_cpu_offload(self): + def enable_sequential_cpu_offload(self, gpu_id=0): r""" Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet, text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_repaint.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_repaint.py index 3e8357a8d3e7..5259e7848a0e 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_repaint.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_repaint.py @@ -14,28 +14,14 @@ # limitations under the License. import gc -import random import unittest import numpy as np import torch -from diffusers import ( - AutoencoderKL, - LMSDiscreteScheduler, - PNDMScheduler, - RePaintScheduler, - StableDiffusionInpaintPipeline, - StableDiffusionInpaintPipelineLegacy, - StableDiffusionRepaintPipeline, - UNet2DConditionModel, - UNet2DModel, - VQModel, -) -from diffusers.utils import floats_tensor, load_image, slow, torch_device +from diffusers import RePaintScheduler, StableDiffusionRepaintPipeline +from diffusers.utils import load_image, slow, torch_device from diffusers.utils.testing_utils import load_numpy, require_torch_gpu -from PIL import Image -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer torch.backends.cuda.matmul.allow_tf32 = False From 3f0ffc618a5914fe1d08839ff1c4f162eb821d94 Mon Sep 17 00:00:00 2001 From: Nathaniel Herman Date: Wed, 21 Dec 2022 20:42:25 +0000 Subject: [PATCH 15/16] fix unit test + style warning --- .../stable_diffusion/pipeline_stable_diffusion_repaint.py | 5 ++--- src/diffusers/schedulers/scheduling_repaint.py | 5 +++++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_repaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_repaint.py index 054674225d90..c7dd1f24739b 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_repaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_repaint.py @@ -97,7 +97,6 @@ class StableDiffusionRepaintPipeline(DiffusionPipeline): """ _optional_components = ["safety_checker", "feature_extractor"] - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.__init__ def __init__( self, vae: AutoencoderKL, @@ -265,9 +264,9 @@ def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_fr return_tensors="pt", ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer(prompt, padding="max_length", return_tensors="pt").input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids - if not torch.equal(text_input_ids, untruncated_ids): + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, 
untruncated_ids): removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" diff --git a/src/diffusers/schedulers/scheduling_repaint.py b/src/diffusers/schedulers/scheduling_repaint.py index 52b37f8b87ac..70931b4a456c 100644 --- a/src/diffusers/schedulers/scheduling_repaint.py +++ b/src/diffusers/schedulers/scheduling_repaint.py @@ -99,6 +99,10 @@ class RePaintScheduler(SchedulerMixin, ConfigMixin): `fixed_small_log`, `fixed_large`, `fixed_large_log`, `learned` or `learned_range`. clip_sample (`bool`, default `True`): option to clip predicted sample between -1 and 1 for numerical stability. + steps_offset (`int`, default `0`): + an offset added to the inference steps. You can use a combination of `offset=1` and + `set_alpha_to_one=False`, to make the last step use step 0 for the previous alpha product, as done in + stable diffusion. """ @@ -114,6 +118,7 @@ def __init__( eta: float = 0.0, trained_betas: Optional[np.ndarray] = None, clip_sample: bool = True, + steps_offset: int = 0, ): if trained_betas is not None: self.betas = torch.from_numpy(trained_betas) From 3984383abfc42e10417f07dad0581994977943ba Mon Sep 17 00:00:00 2001 From: Nathaniel Herman Date: Fri, 23 Dec 2022 05:13:37 +0000 Subject: [PATCH 16/16] make fix-copies --- .../utils/dummy_torch_and_transformers_objects.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py index 160b83a7e6c0..4ed15511ed45 100644 --- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py +++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py @@ -184,6 +184,21 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "transformers"]) +class StableDiffusionRepaintPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + class StableDiffusionUpscalePipeline(metaclass=DummyObject): _backends = ["torch", "transformers"]
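A note on the PATCH 11 change to preprocess_mask: threading self.vae_scale_factor through as scale_factor, instead of relying on the hard-coded division by 8, puts the mask at latent resolution for any VAE. A quick sanity check, assuming the 512x512 test images and the standard Stable Diffusion scale factor of 8 (both taken from the tests above, not pinned by the patches themselves):

# Hypothetical check: mask resolution after preprocess_mask's downscale.
# 512 and 8 are assumptions carried over from the integration tests and the
# standard SD VAE; they are not values this patch series fixes.
height = width = 512
vae_scale_factor = 8
print(width // vae_scale_factor, height // vae_scale_factor)  # -> 64 64

This matches the (1, 4, 64, 64) latent shape asserted in the original intermediate-state callback test before it was removed in PATCH 12.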
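PATCH 11 also corrects `timesteps = self.timesteps` to `timesteps = self.scheduler.timesteps`, which matters because the RePaint schedule is not a plain countdown. A minimal sketch for inspecting it, assuming only the set_timesteps call the pipeline itself makes in step 5 of __call__ (num_inference_steps, jump_length, jump_n_sample, device):

# Inspect the RePaint timestep schedule produced by the same arguments the
# pipeline passes to scheduler.set_timesteps.
from diffusers import RePaintScheduler

scheduler = RePaintScheduler()
scheduler.set_timesteps(num_inference_steps=50, jump_length=3, jump_n_sample=3)

# The schedule mostly counts down, but periodically jumps forward again and
# re-denoises (RePaint's resampling), so its total length exceeds
# num_inference_steps.
print(len(scheduler.timesteps))
print(scheduler.timesteps[:12].tolist())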
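With PATCH 16 the dummy objects are in place and the series is complete. End-to-end, usage mirrors the integration test; the sketch below is assembled from that test, so the model ID, image URLs, seed, and RePaint parameters are copied values rather than tuned recommendations, and the CUDA device is an assumption carried over from the @require_torch_gpu tests.

import torch
from diffusers import RePaintScheduler, StableDiffusionRepaintPipeline
from diffusers.utils import load_image

# Test fixtures: the source image plus the mask marking the region to repaint.
init_image = load_image(
    "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
    "/in_paint/overture-creations-5sI6fQgYIuo.png"
)
mask_image = load_image(
    "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
    "/in_paint/overture-creations-5sI6fQgYIuo_mask.png"
)

pipe = StableDiffusionRepaintPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4", safety_checker=None
)
# As in the integration test, build a RePaintScheduler from the checkpoint's
# existing scheduler config before running the pipeline.
pipe.scheduler = RePaintScheduler.from_config(pipe.scheduler.config)
pipe = pipe.to("cuda")  # assumes a CUDA device, as in the GPU-only tests

generator = torch.Generator(device="cuda").manual_seed(0)
image = pipe(
    prompt="A red cat sitting on a park bench",
    image=init_image,
    mask_image=mask_image,
    num_inference_steps=50,
    jump_length=3,    # j in the RePaint paper: length of each resampling jump
    jump_n_sample=3,  # r in the paper: resamplings applied at each jump point
    guidance_scale=7.5,
    generator=generator,
).images[0]
image.save("red_cat_repaint.png")

The seed-0 generator and the parameter values reproduce the test configuration; with output_type left at its default, the pipeline returns PIL images rather than the NumPy arrays the test requests.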