diff --git a/.github/workflows/package-release.yml b/.github/workflows/package-release.yml index 3182ef9e..793fbcd6 100644 --- a/.github/workflows/package-release.yml +++ b/.github/workflows/package-release.yml @@ -39,6 +39,8 @@ jobs: uses: actions/setup-python@v4 with: python-version: '3.10' + cache: 'pip' + cache-dependency-path: '**/win-linux-cuda.txt' - name: Install dependencies into target shell: bash run: 'python -m pip install -r requirements/win-linux-cuda.txt --no-cache-dir --target .python_dependencies' @@ -65,6 +67,8 @@ jobs: uses: actions/setup-python@v4 with: python-version: '3.10' + cache: 'pip' + cache-dependency-path: '**/win-dml.txt' - name: Install dependencies into target shell: bash run: 'python -m pip install -r requirements/win-dml.txt --no-cache-dir --target .python_dependencies' @@ -79,58 +83,4 @@ jobs: uses: actions/upload-artifact@v3 with: name: dream_textures-windows-directml - path: dream_textures-windows-directml.zip - windows-dreamstudio: - runs-on: windows-latest - steps: - - name: Checkout repository - uses: actions/checkout@v3 - with: - path: dream_textures - - name: Setup Python - uses: actions/setup-python@v4 - with: - python-version: '3.10' - - name: Install dependencies into target - shell: bash - run: 'python -m pip install -r requirements/dreamstudio.txt --no-cache-dir --target .python_dependencies' - working-directory: dream_textures - - name: Archive Release - uses: thedoctor0/zip-release@main - with: - type: zip - filename: dream_textures-windows-dreamstudio.zip - exclusions: '*.git*' - - name: Archive and upload artifact - uses: actions/upload-artifact@v3 - with: - name: dream_textures-windows-dreamstudio - path: dream_textures-windows-dreamstudio.zip - # No universal wheels for grpcio means we need separate macOS installations for Intel/ARM. - # GitHub Actions does not have Apple Silicon runners, so that build still needs to be created manually. - macos-intel-dreamstudio: - runs-on: macos-latest - steps: - - name: Checkout repository - uses: actions/checkout@v3 - with: - path: dream_textures - - name: Setup Python - uses: actions/setup-python@v4 - with: - python-version: '3.10' - - name: Install dependencies into target - shell: bash - run: 'python -m pip install -r requirements/dreamstudio.txt --no-cache-dir --target .python_dependencies' - working-directory: dream_textures - - name: Archive Release - uses: thedoctor0/zip-release@main - with: - type: zip - filename: dream_textures-macos-intel-dreamstudio.zip - exclusions: '*.git*' - - name: Archive and upload artifact - uses: actions/upload-artifact@v3 - with: - name: dream_textures-macos-intel-dreamstudio - path: dream_textures-macos-intel-dreamstudio.zip + path: dream_textures-windows-directml.zip \ No newline at end of file diff --git a/__init__.py b/__init__.py index b3a2ff15..5d24d69a 100644 --- a/__init__.py +++ b/__init__.py @@ -49,6 +49,8 @@ def clear_modules(): from . import engine + from .diffusers_backend import DiffusersBackend + requirements_path_items = ( ('requirements/win-linux-cuda.txt', 'Linux/Windows (CUDA)', 'Linux or Windows with NVIDIA GPU'), ('requirements/mac-mps-cpu.txt', 'Apple Silicon', 'Apple M1/M2'), @@ -127,6 +129,9 @@ def project_use_controlnet(self, context): register_render_pass() register_default_presets() + + # Register the default backend. 
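+    # Backend subclasses attach a PointerProperty to DreamPrompt in their `register()` classmethod, so registering the class also exposes its settings on the prompt.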
+ bpy.utils.register_class(DiffusersBackend) def unregister(): for cls in PREFERENCE_CLASSES: @@ -143,4 +148,7 @@ def unregister(): unregister_render_pass() + # Unregister the default backend + bpy.utils.unregister_class(DiffusersBackend) + kill_generator() \ No newline at end of file diff --git a/api/__init__.py b/api/__init__.py new file mode 100644 index 00000000..36654ceb --- /dev/null +++ b/api/__init__.py @@ -0,0 +1,2 @@ +from .models import * +from .backend import * \ No newline at end of file diff --git a/api/backend/__init__.py b/api/backend/__init__.py new file mode 100644 index 00000000..3583fd62 --- /dev/null +++ b/api/backend/__init__.py @@ -0,0 +1 @@ +from .backend import * \ No newline at end of file diff --git a/api/backend/backend.py b/api/backend/backend.py new file mode 100644 index 00000000..93ef3724 --- /dev/null +++ b/api/backend/backend.py @@ -0,0 +1,143 @@ +try: + import bpy + from typing import Callable, List, Tuple + from ..models.generation_arguments import GenerationArguments + from ..models.generation_result import GenerationResult + from ..models.model import Model + + StepCallback = Callable[[List[GenerationResult]], bool] + Callback = Callable[[List[GenerationResult] | Exception], None] + + class Backend(bpy.types.PropertyGroup): + """A backend for Dream Textures. + + Provide the following methods to create a valid backend. + + ```python + def list_models(self) -> List[Model] + def generate( + self, + arguments: GenerationArguments, + + step_callback: StepCallback, + callback: Callback + ) + ``` + """ + + @classmethod + def register(cls): + from ...property_groups.dream_prompt import DreamPrompt + setattr(DreamPrompt, cls._attribute(), bpy.props.PointerProperty(type=cls)) + + @classmethod + def unregister(cls): + from ...property_groups.dream_prompt import DreamPrompt + delattr(DreamPrompt, cls._attribute()) + + @classmethod + def _id(cls) -> str: + return f"{cls.__module__}.{cls.__name__}" + + @classmethod + def _attribute(cls) -> str: + return cls._id().replace('.', '_') + + @classmethod + def _lookup(cls, id): + return next( + (backend for backend in cls._list_backends() if backend._id() == id), + next(iter(cls._list_backends()), None) + ) + + @classmethod + def _list_backends(cls): + return cls.__subclasses__() + + def list_models(self, context) -> List[Model]: + """Provide a list of available models. + + The `id` of the model will be provided. + """ + ... + + def list_controlnet_models(self, context) -> List[Model]: + """Provide a list of available ControlNet models. + + The `id` of the model will be provided. + """ + return [] + + def list_schedulers(self, context) -> List[str]: + """Provide a list of available schedulers.""" + ... + + def draw_prompt(self, layout, context): + """Draw additional UI in the 'Prompt' panel""" + ... + + def draw_advanced(self, layout, context): + """Draw additional UI in the 'Advanced' panel""" + ... + + def draw_speed_optimizations(self, layout, context): + """Draw additional UI in the 'Speed Optimizations' panel""" + ... + + def draw_memory_optimizations(self, layout, context): + """Draw additional UI in the 'Memory Optimizations' panel""" + ... + + def draw_extra(self, layout, context): + """Draw additional UI in the panel""" + ... + + def get_batch_size(self, context) -> int: + """Return the selected batch size for the backend (if applicable). + + A default implementation is provided that returns `1`. 
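+
+        For example, a backend can expose the batch size as a property and return it here, roughly as the bundled Diffusers backend does:
+
+        ```python
+        batch_size: bpy.props.IntProperty(name="Batch Size", default=1, min=1)
+
+        def get_batch_size(self, context) -> int:
+            return self.batch_size
+        ```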
+ """ + return 1 + + def generate( + self, + arguments: GenerationArguments, + step_callback: StepCallback, + callback: Callback + ): + """ + A request to generate an image. + + If the `step_callback` returns `False`, the generation should be cancelled. + After cancelling, `callback` should be called with an `InterruptedError`. + """ + ... + + def validate( + self, + arguments: GenerationArguments + ): + """Validates the given arguments in the UI without generating. + + This validation should occur as quickly as possible. + + To report problems with the inputs, raise a `ValueError`. + Use the `FixItError` to provide a solution to the problem as well. + + ```python + if arguments.steps % 2 == 0: + throw FixItError( + "The number of steps is even", + solution=FixItError.UpdateGenerationArgumentsSolution( + title="Add 1 more step", + arguments=dataclasses.replace( + arguments, + steps=arguments.steps + 1 + ) + ) + ) + ``` + """ + ... +except: + pass \ No newline at end of file diff --git a/api/models/__init__.py b/api/models/__init__.py new file mode 100644 index 00000000..8219c44b --- /dev/null +++ b/api/models/__init__.py @@ -0,0 +1,7 @@ +from .generation_result import * +from .model import * +from .prompt import * +from .seamless_axes import * +from .step_preview_mode import * +from .task import * +from .fix_it_error import * \ No newline at end of file diff --git a/api/models/control_net.py b/api/models/control_net.py new file mode 100644 index 00000000..5fc1900b --- /dev/null +++ b/api/models/control_net.py @@ -0,0 +1,14 @@ +from dataclasses import dataclass +from typing import Tuple, List +from numpy.typing import NDArray + +@dataclass +class ControlNet: + model: str + """The selected ControlNet model used for generation""" + + image: NDArray + """The control image""" + + strength: float + """The strength of the ControlNet's influence""" \ No newline at end of file diff --git a/api/models/fix_it_error.py b/api/models/fix_it_error.py new file mode 100644 index 00000000..3292c3f4 --- /dev/null +++ b/api/models/fix_it_error.py @@ -0,0 +1,41 @@ +from typing import Callable, Any +from .generation_arguments import GenerationArguments +from dataclasses import dataclass + +class FixItError(Exception): + """An exception with a solution. + + Call the `draw` method to render the UI elements responsible for resolving this error. + """ + def __init__(self, message, solution: 'Solution'): + super().__init__(message) + + self._solution = solution + + def _draw(self, dream_prompt, context, layout): + self._solution._draw(dream_prompt, context, layout) + + @dataclass + class Solution: + def _draw(self, dream_prompt, context, layout): + ... 
+ + @dataclass + class ChangeProperty(Solution): + """Prompts the user to change the given `property` of the `GenerationArguments`.""" + property: str + + def _draw(self, dream_prompt, context, layout): + layout.prop(dream_prompt, self.property) + + @dataclass + class RunOperator(Solution): + """Runs the given operator""" + title: str + operator: str + modify_operator: Callable[[Any], None] + + def _draw(self, dream_prompt, context, layout): + self.modify_operator( + layout.operator(self.operator, text=self.title) + ) \ No newline at end of file diff --git a/api/models/generation_arguments.py b/api/models/generation_arguments.py new file mode 100644 index 00000000..343263b0 --- /dev/null +++ b/api/models/generation_arguments.py @@ -0,0 +1,107 @@ +from dataclasses import dataclass +from typing import Tuple, List +from ..models.task import Task +from ..models.model import Model +from ..models.prompt import Prompt +from ..models.seamless_axes import SeamlessAxes +from ..models.step_preview_mode import StepPreviewMode +from ..models.control_net import ControlNet + +@dataclass +class GenerationArguments: + task: Task + """The type of generation to perform. + + Use a match statement to perform different actions based on the selected task. + + ```python + match task: + case PromptToImage(): + ... + case ImageToImage(image=image, strength=strength, fit=fit): + ... + case Inpaint(image=image, fit=fit, strength=strength, mask_source=mask_source, mask_prompt=mask_prompt, confidence=confidence): + ... + case DepthToImage(depth=depth, image=image, strength=strength): + ... + case Outpaint(image=image, origin=origin): + ... + case _: + raise NotImplementedError() + ``` + """ + + model: Model + """The selected model. + + This is one of the options provided by `Backend.list_models`. + """ + + prompt: Prompt + """The positive and (optionally) negative prompt. + + If `prompt.negative` is `None`, then the 'Negative Prompt' panel was disabled by the user. + """ + + size: Tuple[int, int] | None + """The target size of the image, or `None` to use the native size of the model.""" + + seed: int + """The random or user-provided seed to use.""" + + steps: int + """The number of inference steps to perform.""" + + guidance_scale: float + """The selected classifier-free guidance scale.""" + + scheduler: str + """The selected scheduler. + + This is one of the options provided by `Backend.list_schedulers`. + """ + + seamless_axes: SeamlessAxes + """Which axes to tile seamlessly.""" + + step_preview_mode: StepPreviewMode + """The style of preview to display at each step.""" + + iterations: int + """The number of images to generate. + + The value sent to `callback` should contain the same number of `GenerationResult` instances in a list. 
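+
+        For example, the final call for a finished run might look like this sketch, where `images`, `seeds`, and `arguments` stand in for the backend's own state:
+
+        ```python
+        callback([
+            GenerationResult(progress=arguments.steps, total=arguments.steps, image=image, seed=seed)
+            for image, seed in zip(images, seeds)
+        ])
+        ```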
+ """ + + control_nets: List[ControlNet] + + @staticmethod + def _map_property_name(name: str) -> str | List[str] | None: + """Converts a property name from `GenerationArguments` to the corresponding property of a `DreamPrompt`.""" + match name: + case "model": + return "model" + case "prompt": + return ["prompt", "use_negative_prompt", "negative_prompt"] + case "prompt.positive": + return "prompt" + case "prompt.negative": + return ["use_negative_prompt", "negative_prompt"] + case "size": + return ["use_size", "width", "height"] + case "seed": + return "seed" + case "steps": + return "steps" + case "guidance_scale": + return "cfg_scale" + case "scheduler": + return "scheduler" + case "seamless_axes": + return "seamless_axes" + case "step_preview_mode": + return "step_preview_mode" + case "iterations": + return "iterations" + case _: + return None \ No newline at end of file diff --git a/api/models/generation_result.py b/api/models/generation_result.py new file mode 100644 index 00000000..2ce5711e --- /dev/null +++ b/api/models/generation_result.py @@ -0,0 +1,71 @@ +from dataclasses import dataclass +from numpy.typing import NDArray +import numpy as np +import math + +@dataclass +class GenerationResult: + """The output of a `Backend`. + + Create a result with an `image` and a `seed`. + + ```python + result = GenerationResult( + progress=3, + total=5, + image=np.zeros((512, 512, 3)), + seed=42 + ) + ``` + + Alternatively, create a result with just a `title` and progress values. + + ```python + result = GenerationResult( + progress=3, + total=5, + title="Loading model" + ) + ``` + """ + + progress: int + """The amount out of `total` that has been completed""" + + total: int + """The number of steps to complete""" + + seed: int + """The seed used to generate the image.""" + + title: str | None = None + """The name of the currently executing task""" + + image: NDArray | None = None + """The generated image as a Numpy array. + + The shape should be `(height, width, channels)`, where `channels` is 3 or 4. 
+ """ + + @staticmethod + def tile_images(results: list['GenerationResult']) -> NDArray: + images = [result.image for result in results] + if len(images) == 0: + return None + elif len(images) == 1: + return images[0] + width = images[0].shape[1] + height = images[0].shape[0] + tiles_x = math.ceil(math.sqrt(len(images))) + tiles_y = math.ceil(len(images) / tiles_x) + tiles = np.zeros((height * tiles_y, width * tiles_x, 4), dtype=np.float32) + bottom_offset = (tiles_x*tiles_y-len(images)) * width // 2 + for i, image in enumerate(images): + x = i % tiles_x + y = tiles_y - 1 - int((i - x) / tiles_x) + x *= width + y *= height + if y == 0: + x += bottom_offset + tiles[y: y + height, x: x + width] = image + return tiles diff --git a/api/models/model.py b/api/models/model.py new file mode 100644 index 00000000..7c5a69e9 --- /dev/null +++ b/api/models/model.py @@ -0,0 +1,7 @@ +from dataclasses import dataclass + +@dataclass +class Model: + name: str + description: str + id: str \ No newline at end of file diff --git a/api/models/prompt.py b/api/models/prompt.py new file mode 100644 index 00000000..3affcf00 --- /dev/null +++ b/api/models/prompt.py @@ -0,0 +1,7 @@ +from dataclasses import dataclass +from typing import List + +@dataclass +class Prompt: + positive: str | List[str] + negative: str | List[str] | None \ No newline at end of file diff --git a/api/models/seamless_axes.py b/api/models/seamless_axes.py new file mode 100644 index 00000000..739be2f4 --- /dev/null +++ b/api/models/seamless_axes.py @@ -0,0 +1,75 @@ +from enum import Enum + +class SeamlessAxes(Enum): + """Unified handling of seamless axes. + Can be converted from str (id or text) or bool tuple/list (x, y). + Each enum is equal to their respective convertible values. + Special cases: + AUTO: None + OFF: False, empty str + BOTH: True + """ + + AUTO = 'auto', 'Auto-detect', None, None + OFF = 'off', 'Off', False, False + HORIZONTAL = 'x', 'X', True, False + VERTICAL = 'y', 'Y', False, True + BOTH = 'xy', 'Both', True, True + + def __init__(self, id, text, x, y): + self.id = id + self.text = text + self.x = x + self.y = y + + def __eq__(self, other): + if isinstance(other, type(self)): + return self is other + if isinstance(other, str): + return self.id == other or self.text == other or (other == '' and self is self.OFF) + if isinstance(other, (tuple, list)) and len(other) == 2: + return self.x == other[0] and self.y == other[1] + if other is True and self is self.BOTH: + return True + if other is False and self is self.OFF: + return True + if other is None and self is self.AUTO: + return True + return False + + def __and__(self, other): + return SeamlessAxes((self.x and other.x, self.y and other.y)) + + def __or__(self, other): + return SeamlessAxes((self.x or other.x, self.y or other.y)) + + def __xor__(self, other): + return SeamlessAxes((self.x != other.x, self.y != other.y)) + + def __invert__(self): + return SeamlessAxes((not self.x, not self.y)) + + @classmethod + def _missing_(cls, value): + if isinstance(value, str): + if value == '': + return cls.OFF + for e in cls: + if e.id == value or e.text == value: + return e + raise ValueError(f'no {cls.__name__} with id {repr(id)}') + elif isinstance(value, (tuple, list)) and len(value) == 2: + for e in cls: + if e.x == value[0] and e.y == value[1]: + return e + raise ValueError(f'no {cls.__name__} with x {value[0]} and y {value[1]}') + elif value is True: + return cls.BOTH + elif value is False: + return cls.OFF + elif value is None: + return cls.AUTO + raise TypeError(f'expected 
str, bool, tuple[bool, bool], or None, got {repr(value)}') + + def bpy_enum(self, *args): + return self.id, self.text, *args \ No newline at end of file diff --git a/api/models/step_preview_mode.py b/api/models/step_preview_mode.py new file mode 100644 index 00000000..2ae2441d --- /dev/null +++ b/api/models/step_preview_mode.py @@ -0,0 +1,8 @@ +import enum + +class StepPreviewMode(enum.Enum): + NONE = "None" + FAST = "Fast" + FAST_BATCH = "Fast (Batch Tiled)" + ACCURATE = "Accurate" + ACCURATE_BATCH = "Accurate (Batch Tiled)" \ No newline at end of file diff --git a/api/models/task.py b/api/models/task.py new file mode 100644 index 00000000..a009f169 --- /dev/null +++ b/api/models/task.py @@ -0,0 +1,97 @@ +from dataclasses import dataclass +from typing import Tuple +from numpy.typing import NDArray +from enum import IntEnum + +class Task: + """A specific task type. + + Access the properties of the task using dot notation. + + ```python + # e.g. an ImageToImage task + task.image + task.strength + task.fit + ``` + + Switch over the task to perform the correct actions. + + ```python + match task: + case PromptToImage(): + ... + case ImageToImage(): + ... + case Inpaint(): + ... + case DepthToImage(): + ... + case Outpaint(): + ... + ``` + """ + + @classmethod + def name(cls) -> str: + """A human readable name for this task.""" + return "unknown" + +@dataclass +class PromptToImage(Task): + @classmethod + def name(cls): + return "prompt to image" + +@dataclass +class ImageToImage(Task): + image: NDArray + strength: float + fit: bool + + @classmethod + def name(cls): + return "image to image" + +@dataclass +class Inpaint(ImageToImage): + class MaskSource(IntEnum): + ALPHA = 0 + PROMPT = 1 + + mask_source: MaskSource + mask_prompt: str + confidence: float + + @classmethod + def name(cls): + return "inpainting" + +@dataclass +class DepthToImage(Task): + depth: NDArray | None + image: NDArray | None + strength: float + + @classmethod + def name(cls): + return "depth to image" + +@dataclass +class Outpaint(Task): + image: NDArray + origin: Tuple[int, int] + + @classmethod + def name(cls): + return "outpainting" + +@dataclass +class Upscale(Task): + image: NDArray + tile_size: int + blend: int + + @classmethod + def name(cls): + return "upscaling" \ No newline at end of file diff --git a/classes.py b/classes.py index 5f6adefd..a5cad7f5 100644 --- a/classes.py +++ b/classes.py @@ -10,7 +10,9 @@ from .property_groups.dream_prompt import DreamPrompt from .property_groups.seamless_result import SeamlessResult from .ui.panels import dream_texture, history, upscaling, render_properties -from .preferences import OpenURL, StableDiffusionPreferences, ImportWeights, Model, ModelSearch, InstallModel, PREFERENCES_UL_ModelList +from .preferences import OpenURL, StableDiffusionPreferences,\ + ImportWeights, Model, ModelSearch, InstallModel, PREFERENCES_UL_ModelList,\ + CheckpointGroup, LinkCheckpoint, UnlinkCheckpoint, PREFERENCES_UL_CheckpointList from .ui.presets import DREAM_PT_AdvancedPresets, DREAM_MT_AdvancedPresets, AddAdvancedPreset, RestoreDefaultPresets @@ -68,5 +70,9 @@ OpenURL, ImportWeights, RestoreDefaultPresets, + CheckpointGroup, + LinkCheckpoint, + UnlinkCheckpoint, + PREFERENCES_UL_CheckpointList, StableDiffusionPreferences, ) \ No newline at end of file diff --git a/community_backends/test.py b/community_backends/test.py new file mode 100644 index 00000000..663ada05 --- /dev/null +++ b/community_backends/test.py @@ -0,0 +1,33 @@ +bl_info = { + "name": "Test Backend", + "blender": (3, 1, 0), + "category": "Paint", 
+} + +import bpy +from typing import List, Tuple +from dream_textures.api import * + +class TestBackend(Backend): + name = "Test" + description = "A short description of this backend" + + custom_optimization: bpy.props.BoolProperty(name="My Custom Optimization") + + def list_models(self, context) -> List[Model]: + return [] + + def list_schedulers(self, context) -> List[str]: + return [] + + def generate(self, task: Task, model: Model, prompt: Prompt, size: Tuple[int, int] | None, seed: int, steps: int, guidance_scale: float, scheduler: str, seamless_axes: SeamlessAxes, step_preview_mode: StepPreviewMode, iterations: int, step_callback: StepCallback, callback: Callback): + raise NotImplementedError() + + def draw_speed_optimizations(self, layout, context): + layout.prop(self, "custom_optimization") + +def register(): + bpy.utils.register_class(TestBackend) + +def unregister(): + bpy.utils.unregister_class(TestBackend) diff --git a/diffusers_backend.py b/diffusers_backend.py new file mode 100644 index 00000000..85e4ffd1 --- /dev/null +++ b/diffusers_backend.py @@ -0,0 +1,330 @@ +import bpy +from bpy.props import FloatProperty, IntProperty, EnumProperty, BoolProperty +from typing import List + +from .api import Backend, StepCallback, Callback +from .api.models import Model, GenerationArguments, GenerationResult +from .api.models.task import PromptToImage, ImageToImage, Inpaint, DepthToImage, Outpaint, Upscale +from .api.models.fix_it_error import FixItError + +from .generator_process import Generator +from .generator_process.actions.prompt_to_image import ImageGenerationResult +from .generator_process.future import Future +from .generator_process.models import CPUOffload, ModelType, Optimizations, Scheduler + +from .preferences import checkpoint_lookup, StableDiffusionPreferences, _template_model_download_progress, InstallModel, model_lookup + +from functools import reduce + +def _convert_models(models): + return [ + None if model is None else (model.id, model.name, model.description) + for model in models + ] + +class DiffusersBackend(Backend): + name = "HuggingFace Diffusers" + description = "Local image generation inside of Blender" + + attention_slicing: BoolProperty(name="Attention Slicing", default=True, description="Computes attention in several steps. Saves some memory in exchange for a small speed decrease") + attention_slice_size_src: EnumProperty( + name="Attention Slice Size", + items=( + ("auto", "Automatic", "Computes attention in two steps", 1), + ("manual", "Manual", "Computes attention in `attention_head_dim // size` steps. 
A smaller `size` saves more memory.\n" + "`attention_head_dim` must be a multiple of `size`, otherwise the image won't generate properly.\n" + "`attention_head_dim` can be found within the model snapshot's unet/config.json file", 2) + ), + default=1 + ) + attention_slice_size: IntProperty(name="Attention Slice Size", default=1, min=1) + cudnn_benchmark: BoolProperty(name="cuDNN Benchmark", description="Allows cuDNN to benchmark multiple convolution algorithms and select the fastest", default=False) + tf32: BoolProperty(name="TF32", description="Utilizes tensor cores on Ampere (RTX 30xx) or newer GPUs for matrix multiplications.\nHas no effect if half precision is enabled", default=False) + half_precision: BoolProperty(name="Half Precision", description="Reduces memory usage and increases speed in exchange for a slight loss in image quality.\nHas no effect if CPU only is enabled or using a GTX 16xx GPU", default=True) + cpu_offload: EnumProperty( + name="CPU Offload", + items=( + ("off", "Off", "", 0), + ("model", "Model", "Some memory savings with minimal speed penalty", 1), + ("submodule", "Submodule", "Better memory savings with large speed penalty", 2) + ), + default=0, + description="Dynamically moves models in and out of device memory for reduced memory usage with reduced speed" + ) + channels_last_memory_format: BoolProperty(name="Channels Last Memory Format", description="An alternative way of ordering NCHW tensors that may be faster or slower depending on the device", default=False) + sdp_attention: BoolProperty( + name="SDP Attention", + description="Scaled dot product attention requires less memory and often comes with a good speed increase.\n" + "Prompt recall may not produce the exact same image, but usually only minor noise differences.\n" + "Overrides attention slicing", + default=True + ) + batch_size: IntProperty(name="Batch Size", default=1, min=1, description="Improves speed when using iterations or upscaling in exchange for higher memory usage.\nHighly recommended to use with VAE slicing enabled") + vae_slicing: BoolProperty(name="VAE Slicing", description="Reduces memory usage of batched VAE decoding. Has no effect if batch size is 1.\nMay have a small performance improvement with large batches", default=True) + vae_tiling: EnumProperty( + name="VAE Tiling", + items=( + ("off", "Off", "", 0), + ("half", "Half", "Uses tiles of half the selected model's default size. Likely to cause noticeably inaccurate colors", 1), + ("full", "Full", "Uses tiles of the selected model's default size, intended for use where image size is manually set higher. May cause slightly inaccurate colors", 2), + ("manual", "Manual", "", 3) + ), + default=0, + description="Decodes generated images in tiled regions to reduce memory usage in exchange for longer decode time and less accurate colors.\nCan allow for generating larger images that would otherwise run out of memory on the final step" + ) + vae_tile_size: IntProperty(name="VAE Tile Size", min=1, default=512, description="Width and height measurement of tiles. 
Smaller sizes are more likely to cause inaccurate colors and other undesired artifacts") + vae_tile_blend: IntProperty(name="VAE Tile Blend", min=0, default=64, description="Minimum amount of how much each edge of a tile will intersect its adjacent tile") + cfg_end: FloatProperty(name="CFG End", min=0, max=1, default=1, description="The percentage of steps to complete before disabling classifier-free guidance") + cpu_only: BoolProperty(name="CPU Only", default=False, description="Disables GPU acceleration and is extremely slow") + + use_sdxl_refiner: BoolProperty(name="Use SDXL Refiner", default=False, description="Provide a refiner model to run automatically after the initial generation") + sdxl_refiner_model: EnumProperty(name="SDXL Refiner Model", items=lambda self, context: _convert_models(self.list_models(context)), description="Specify which model to use as a refiner") + + def list_models(self, context): + def model_case(model, i): + return Model( + name=model.model_base.replace('models--', '').replace('--', '/'), + description=ModelType[model.model_type].name, + id=model.model_base.replace('models--', '').replace('--', '/') + ) + models = {} + for i, model in enumerate(context.preferences.addons[StableDiffusionPreferences.bl_idname].preferences.installed_models): + if model.model_type in {ModelType.CONTROL_NET.name, ModelType.UNKNOWN.name}: + continue + if model.model_type not in models: + models[model.model_type] = [model_case(model, i)] + else: + models[model.model_type].append(model_case(model, i)) + return reduce( + lambda a, b: a + [None] + sorted(b, key=lambda m: m.id), + [ + models[group] + for group in sorted(models.keys()) + ], + [] + ) + + def list_controlnet_models(self, context): + return [ + Model( + name=model.model_base.replace('models--', '').replace('--', '/'), + description="ControlNet", + id=model.model_base.replace('models--', '').replace('--', '/') + ) + for model in context.preferences.addons[StableDiffusionPreferences.bl_idname].preferences.installed_models + if model.model_type == ModelType.CONTROL_NET.name + ] + + def list_schedulers(self, context) -> List[str]: + return [scheduler.value for scheduler in Scheduler] + + def get_batch_size(self, context) -> int: + return self.batch_size + + def optimizations(self) -> Optimizations: + optimizations = Optimizations() + for prop in dir(self): + if hasattr(optimizations, prop) and not prop.startswith('__'): + setattr(optimizations, prop, getattr(self, prop)) + if self.attention_slice_size_src == 'auto': + optimizations.attention_slice_size = 'auto' + optimizations.cpu_offload = CPUOffload(optimizations.cpu_offload) + return optimizations + + def generate(self, arguments: GenerationArguments, step_callback: StepCallback, callback: Callback): + gen = Generator.shared() + common_kwargs = { + 'model': checkpoint_lookup.get(arguments.model.id), + 'scheduler': Scheduler(arguments.scheduler), + 'optimizations': self.optimizations(), + 'prompt': arguments.prompt.positive, + 'steps': arguments.steps, + 'width': arguments.size[0] if arguments.size is not None else None, + 'height': arguments.size[1] if arguments.size is not None else None, + 'seed': arguments.seed, + 'cfg_scale': arguments.guidance_scale, + 'use_negative_prompt': arguments.prompt.negative is not None, + 'negative_prompt': arguments.prompt.negative or "", + 'seamless_axes': arguments.seamless_axes, + 'iterations': arguments.iterations, + 'step_preview_mode': arguments.step_preview_mode, + + 'sdxl_refiner_model': 
(checkpoint_lookup.get(self.sdxl_refiner_model) if self.use_sdxl_refiner else None), + } + future: Future + match arguments.task: + case PromptToImage(): + if len(arguments.control_nets) > 0: + future = gen.control_net( + **common_kwargs, + control_net=[checkpoint_lookup.get(c.model) for c in arguments.control_nets], + control=[c.image for c in arguments.control_nets], + controlnet_conditioning_scale=[c.strength for c in arguments.control_nets], + image=None, + inpaint=False, + inpaint_mask_src='alpha', + text_mask='', + text_mask_confidence=1, + strength=1 + ) + else: + future = gen.prompt_to_image(**common_kwargs) + case Inpaint(image=image, fit=fit, strength=strength, mask_source=mask_source, mask_prompt=mask_prompt, confidence=confidence): + if len(arguments.control_nets) > 0: + future = gen.control_net( + **common_kwargs, + control_net=[c.model for c in arguments.control_nets], + control=[c.image for c in arguments.control_nets], + controlnet_conditioning_scale=[c.strength for c in arguments.control_nets], + image=image, + inpaint=True, + inpaint_mask_src='alpha' if mask_source == Inpaint.MaskSource.ALPHA else 'prompt', + text_mask=mask_prompt, + text_mask_confidence=confidence, + strength=strength + ) + else: + future = gen.inpaint( + image=image, + fit=fit, + strength=strength, + inpaint_mask_src='alpha' if mask_source == Inpaint.MaskSource.ALPHA else 'prompt', + text_mask=mask_prompt, + text_mask_confidence=confidence, + **common_kwargs + ) + case ImageToImage(image=image, strength=strength, fit=fit): + if len(arguments.control_nets) > 0: + future = gen.control_net( + **common_kwargs, + control_net=[c.model for c in arguments.control_nets], + control=[c.image for c in arguments.control_nets], + controlnet_conditioning_scale=[c.strength for c in arguments.control_nets], + image=image, + inpaint=False, + inpaint_mask_src='alpha', + text_mask='', + text_mask_confidence=1, + strength=strength + ) + else: + future = gen.image_to_image(image=image, fit=fit, strength=strength, **common_kwargs) + case DepthToImage(depth=depth, image=image, strength=strength): + future = gen.depth_to_image( + depth=depth, + image=image, + strength=strength, + **common_kwargs + ) + case Outpaint(image=image, origin=origin): + future = gen.outpaint( + image=image, + outpaint_origin=origin, + fit=False, + strength=1, + inpaint_mask_src='alpha', + text_mask='', + text_mask_confidence=1, + **common_kwargs + ) + case Upscale(image=image, tile_size=tile_size, blend=blend): + future = gen.upscale( + image=image, + tile_size=tile_size, + blend=blend, + **common_kwargs + ) + case _: + raise NotImplementedError() + def on_step(future: Future, step_image: ImageGenerationResult): + if len(step_image.images) == 0: + results = [(GenerationResult(progress=step_image.step, total=step_image.total or arguments.steps, seed=step_image.seeds[-1]))] + else: + results = [ + GenerationResult(progress=step_image.step, total=step_image.total or arguments.steps, image=step_image.images[i], seed=step_image.seeds[i]) + for i in range(len(step_image.images)) + ] + should_continue = step_callback(results) + if not should_continue: + future.cancel() + callback(InterruptedError()) + def on_done(future: Future): + result: ImageGenerationResult = future.result(last_only=True) + callback([ + GenerationResult(progress=result.step, total=arguments.steps, image=result.images[i], seed=result.seeds[i]) + for i in range(len(result.images)) + ]) + def on_exception(_, exception): + callback(exception) + future.add_response_callback(on_step) + 
future.add_exception_callback(on_exception) + future.add_done_callback(on_done) + + def validate(self, arguments: GenerationArguments): + model = model_lookup.get(arguments.model.id) + if model is None: + raise FixItError("No model selected.", FixItError.ChangeProperty("model")) + else: + if not model.model_type.matches_task(arguments.task): + class DownloadModel(FixItError.Solution): + def _draw(self, dream_prompt, context, layout): + if not _template_model_download_progress(context, layout): + target_model_type = ModelType.from_task(arguments.task) + if target_model_type is not None: + install_model = layout.operator(InstallModel.bl_idname, text=f"Download {target_model_type.recommended_model()} (Recommended)", icon="IMPORT") + install_model.model = target_model_type.recommended_model() + install_model.prefer_fp16_revision = context.preferences.addons[StableDiffusionPreferences.bl_idname].preferences.prefer_fp16_revision + model_task_description = f"""Incorrect model type selected for {type(arguments.task).name().replace('_', ' ').lower()} tasks. +The selected model is for {model.model_type.name.replace('_', ' ').lower()}.""" + if not any(m.model_type.matches_task(arguments.task) for m in model_lookup._models.values()): + raise FixItError( + message=model_task_description + "\nYou do not have any compatible models downloaded:", + solution=DownloadModel() + ) + else: + raise FixItError( + message=model_task_description + "\nSelect a different model below.", + solution=FixItError.ChangeProperty("model") + ) + + def draw_advanced(self, layout, context): + layout.prop(self, "use_sdxl_refiner") + col = layout.column() + col.enabled = self.use_sdxl_refiner + col.prop(self, "sdxl_refiner_model") + + def draw_speed_optimizations(self, layout, context): + inferred_device = Optimizations.infer_device() + if self.cpu_only: + inferred_device = "cpu" + def optimization(prop): + if Optimizations.device_supports(prop, inferred_device): + layout.prop(self, prop) + + optimization("cudnn_benchmark") + optimization("tf32") + optimization("half_precision") + optimization("channels_last_memory_format") + optimization("batch_size") + + def draw_memory_optimizations(self, layout, context): + inferred_device = Optimizations.infer_device() + if self.cpu_only: + inferred_device = "cpu" + def optimization(prop): + if Optimizations.device_supports(prop, inferred_device): + layout.prop(self, prop) + + optimization("sdp_attention") + optimization("attention_slicing") + slice_size_row = layout.row() + slice_size_row.prop(self, "attention_slice_size_src") + if self.attention_slice_size_src == 'manual': + slice_size_row.prop(self, "attention_slice_size", text="Size") + optimization("cpu_offload") + optimization("cpu_only") + optimization("vae_slicing") + optimization("vae_tiling") + if self.vae_tiling == "manual": + optimization("vae_tile_size") + optimization("vae_tile_blend") \ No newline at end of file diff --git a/engine/engine.py b/engine/engine.py index dd376f65..33a2136b 100644 --- a/engine/engine.py +++ b/engine/engine.py @@ -8,6 +8,8 @@ from .node_tree import DreamTexturesNodeTree from ..engine import node_executor from .annotations import depth +from ..property_groups.dream_prompt import backend_options +from .nodes.pipeline_nodes import NodeStableDiffusion class DreamTexturesRenderEngine(bpy.types.RenderEngine): """A custom Dream Textures render engine, that uses Stable Diffusion and scene data to render images, instead of as a pass on top of Cycles.""" @@ -96,11 +98,21 @@ def draw_device(self, context): if 
context.engine == DreamTexturesRenderEngine.bl_idname: layout.template_ID(scene.dream_textures_render_engine, "node_tree", text="Node Tree", new=NewEngineNodeTree.bl_idname) + layout.prop(scene.dream_textures_render_engine, "backend") def _poll_node_tree(self, value): return value.bl_idname == "DreamTexturesNodeTree" + +def _update_engine_backend(self, context): + if self.node_tree is not None: + for node in self.node_tree.nodes: + if node.bl_idname == NodeStableDiffusion.bl_idname: + node.prompt.backend = self.backend + context.scene.dream_textures_engine_prompt.backend = self.backend + class DreamTexturesRenderEngineProperties(bpy.types.PropertyGroup): node_tree: bpy.props.PointerProperty(type=DreamTexturesNodeTree, name="Node Tree", poll=_poll_node_tree) + backend: bpy.props.EnumProperty(name="Backend", items=backend_options, default=1, description="The backend to use for all pipeline nodes", update=_update_engine_backend) def engine_panels(): bpy.types.RENDER_PT_output.COMPAT_ENGINES.add(DreamTexturesRenderEngine.bl_idname) diff --git a/engine/node_executor.py b/engine/node_executor.py index 8cf3e4f4..f113c769 100644 --- a/engine/node_executor.py +++ b/engine/node_executor.py @@ -1,4 +1,4 @@ -import graphlib +import bpy # from dream_textures.engine import node_executor # node_executor.execute(bpy.data.node_groups["NodeTree"], bpy.context.evaluated_depsgraph_get()) @@ -10,6 +10,7 @@ def __init__(self, depsgraph, start, update, end, test_break, cache={}): self.end = end self.test_break = test_break self.cache = {} + self.preferences = bpy.context.preferences def _evaluate_input(self, input): if input.is_linked: diff --git a/engine/nodes/pipeline_nodes.py b/engine/nodes/pipeline_nodes.py index f7ff5612..ea37f2e4 100644 --- a/engine/nodes/pipeline_nodes.py +++ b/engine/nodes/pipeline_nodes.py @@ -1,7 +1,7 @@ import bpy import numpy as np from dataclasses import dataclass -from typing import Any +from typing import Any, List import enum from ..node import DreamTexturesNode from ...generator_process import Generator @@ -11,6 +11,8 @@ from ..annotations import depth from ..annotations import normal from ..annotations import ade20k +from ... import api +from ...property_groups.seamless_result import SeamlessAxes import threading class NodeSocketControlNet(bpy.types.NodeSocket): @@ -73,6 +75,9 @@ class NodeStableDiffusion(DreamTexturesNode): ('inpaint', 'Inpaint', '', 4), ), update=_update_stable_diffusion_sockets) + def update(self): + self.prompt.backend = bpy.context.scene.dream_textures_render_engine.backend + def init(self, context): self.inputs.new("NodeSocketColor", "Depth Map") self.inputs.new("NodeSocketColor", "Source Image") @@ -97,180 +102,78 @@ def init(self, context): def draw_buttons(self, context, layout): layout.prop(self, "task") prompt = self.prompt - layout.prop(prompt, "pipeline", text="") layout.prop(prompt, "model", text="") layout.prop(prompt, "scheduler", text="") layout.prop(prompt, "seamless_axes", text="") def execute(self, context, prompt, negative_prompt, width, height, steps, seed, cfg_scale, controlnets, depth_map, source_image, noise_strength): - self.prompt.use_negative_prompt = True - self.prompt.negative_prompt = negative_prompt - self.prompt.steps = steps - self.prompt.seed = str(seed) - self.prompt.cfg_scale = cfg_scale - args = self.prompt.generate_args() - - shared_args = context.depsgraph.scene.dream_textures_engine_prompt.generate_args() - - # the source image is a default color, ignore it. 
- if np.array(source_image).shape == (4,): - source_image = None - - if controlnets is not None: - if not isinstance(controlnets, list): - controlnets = [controlnets] - future = Generator.shared().control_net( - pipeline=args['pipeline'], - model=args['model'], - scheduler=args['scheduler'], - optimizations=shared_args['optimizations'], - seamless_axes=args['seamless_axes'], - iterations=args['iterations'], - step_preview_mode=args['step_preview_mode'], - - control_net=[c.model for c in controlnets], - control=[c.control(context.depsgraph) for c in controlnets], - controlnet_conditioning_scale=[c.conditioning_scale for c in controlnets], - - image=np.flipud(np.uint8(source_image * 255)) if self.task in {'image_to_image', 'inpaint'} else None, - strength=noise_strength, + backend: api.Backend = self.prompt.get_backend() - inpaint=self.task == 'inpaint', - inpaint_mask_src='alpha', - text_mask='', - text_mask_confidence=1, - - prompt=prompt, - steps=steps, - seed=seed, - width=width, - height=height, - cfg_scale=cfg_scale, - use_negative_prompt=True, - negative_prompt=negative_prompt - ) - else: + def get_task(): match self.task: case 'prompt_to_image': - future = Generator.shared().prompt_to_image( - pipeline=args['pipeline'], - model=args['model'], - scheduler=args['scheduler'], - optimizations=shared_args['optimizations'], - seamless_axes=args['seamless_axes'], - iterations=args['iterations'], - step_preview_mode=args['step_preview_mode'], - prompt=prompt, - steps=steps, - seed=seed, - width=width, - height=height, - cfg_scale=cfg_scale, - use_negative_prompt=True, - negative_prompt=negative_prompt - ) + return api.PromptToImage() case 'image_to_image': - future = Generator.shared().image_to_image( - pipeline=args['pipeline'], - model=args['model'], - scheduler=args['scheduler'], - optimizations=shared_args['optimizations'], - seamless_axes=args['seamless_axes'], - iterations=args['iterations'], - step_preview_mode=args['step_preview_mode'], - - image=np.flipud(np.uint8(source_image * 255)), - strength=noise_strength, - fit=True, - - prompt=prompt, - steps=steps, - seed=seed, - width=width, - height=height, - cfg_scale=cfg_scale, - use_negative_prompt=True, - negative_prompt=negative_prompt - ) + return api.ImageToImage(source_image, noise_strength, fit=False) case 'depth_to_image': - future = Generator.shared().depth_to_image( - pipeline=args['pipeline'], - model=args['model'], - scheduler=args['scheduler'], - optimizations=shared_args['optimizations'], - seamless_axes=args['seamless_axes'], - iterations=args['iterations'], - step_preview_mode=args['step_preview_mode'], - - depth=depth_map, - image=np.flipud(np.uint8(source_image * 255)) if source_image is not None else None, - strength=noise_strength, - - prompt=prompt, - steps=steps, - seed=seed, - width=width, - height=height, - cfg_scale=cfg_scale, - use_negative_prompt=True, - negative_prompt=negative_prompt - ) + return api.DepthToImage(depth_map, source_image, noise_strength) case 'inpaint': - future = Generator.shared().inpaint( - pipeline=args['pipeline'], - model=args['model'], - scheduler=args['scheduler'], - optimizations=shared_args['optimizations'], - seamless_axes=args['seamless_axes'], - iterations=args['iterations'], - step_preview_mode=args['step_preview_mode'], - - image=np.flipud(np.uint8(source_image * 255)), - strength=noise_strength, - - fit=args['fit'], - inpaint_mask_src='alpha', - text_mask='', - text_mask_confidence=1, + return api.Inpaint(source_image, noise_strength, fit=False, 
mask_source=api.Inpaint.MaskSource.ALPHA, mask_prompt="", confidence=0) + + def map_controlnet(c): + return api.models.control_net.ControlNet(c.model, c.control(context.depsgraph), c.conditioning_scale) + + args = api.GenerationArguments( + get_task(), + model=next(model for model in self.prompt.get_backend().list_models(context) if model is not None and model.id == self.prompt.model), + prompt=api.Prompt( + prompt, + negative_prompt + ), + size=(width, height), + seed=seed, + steps=steps, + guidance_scale=cfg_scale, + scheduler=self.prompt.scheduler, + seamless_axes=SeamlessAxes(self.prompt.seamless_axes), + step_preview_mode=api.models.StepPreviewMode.FAST, + iterations=1, + control_nets=[map_controlnet(c) for c in controlnets] if isinstance(controlnets, list) else ([map_controlnet(controlnets)] if controlnets is not None else []) + ) - prompt=prompt, - steps=steps, - seed=seed, - width=width, - height=height, - cfg_scale=cfg_scale, - use_negative_prompt=True, - negative_prompt=negative_prompt - ) + # the source image is a default color, ignore it. + if np.array(source_image).shape == (4,): + source_image = None + event = threading.Event() result = None exception = None - def on_response(_, response): - context.update(response.images[0]) - if context.test_break(): + def step_callback(progress: List[api.GenerationResult]) -> bool: + context.update(progress[-1].image) + return True + # if context.test_break(): + # nonlocal result + # result = [response] + # event.set() + + def callback(results: List[api.GenerationResult] | Exception): + if isinstance(results, Exception): + nonlocal exception + exception = results + event.set() + else: nonlocal result - future.cancel() - result = [response] + result = results[-1].image event.set() - - def on_done(future): - nonlocal result - result = future.result() - event.set() - - def on_exception(_, error): - nonlocal exception - exception = error - event.set() - future.add_response_callback(on_response) - future.add_done_callback(on_done) - future.add_exception_callback(on_exception) + backend = self.prompt.get_backend() + backend.generate(args, step_callback=step_callback, callback=callback) + event.wait() if exception is not None: raise exception return { - 'Image': result[-1].images[-1] + 'Image': result } def _update_control_net_sockets(self, context): diff --git a/generator_process/__init__.py b/generator_process/__init__.py index 52268c36..46c544a2 100644 --- a/generator_process/__init__.py +++ b/generator_process/__init__.py @@ -5,7 +5,9 @@ class Generator(Actor): The actor used for all background processes. """ - from .actions.prompt_to_image import prompt_to_image, choose_device + from .actions.choose_device import choose_device + from .actions.load_model import load_model + from .actions.prompt_to_image import prompt_to_image from .actions.image_to_image import image_to_image from .actions.inpaint import inpaint from .actions.outpaint import outpaint diff --git a/generator_process/actions/choose_device.py b/generator_process/actions/choose_device.py new file mode 100644 index 00000000..27f5054a --- /dev/null +++ b/generator_process/actions/choose_device.py @@ -0,0 +1,21 @@ +import sys + +def choose_device(self, optimizations) -> str: + """ + Automatically select which PyTorch device to use. 
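+
+    Returns "cpu" when CPU only is enabled, otherwise the first available of "cuda", "mps", or "dml", falling back to "cpu".
+    The bundled actions call it as, for example:
+
+    ```python
+    device = self.choose_device(optimizations)
+    ```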
+ """ + if optimizations.cpu_only: + return "cpu" + + import torch + + if torch.cuda.is_available(): + return "cuda" + elif torch.backends.mps.is_available(): + return "mps" + if 'torch_directml' in sys.modules: + import torch_directml + if torch_directml.is_available(): + torch.utils.rename_privateuse1_backend("dml") + return "dml" + return "cpu" \ No newline at end of file diff --git a/generator_process/actions/control_net.py b/generator_process/actions/control_net.py index 81cf9eae..9bedacde 100644 --- a/generator_process/actions/control_net.py +++ b/generator_process/actions/control_net.py @@ -3,22 +3,27 @@ from numpy.typing import NDArray import numpy as np +import logging +import os import random -from .prompt_to_image import Scheduler, Optimizations, StepPreviewMode, ImageGenerationResult, _configure_model_padding, model_snapshot_folder, load_pipe -from ..models import Pipeline -from .detect_seamless import SeamlessAxes +from .prompt_to_image import Checkpoint, Scheduler, Optimizations, StepPreviewMode, ImageGenerationResult, _configure_model_padding +from ...api.models.seamless_axes import SeamlessAxes +from ..future import Future + + +logger = logging.getLogger(__name__) + def control_net( self, - pipeline: Pipeline, - - model: str, - scheduler: Scheduler, + model: str | Checkpoint, + + scheduler: str | Scheduler, optimizations: Optimizations, - control_net: list[str], + control_net: list[str | Checkpoint], control: list[NDArray] | None, controlnet_conditioning_scale: list[float], @@ -46,497 +51,139 @@ def control_net( step_preview_mode: StepPreviewMode, **kwargs -) -> Generator[NDArray, None, None]: - match pipeline: - case Pipeline.STABLE_DIFFUSION: - import diffusers - from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_controlnet import MultiControlNetModel, ControlNetModel - from diffusers.utils import deprecate, randn_tensor - import torch - import PIL.Image - import PIL.ImageOps - - class GeneratorPipeline(diffusers.StableDiffusionControlNetPipeline): - # copied from diffusers.StableDiffusionImg2ImgPipeline - def get_timesteps(self, num_inference_steps, strength, device): - # get the original timestep using init_timestep - init_timestep = min(int(num_inference_steps * strength), num_inference_steps) - - t_start = max(num_inference_steps - init_timestep, 0) - timesteps = self.scheduler.timesteps[t_start:] - - return timesteps, num_inference_steps - t_start - - # copied from diffusers.StableDiffusionImg2ImgPipeline - def prepare_img2img_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None): - if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)): - raise ValueError( - f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}" - ) - - image = image.to(device=device, dtype=dtype) - - batch_size = batch_size * num_images_per_prompt - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
- ) - - if isinstance(generator, list): - init_latents = [ - self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size) - ] - init_latents = torch.cat(init_latents, dim=0) - else: - init_latents = self.vae.encode(image).latent_dist.sample(generator) - - init_latents = self.vae.config.scaling_factor * init_latents - - if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: - # expand init_latents for batch_size - deprecation_message = ( - f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial" - " images (`image`). Initial images are now duplicating to match the number of text prompts. Note" - " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update" - " your script to pass as many initial images as text prompts to suppress this warning." - ) - deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False) - additional_image_per_prompt = batch_size // init_latents.shape[0] - init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0) - elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: - raise ValueError( - f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." - ) - else: - init_latents = torch.cat([init_latents], dim=0) - - shape = init_latents.shape - noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) - - # get latents - init_latents = self.scheduler.add_noise(init_latents, noise, timestep) - latents = init_latents - - return latents - - # copied from diffusers.StableDiffusionInpaintPipeline - def prepare_mask_latents( - self, mask, masked_image, batch_size, height, width, dtype, device, generator, do_classifier_free_guidance - ): - # resize the mask to latents shape as we concatenate the mask to the latents - # we do that before converting to dtype to avoid breaking in case we're using cpu_offload - # and half precision - mask = torch.nn.functional.interpolate( - mask, size=(height // self.vae_scale_factor, width // self.vae_scale_factor) - ) - mask = mask.to(device=device, dtype=dtype) - - masked_image = masked_image.to(device=device, dtype=dtype) - - # encode the mask image into latents space so we can concatenate it to the latents - if isinstance(generator, list): - masked_image_latents = [ - self.vae.encode(masked_image[i : i + 1]).latent_dist.sample(generator=generator[i]) - for i in range(batch_size) - ] - masked_image_latents = torch.cat(masked_image_latents, dim=0) - else: - masked_image_latents = self.vae.encode(masked_image).latent_dist.sample(generator=generator) - masked_image_latents = self.vae.config.scaling_factor * masked_image_latents - - # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method - if mask.shape[0] < batch_size: - if not batch_size % mask.shape[0] == 0: - raise ValueError( - "The passed mask and the required batch size don't match. Masks are supposed to be duplicated to" - f" a total batch size of {batch_size}, but {mask.shape[0]} masks were passed. Make sure the number" - " of masks that you pass is divisible by the total requested batch size." - ) - mask = mask.repeat(batch_size // mask.shape[0], 1, 1, 1) - if masked_image_latents.shape[0] < batch_size: - if not batch_size % masked_image_latents.shape[0] == 0: - raise ValueError( - "The passed images and the required batch size don't match. 
Images are supposed to be duplicated" - f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed." - " Make sure the number of images that you pass is divisible by the total requested batch size." - ) - masked_image_latents = masked_image_latents.repeat(batch_size // masked_image_latents.shape[0], 1, 1, 1) - - mask = torch.cat([mask] * 2) if do_classifier_free_guidance else mask - masked_image_latents = ( - torch.cat([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents - ) - - # aligning device to prevent device errors when concating it with the latent model input - masked_image_latents = masked_image_latents.to(device=device, dtype=dtype) - return mask, masked_image_latents - - @torch.no_grad() - def __call__( - self, - prompt: Union[str, List[str]] = None, - image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]] = None, - - # NOTE: Modified to support initial image and inpaint. - init_image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]] = None, - strength: float = 1.0, - mask: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]] = None, - - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 50, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, - callback_steps: int = 1, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - controlnet_conditioning_scale: Union[float, List[float]] = 1.0, - - **kwargs - ): - # 0. Default height and width to unet - height, width = self._default_height_width(height, width, image) - - # 1. Check inputs. 
Raise error if not correct - self.check_inputs( - prompt, - image, - height, - width, - callback_steps, - negative_prompt, - prompt_embeds, - negative_prompt_embeds, - controlnet_conditioning_scale +) -> Generator[Future, None, None]: + future = Future() + yield future + + import diffusers + import torch + import PIL.Image + import PIL.ImageOps + + device = self.choose_device(optimizations) + + # StableDiffusionPipeline w/ caching + if image is not None: + if inpaint: + pipe = self.load_model(diffusers.AutoPipelineForInpainting, model, optimizations, scheduler, controlnet=control_net) + else: + pipe = self.load_model(diffusers.AutoPipelineForImage2Image, model, optimizations, scheduler, controlnet=control_net) + else: + pipe = self.load_model(diffusers.AutoPipelineForText2Image, model, optimizations, scheduler, controlnet=control_net) + + # Optimizations + pipe = optimizations.apply(pipe, device) + + # RNG + batch_size = len(prompt) if isinstance(prompt, list) else 1 + generator = [] + for _ in range(batch_size): + gen = torch.Generator(device="cpu" if device in ("mps", "dml") else device) # MPS and DML do not support the `Generator` API + generator.append(gen.manual_seed(random.randrange(0, np.iinfo(np.uint32).max) if seed is None else seed)) + if batch_size == 1: + # Some schedulers don't handle a list of generators: https://github.com/huggingface/diffusers/issues/1909 + generator = generator[0] + + # Init Image + # FIXME: The `unet.config.sample_size` of the depth model is `32`, not `64`. For now, this will be hardcoded to `512`. + height = height or 512 + width = width or 512 + rounded_size = ( + int(8 * (width // 8)), + int(8 * (height // 8)), + ) + control_image = [PIL.Image.fromarray(np.uint8(c * 255)).convert('RGB').resize(rounded_size) for c in control] if control is not None else None + init_image = None if image is None else (PIL.Image.open(image) if isinstance(image, str) else PIL.Image.fromarray(image.astype(np.uint8))).resize(rounded_size) + if inpaint: + match inpaint_mask_src: + case 'alpha': + mask_image = PIL.ImageOps.invert(init_image.getchannel('A')) + case 'prompt': + from transformers import AutoProcessor, CLIPSegForImageSegmentation + + processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined") + clipseg = CLIPSegForImageSegmentation.from_pretrained("CIDAS/clipseg-rd64-refined") + inputs = processor(text=[text_mask], images=[init_image.convert('RGB')], return_tensors="pt", padding=True) + outputs = clipseg(**inputs) + mask_image = PIL.Image.fromarray(np.uint8((1 - torch.sigmoid(outputs.logits).lt(text_mask_confidence).int().detach().numpy()) * 255), 'L').resize(init_image.size) + else: + mask_image = None + + # Seamless + if seamless_axes == SeamlessAxes.AUTO: + init_sa = None if init_image is None else self.detect_seamless(np.array(init_image) / 255) + control_sa = None if control_image is None else self.detect_seamless(np.array(control_image[0]) / 255) + if init_sa is not None and control_sa is not None: + seamless_axes = SeamlessAxes((init_sa.x and control_sa.x, init_sa.y and control_sa.y)) + elif init_sa is not None: + seamless_axes = init_sa + elif control_sa is not None: + seamless_axes = control_sa + _configure_model_padding(pipe.unet, seamless_axes) + _configure_model_padding(pipe.vae, seamless_axes) + + # Inference + with (torch.inference_mode() if device not in ('mps', "dml") else nullcontext()), \ + (torch.autocast(device) if optimizations.can_use("amp", device) else nullcontext()): + def callback(step, timestep, latents): + if 
future.check_cancelled(): + raise InterruptedError() + future.add_response(ImageGenerationResult.step_preview(self, step_preview_mode, width, height, latents, generator, step)) + try: + if init_image is not None: + if mask_image is not None: + result = pipe( + prompt=prompt, + negative_prompt=negative_prompt if use_negative_prompt else None, + control_image=control_image, + controlnet_conditioning_scale=controlnet_conditioning_scale, + image=init_image.convert('RGB'), + mask_image=mask_image, + strength=strength, + width=rounded_size[0], + height=rounded_size[1], + num_inference_steps=steps, + guidance_scale=cfg_scale, + generator=generator, + callback=callback ) - - # 2. Define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - device = self._execution_device - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - if isinstance(self.controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float): - controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(self.controlnet.nets) - - # 3. Encode input prompt - prompt_embeds = self._encode_prompt( - prompt, - device, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, + else: + result = pipe( + prompt=prompt, + negative_prompt=negative_prompt if use_negative_prompt else None, + control_image=control_image, + controlnet_conditioning_scale=controlnet_conditioning_scale, + image=init_image.convert('RGB'), + strength=strength, + width=rounded_size[0], + height=rounded_size[1], + num_inference_steps=steps, + guidance_scale=cfg_scale, + generator=generator, + callback=callback ) - - # 4. Prepare image - if isinstance(self.controlnet, ControlNetModel): - image = self.prepare_image( - image=image, - width=width, - height=height, - batch_size=batch_size * num_images_per_prompt, - num_images_per_prompt=num_images_per_prompt, - device=device, - dtype=self.controlnet.dtype, - do_classifier_free_guidance=do_classifier_free_guidance, - guess_mode=False - ) - elif isinstance(self.controlnet, MultiControlNetModel): - images = [] - - for image_ in image: - image_ = self.prepare_image( - image=image_, - width=width, - height=height, - batch_size=batch_size * num_images_per_prompt, - num_images_per_prompt=num_images_per_prompt, - device=device, - dtype=self.controlnet.dtype, - do_classifier_free_guidance=do_classifier_free_guidance, - guess_mode=False - ) - - images.append(image_) - - image = images - else: - assert False - - # 5. Prepare timesteps - # NOTE: Modified to support initial image - if init_image is not None and not inpaint: - self.scheduler.set_timesteps(num_inference_steps, device=device) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device) - latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) - else: - self.scheduler.set_timesteps(num_inference_steps, device=device) - timesteps = self.scheduler.timesteps - - # 6. 
Prepare latent variables - num_channels_latents = self.unet.in_channels - # NOTE: Modified to support initial image - if mask is not None: - num_channels_latents = self.vae.config.latent_channels - mask, masked_image = diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint.prepare_mask_and_masked_image(init_image, mask) - mask, masked_image_latents = self.prepare_mask_latents( - mask, - masked_image, - batch_size * num_images_per_prompt, - height, - width, - prompt_embeds.dtype, - device, - generator, - do_classifier_free_guidance, - ) - num_channels_mask = mask.shape[1] - num_channels_masked_image = masked_image_latents.shape[1] - if num_channels_latents + num_channels_mask + num_channels_masked_image != self.unet.config.in_channels: - raise ValueError( - f"Select an inpainting model, such as 'stabilityai/stable-diffusion-2-inpainting'" - ) - latents = self.prepare_latents( - batch_size * num_images_per_prompt, - num_channels_latents, - height, - width, - prompt_embeds.dtype, - device, - generator, - latents, - ) - elif init_image is not None: - init_image = diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess(init_image) - latents = self.prepare_img2img_latents( - init_image, - latent_timestep, - batch_size, - num_images_per_prompt, - prompt_embeds.dtype, - device, - generator - ) - else: - latents = self.prepare_latents( - batch_size * num_images_per_prompt, - num_channels_latents, - height, - width, - prompt_embeds.dtype, - device, - generator, - latents, - ) - - # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 8. Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - # NOTE: Modified to support disabling CFG - if do_classifier_free_guidance and (i / len(timesteps)) >= kwargs['cfg_end']: - do_classifier_free_guidance = False - prompt_embeds = prompt_embeds[prompt_embeds.size(0) // 2:] - image = [i[i.size(0) // 2:] for i in image] - if mask is not None: - mask = mask[mask.size(0) // 2:] - masked_image_latents = masked_image_latents[masked_image_latents.size(0) // 2:] - # expand the latents if we are doing classifier free guidance - latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - # controlnet(s) inference - down_block_res_samples, mid_block_res_sample = self.controlnet( - latent_model_input, - t, - encoder_hidden_states=prompt_embeds, - controlnet_cond=image, - conditioning_scale=controlnet_conditioning_scale, - return_dict=False, - ) - - if mask is not None: - latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1) - - # predict the noise residual - noise_pred = self.unet( - latent_model_input, - t, - encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, - down_block_additional_residuals=down_block_res_samples, - mid_block_additional_residual=mid_block_res_sample, - ).sample - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, 
**extra_step_kwargs).prev_sample - - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - # NOTE: Modified to yield the latents instead of calling a callback. - yield ImageGenerationResult.step_preview(self, kwargs['step_preview_mode'], width, height, latents, generator, i) - - # If we do sequential model offloading, let's offload unet and controlnet - # manually for max memory savings - if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: - self.unet.to("cpu") - self.controlnet.to("cpu") - torch.cuda.empty_cache() - - if output_type == "latent": - image = latents - has_nsfw_concept = None - elif output_type == "pil": - # 8. Post-processing - image = self.decode_latents(latents) - - # NOTE: Add UI to enable this. - # 9. Run safety checker - # image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) - - # 10. Convert to PIL - image = self.numpy_to_pil(image) - else: - # 8. Post-processing - image = self.decode_latents(latents) - - # NOTE: Add UI to enable this. - # 9. Run safety checker - # image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) - - # Offload last model to CPU - if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: - self.final_offload_hook.offload() - - # NOTE: Modified to yield the decoded image as a numpy array. - yield ImageGenerationResult( - [np.asarray(PIL.ImageOps.flip(image).convert('RGBA'), dtype=np.float32) / 255. - for i, image in enumerate(image)], - [gen.initial_seed() for gen in generator] if isinstance(generator, list) else [generator.initial_seed()], - num_inference_steps, - True - ) - - if optimizations.cpu_only: - device = "cpu" - else: - device = self.choose_device() - - # Load the ControlNet model - controlnet = [] - for controlnet_name in control_net: - controlnet.append(load_pipe(self, f"control_net_model-{controlnet_name}", diffusers.ControlNetModel, controlnet_name, optimizations, None, device)) - controlnet = MultiControlNetModel(controlnet) - - # StableDiffusionPipeline w/ caching - pipe = load_pipe(self, "control_net", GeneratorPipeline, model, optimizations, scheduler, device, controlnet=controlnet) - - # Optimizations - pipe = optimizations.apply(pipe, device) - - # RNG - batch_size = len(prompt) if isinstance(prompt, list) else 1 - generator = [] - for _ in range(batch_size): - gen = torch.Generator(device="cpu" if device in ("mps", "dml") else device) # MPS and DML do not support the `Generator` API - generator.append(gen.manual_seed(random.randrange(0, np.iinfo(np.uint32).max) if seed is None else seed)) - if batch_size == 1: - # Some schedulers don't handle a list of generators: https://github.com/huggingface/diffusers/issues/1909 - generator = generator[0] - - # Init Image - # FIXME: The `unet.config.sample_size` of the depth model is `32`, not `64`. For now, this will be hardcoded to `512`. 
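Both the removed `GeneratorPipeline` path and the replacement code round the requested resolution before building images, because Stable Diffusion's VAE downsamples by a factor of 8 and the latents must cover whole 8×8 pixel blocks. A minimal sketch of that rounding, under a hypothetical helper name (`round_to_multiple_of_8` does not appear in the patch):

```python
# Sketch only: snap requested dimensions down to the nearest multiple of 8, as the
# `rounded_size` expression below does, so width/height divide evenly by the VAE scale factor.
def round_to_multiple_of_8(width: int, height: int) -> tuple[int, int]:
    return (8 * (width // 8), 8 * (height // 8))

assert round_to_multiple_of_8(513, 769) == (512, 768)
assert round_to_multiple_of_8(512, 512) == (512, 512)
```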
- height = height or 512 - width = width or 512 - rounded_size = ( - int(8 * (width // 8)), - int(8 * (height // 8)), - ) - control_image = [PIL.Image.fromarray(np.uint8(c * 255)).convert('RGB').resize(rounded_size) for c in control] if control is not None else None - init_image = None if image is None else (PIL.Image.open(image) if isinstance(image, str) else PIL.Image.fromarray(image.astype(np.uint8))).resize(rounded_size) - if inpaint: - match inpaint_mask_src: - case 'alpha': - mask_image = PIL.ImageOps.invert(init_image.getchannel('A')) - case 'prompt': - from transformers import AutoProcessor, CLIPSegForImageSegmentation - - processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined") - clipseg = CLIPSegForImageSegmentation.from_pretrained("CIDAS/clipseg-rd64-refined") - inputs = processor(text=[text_mask], images=[init_image.convert('RGB')], return_tensors="pt", padding=True) - outputs = clipseg(**inputs) - mask_image = PIL.Image.fromarray(np.uint8((1 - torch.sigmoid(outputs.logits).lt(text_mask_confidence).int().detach().numpy()) * 255), 'L').resize(init_image.size) else: - mask_image = None - - # Seamless - if seamless_axes == SeamlessAxes.AUTO: - init_sa = None if init_image is None else self.detect_seamless(np.array(init_image) / 255) - control_sa = None if control_image is None else self.detect_seamless(np.array(control_image[0]) / 255) - if init_sa is not None and control_sa is not None: - seamless_axes = SeamlessAxes((init_sa.x and control_sa.x, init_sa.y and control_sa.y)) - elif init_sa is not None: - seamless_axes = init_sa - elif control_sa is not None: - seamless_axes = control_sa - _configure_model_padding(pipe.unet, seamless_axes) - _configure_model_padding(pipe.vae, seamless_axes) - - # Inference - with (torch.inference_mode() if device not in ('mps', "dml") else nullcontext()), \ - (torch.autocast(device) if optimizations.can_use("amp", device) else nullcontext()): - yield from pipe( + result = pipe( prompt=prompt, + negative_prompt=negative_prompt if use_negative_prompt else None, image=control_image, controlnet_conditioning_scale=controlnet_conditioning_scale, - init_image=init_image.convert('RGB') if init_image is not None else None, - mask=mask_image, - strength=strength, width=rounded_size[0], height=rounded_size[1], num_inference_steps=steps, guidance_scale=cfg_scale, - negative_prompt=negative_prompt if use_negative_prompt else None, - num_images_per_prompt=1, - eta=0.0, generator=generator, - latents=None, - output_type="pil", - return_dict=True, - callback=None, - callback_steps=1, - step_preview_mode=step_preview_mode, - cfg_end=optimizations.cfg_end + callback=callback ) - case Pipeline.STABILITY_SDK: - import stability_sdk - raise NotImplementedError() - case _: - raise Exception(f"Unsupported pipeline {pipeline}.") \ No newline at end of file + + future.add_response(ImageGenerationResult( + [np.asarray(PIL.ImageOps.flip(image).convert('RGBA'), dtype=np.float32) / 255. 
+ for image in result.images], + [gen.initial_seed() for gen in generator] if isinstance(generator, list) else [generator.initial_seed()], + steps, + True + )) + except InterruptedError: + pass + + future.set_done() \ No newline at end of file diff --git a/generator_process/actions/convert_original_stable_diffusion_to_diffusers.py b/generator_process/actions/convert_original_stable_diffusion_to_diffusers.py index 4f137b0a..cde41f97 100644 --- a/generator_process/actions/convert_original_stable_diffusion_to_diffusers.py +++ b/generator_process/actions/convert_original_stable_diffusion_to_diffusers.py @@ -1,890 +1,67 @@ import os -import enum -class ModelConfig(enum.Enum): - STABLE_DIFFUSION_1 = "v1" - STABLE_DIFFUSION_2_BASE = "v2 (512, epsilon)" - STABLE_DIFFUSION_2 = "v2 (768, v_prediction)" - STABLE_DIFFUSION_2_DEPTH = "v2 (depth)" - STABLE_DIFFUSION_2_INPAINTING = "v2 (inpainting)" +from .huggingface_hub import DownloadStatus +from ..future import Future +from ..models import ModelConfig - @property - def original_config(self): - match self: - case ModelConfig.STABLE_DIFFUSION_1: - return {'model': {'base_learning_rate': 0.0001, 'target': 'ldm.models.diffusion.ddpm.LatentDiffusion', 'params': {'linear_start': 0.00085, 'linear_end': 0.012, 'num_timesteps_cond': 1, 'log_every_t': 200, 'timesteps': 1000, 'first_stage_key': 'jpg', 'cond_stage_key': 'txt', 'image_size': 64, 'channels': 4, 'cond_stage_trainable': False, 'conditioning_key': 'crossattn', 'monitor': 'val/loss_simple_ema', 'scale_factor': 0.18215, 'use_ema': False, 'scheduler_config': {'target': 'ldm.lr_scheduler.LambdaLinearScheduler', 'params': {'warm_up_steps': [10000], 'cycle_lengths': [10000000000000], 'f_start': [1e-06], 'f_max': [1.0], 'f_min': [1.0]}}, 'unet_config': {'target': 'ldm.modules.diffusionmodules.openaimodel.UNetModel', 'params': {'image_size': 32, 'in_channels': 4, 'out_channels': 4, 'model_channels': 320, 'attention_resolutions': [4, 2, 1], 'num_res_blocks': 2, 'channel_mult': [1, 2, 4, 4], 'num_heads': 8, 'use_spatial_transformer': True, 'transformer_depth': 1, 'context_dim': 768, 'use_checkpoint': True, 'legacy': False}}, 'first_stage_config': {'target': 'ldm.models.autoencoder.AutoencoderKL', 'params': {'embed_dim': 4, 'monitor': 'val/rec_loss', 'ddconfig': {'double_z': True, 'z_channels': 4, 'resolution': 256, 'in_channels': 3, 'out_ch': 3, 'ch': 128, 'ch_mult': [1, 2, 4, 4], 'num_res_blocks': 2, 'attn_resolutions': [], 'dropout': 0.0}, 'lossconfig': {'target': 'torch.nn.Identity'}}}, 'cond_stage_config': {'target': 'ldm.modules.encoders.modules.FrozenCLIPEmbedder'}}}} - case ModelConfig.STABLE_DIFFUSION_2_BASE: - return {'model': {'base_learning_rate': 0.0001, 'target': 'ldm.models.diffusion.ddpm.LatentDiffusion', 'params': {'linear_start': 0.00085, 'linear_end': 0.012, 'num_timesteps_cond': 1, 'log_every_t': 200, 'timesteps': 1000, 'first_stage_key': 'jpg', 'cond_stage_key': 'txt', 'image_size': 64, 'channels': 4, 'cond_stage_trainable': False, 'conditioning_key': 'crossattn', 'monitor': 'val/loss_simple_ema', 'scale_factor': 0.18215, 'use_ema': False, 'unet_config': {'target': 'ldm.modules.diffusionmodules.openaimodel.UNetModel', 'params': {'use_checkpoint': True, 'use_fp16': True, 'image_size': 32, 'in_channels': 4, 'out_channels': 4, 'model_channels': 320, 'attention_resolutions': [4, 2, 1], 'num_res_blocks': 2, 'channel_mult': [1, 2, 4, 4], 'num_head_channels': 64, 'use_spatial_transformer': True, 'use_linear_in_transformer': True, 'transformer_depth': 1, 'context_dim': 1024, 'legacy': False}}, 
'first_stage_config': {'target': 'ldm.models.autoencoder.AutoencoderKL', 'params': {'embed_dim': 4, 'monitor': 'val/rec_loss', 'ddconfig': {'double_z': True, 'z_channels': 4, 'resolution': 256, 'in_channels': 3, 'out_ch': 3, 'ch': 128, 'ch_mult': [1, 2, 4, 4], 'num_res_blocks': 2, 'attn_resolutions': [], 'dropout': 0.0}, 'lossconfig': {'target': 'torch.nn.Identity'}}}, 'cond_stage_config': {'target': 'ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder', 'params': {'freeze': True, 'layer': 'penultimate'}}}}} - case ModelConfig.STABLE_DIFFUSION_2: - return {'model': {'base_learning_rate': 0.0001, 'target': 'ldm.models.diffusion.ddpm.LatentDiffusion', 'params': {'parameterization': 'v', 'linear_start': 0.00085, 'linear_end': 0.012, 'num_timesteps_cond': 1, 'log_every_t': 200, 'timesteps': 1000, 'first_stage_key': 'jpg', 'cond_stage_key': 'txt', 'image_size': 64, 'channels': 4, 'cond_stage_trainable': False, 'conditioning_key': 'crossattn', 'monitor': 'val/loss_simple_ema', 'scale_factor': 0.18215, 'use_ema': False, 'unet_config': {'target': 'ldm.modules.diffusionmodules.openaimodel.UNetModel', 'params': {'use_checkpoint': True, 'use_fp16': True, 'image_size': 32, 'in_channels': 4, 'out_channels': 4, 'model_channels': 320, 'attention_resolutions': [4, 2, 1], 'num_res_blocks': 2, 'channel_mult': [1, 2, 4, 4], 'num_head_channels': 64, 'use_spatial_transformer': True, 'use_linear_in_transformer': True, 'transformer_depth': 1, 'context_dim': 1024, 'legacy': False}}, 'first_stage_config': {'target': 'ldm.models.autoencoder.AutoencoderKL', 'params': {'embed_dim': 4, 'monitor': 'val/rec_loss', 'ddconfig': {'double_z': True, 'z_channels': 4, 'resolution': 256, 'in_channels': 3, 'out_ch': 3, 'ch': 128, 'ch_mult': [1, 2, 4, 4], 'num_res_blocks': 2, 'attn_resolutions': [], 'dropout': 0.0}, 'lossconfig': {'target': 'torch.nn.Identity'}}}, 'cond_stage_config': {'target': 'ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder', 'params': {'freeze': True, 'layer': 'penultimate'}}}}} - case ModelConfig.STABLE_DIFFUSION_2_DEPTH: - return {'model': {'base_learning_rate': 5e-07, 'target': 'ldm.models.diffusion.ddpm.LatentDepth2ImageDiffusion', 'params': {'linear_start': 0.00085, 'linear_end': 0.012, 'num_timesteps_cond': 1, 'log_every_t': 200, 'timesteps': 1000, 'first_stage_key': 'jpg', 'cond_stage_key': 'txt', 'image_size': 64, 'channels': 4, 'cond_stage_trainable': False, 'conditioning_key': 'hybrid', 'scale_factor': 0.18215, 'monitor': 'val/loss_simple_ema', 'finetune_keys': None, 'use_ema': False, 'depth_stage_config': {'target': 'ldm.modules.midas.api.MiDaSInference', 'params': {'model_type': 'dpt_hybrid'}}, 'unet_config': {'target': 'ldm.modules.diffusionmodules.openaimodel.UNetModel', 'params': {'use_checkpoint': True, 'image_size': 32, 'in_channels': 5, 'out_channels': 4, 'model_channels': 320, 'attention_resolutions': [4, 2, 1], 'num_res_blocks': 2, 'channel_mult': [1, 2, 4, 4], 'num_head_channels': 64, 'use_spatial_transformer': True, 'use_linear_in_transformer': True, 'transformer_depth': 1, 'context_dim': 1024, 'legacy': False}}, 'first_stage_config': {'target': 'ldm.models.autoencoder.AutoencoderKL', 'params': {'embed_dim': 4, 'monitor': 'val/rec_loss', 'ddconfig': {'double_z': True, 'z_channels': 4, 'resolution': 256, 'in_channels': 3, 'out_ch': 3, 'ch': 128, 'ch_mult': [1, 2, 4, 4], 'num_res_blocks': 2, 'attn_resolutions': [], 'dropout': 0.0}, 'lossconfig': {'target': 'torch.nn.Identity'}}}, 'cond_stage_config': {'target': 'ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder', 'params': {'freeze': 
True, 'layer': 'penultimate'}}}}} - case ModelConfig.STABLE_DIFFUSION_2_INPAINTING: - return {'model': {'base_learning_rate': 5e-05, 'target': 'ldm.models.diffusion.ddpm.LatentInpaintDiffusion', 'params': {'linear_start': 0.00085, 'linear_end': 0.012, 'num_timesteps_cond': 1, 'log_every_t': 200, 'timesteps': 1000, 'first_stage_key': 'jpg', 'cond_stage_key': 'txt', 'image_size': 64, 'channels': 4, 'cond_stage_trainable': False, 'conditioning_key': 'hybrid', 'scale_factor': 0.18215, 'monitor': 'val/loss_simple_ema', 'finetune_keys': None, 'use_ema': False, 'unet_config': {'target': 'ldm.modules.diffusionmodules.openaimodel.UNetModel', 'params': {'use_checkpoint': True, 'image_size': 32, 'in_channels': 9, 'out_channels': 4, 'model_channels': 320, 'attention_resolutions': [4, 2, 1], 'num_res_blocks': 2, 'channel_mult': [1, 2, 4, 4], 'num_head_channels': 64, 'use_spatial_transformer': True, 'use_linear_in_transformer': True, 'transformer_depth': 1, 'context_dim': 1024, 'legacy': False}}, 'first_stage_config': {'target': 'ldm.models.autoencoder.AutoencoderKL', 'params': {'embed_dim': 4, 'monitor': 'val/rec_loss', 'ddconfig': {'double_z': True, 'z_channels': 4, 'resolution': 256, 'in_channels': 3, 'out_ch': 3, 'ch': 128, 'ch_mult': [1, 2, 4, 4], 'num_res_blocks': 2, 'attn_resolutions': [], 'dropout': 0.0}, 'lossconfig': {'target': 'torch.nn.Identity'}}}, 'cond_stage_config': {'target': 'ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder', 'params': {'freeze': True, 'layer': 'penultimate'}}}}, 'data': {'target': 'ldm.data.laion.WebDataModuleFromConfig', 'params': {'tar_base': None, 'p_unsafe_threshold': 0.1, 'filter_word_list': 'data/filters.yaml', 'max_pwatermark': 0.45, 'batch_size': 8, 'num_workers': 6, 'multinode': True, 'min_size': 512, 'train': {'shards': ['pipe:aws s3 cp s3://stability-aws/laion-a-native/part-0/{00000..18699}.tar -', 'pipe:aws s3 cp s3://stability-aws/laion-a-native/part-1/{00000..18699}.tar -', 'pipe:aws s3 cp s3://stability-aws/laion-a-native/part-2/{00000..18699}.tar -', 'pipe:aws s3 cp s3://stability-aws/laion-a-native/part-3/{00000..18699}.tar -', 'pipe:aws s3 cp s3://stability-aws/laion-a-native/part-4/{00000..18699}.tar -'], 'shuffle': 10000, 'image_key': 'jpg', 'image_transforms': [{'target': 'torchvision.transforms.Resize', 'params': {'size': 512, 'interpolation': 3}}, {'target': 'torchvision.transforms.RandomCrop', 'params': {'size': 512}}], 'postprocess': {'target': 'ldm.data.laion.AddMask', 'params': {'mode': '512train-large', 'p_drop': 0.25}}}, 'validation': {'shards': ['pipe:aws s3 cp s3://deep-floyd-s3/datasets/laion_cleaned-part5/{93001..94333}.tar - '], 'shuffle': 0, 'image_key': 'jpg', 'image_transforms': [{'target': 'torchvision.transforms.Resize', 'params': {'size': 512, 'interpolation': 3}}, {'target': 'torchvision.transforms.CenterCrop', 'params': {'size': 512}}], 'postprocess': {'target': 'ldm.data.laion.AddMask', 'params': {'mode': '512train-large', 'p_drop': 0.25}}}}}, 'lightning': {'find_unused_parameters': True, 'modelcheckpoint': {'params': {'every_n_train_steps': 5000}}, 'callbacks': {'metrics_over_trainsteps_checkpoint': {'params': {'every_n_train_steps': 10000}}, 'image_logger': {'target': 'main.ImageLogger', 'params': {'enable_autocast': False, 'disabled': False, 'batch_frequency': 1000, 'max_images': 4, 'increase_log_steps': False, 'log_first_step': False, 'log_images_kwargs': {'use_ema_scope': False, 'inpaint': False, 'plot_progressive_rows': False, 'plot_diffusion_rows': False, 'N': 4, 'unconditional_guidance_scale': 5.0, 
'unconditional_guidance_label': [''], 'ddim_steps': 50, 'ddim_eta': 0.0}}}}, 'trainer': {'benchmark': True, 'val_check_interval': 5000000, 'num_sanity_val_steps': 0, 'accumulate_grad_batches': 1}}} def convert_original_stable_diffusion_to_diffusers( self, checkpoint_path: str, model_config: ModelConfig, + half_precision: bool, ) -> str: import torch - from diffusers import StableDiffusionPipeline - from diffusers.utils import DIFFUSERS_CACHE, WEIGHTS_NAME, CONFIG_NAME, ONNX_WEIGHTS_NAME - - from diffusers import ( - AutoencoderKL, - DDIMScheduler, - DPMSolverMultistepScheduler, - EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, - HeunDiscreteScheduler, - LDMTextToImagePipeline, - LMSDiscreteScheduler, - PNDMScheduler, - StableDiffusionPipeline, - UNet2DConditionModel, - ) - from diffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import LDMBertConfig, LDMBertModel - # from diffusers.pipelines.paint_by_example import PaintByExampleImageEncoder, PaintByExamplePipeline - from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker - from transformers import AutoFeatureExtractor, BertTokenizerFast, CLIPTextModel, CLIPTokenizer, CLIPVisionConfig - - #region https://github.com/huggingface/diffusers/blob/main/scripts/convert_original_stable_diffusion_to_diffusers.py - def shave_segments(path, n_shave_prefix_segments=1): - """ - Removes segments. Positive values shave the first segments, negative shave the last segments. - """ - if n_shave_prefix_segments >= 0: - return ".".join(path.split(".")[n_shave_prefix_segments:]) - else: - return ".".join(path.split(".")[:n_shave_prefix_segments]) - - - def renew_resnet_paths(old_list, n_shave_prefix_segments=0): - """ - Updates paths inside resnets to the new naming scheme (local renaming) - """ - mapping = [] - for old_item in old_list: - new_item = old_item.replace("in_layers.0", "norm1") - new_item = new_item.replace("in_layers.2", "conv1") - - new_item = new_item.replace("out_layers.0", "norm2") - new_item = new_item.replace("out_layers.3", "conv2") - - new_item = new_item.replace("emb_layers.1", "time_emb_proj") - new_item = new_item.replace("skip_connection", "conv_shortcut") - - new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) - - mapping.append({"old": old_item, "new": new_item}) - - return mapping - - - def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0): - """ - Updates paths inside resnets to the new naming scheme (local renaming) - """ - mapping = [] - for old_item in old_list: - new_item = old_item - - new_item = new_item.replace("nin_shortcut", "conv_shortcut") - new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) - - mapping.append({"old": old_item, "new": new_item}) - - return mapping - - - def renew_attention_paths(old_list, n_shave_prefix_segments=0): - """ - Updates paths inside attentions to the new naming scheme (local renaming) - """ - mapping = [] - for old_item in old_list: - new_item = old_item - - # new_item = new_item.replace('norm.weight', 'group_norm.weight') - # new_item = new_item.replace('norm.bias', 'group_norm.bias') - - # new_item = new_item.replace('proj_out.weight', 'proj_attn.weight') - # new_item = new_item.replace('proj_out.bias', 'proj_attn.bias') - - # new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) - - mapping.append({"old": old_item, "new": new_item}) - - return mapping - - - def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0): - """ - 
Updates paths inside attentions to the new naming scheme (local renaming) - """ - mapping = [] - for old_item in old_list: - new_item = old_item - - new_item = new_item.replace("norm.weight", "group_norm.weight") - new_item = new_item.replace("norm.bias", "group_norm.bias") - - new_item = new_item.replace("q.weight", "query.weight") - new_item = new_item.replace("q.bias", "query.bias") - - new_item = new_item.replace("k.weight", "key.weight") - new_item = new_item.replace("k.bias", "key.bias") - - new_item = new_item.replace("v.weight", "value.weight") - new_item = new_item.replace("v.bias", "value.bias") - - new_item = new_item.replace("proj_out.weight", "proj_attn.weight") - new_item = new_item.replace("proj_out.bias", "proj_attn.bias") - - new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) - - mapping.append({"old": old_item, "new": new_item}) - - return mapping - - - def assign_to_checkpoint( - paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None - ): - """ - This does the final conversion step: take locally converted weights and apply a global renaming - to them. It splits attention layers, and takes into account additional replacements - that may arise. - Assigns the weights to the new checkpoint. - """ - assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys." - - # Splits the attention layers into three variables. - if attention_paths_to_split is not None: - for path, path_map in attention_paths_to_split.items(): - old_tensor = old_checkpoint[path] - channels = old_tensor.shape[0] // 3 - - target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1) - - num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3 - - old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:]) - query, key, value = old_tensor.split(channels // num_heads, dim=1) - - checkpoint[path_map["query"]] = query.reshape(target_shape) - checkpoint[path_map["key"]] = key.reshape(target_shape) - checkpoint[path_map["value"]] = value.reshape(target_shape) - - for path in paths: - new_path = path["new"] - - # These have already been assigned - if attention_paths_to_split is not None and new_path in attention_paths_to_split: - continue - - # Global renaming happens here - new_path = new_path.replace("middle_block.0", "mid_block.resnets.0") - new_path = new_path.replace("middle_block.1", "mid_block.attentions.0") - new_path = new_path.replace("middle_block.2", "mid_block.resnets.1") - - if additional_replacements is not None: - for replacement in additional_replacements: - new_path = new_path.replace(replacement["old"], replacement["new"]) - - # proj_attn.weight has to be converted from conv 1D to linear - if "proj_attn.weight" in new_path: - checkpoint[new_path] = old_checkpoint[path["old"]][:, :, 0] - else: - checkpoint[new_path] = old_checkpoint[path["old"]] - - - def conv_attn_to_linear(checkpoint): - keys = list(checkpoint.keys()) - attn_keys = ["query.weight", "key.weight", "value.weight"] - for key in keys: - if ".".join(key.split(".")[-2:]) in attn_keys: - if checkpoint[key].ndim > 2: - checkpoint[key] = checkpoint[key][:, :, 0, 0] - elif "proj_attn.weight" in key: - if checkpoint[key].ndim > 2: - checkpoint[key] = checkpoint[key][:, :, 0] - - - def create_unet_diffusers_config(original_config, image_size: int): - """ - Creates a config for the diffusers based on the config of the LDM model. 
- """ - unet_params = original_config.model.params.unet_config.params - vae_params = original_config.model.params.first_stage_config.params.ddconfig - - block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult] - - down_block_types = [] - resolution = 1 - for i in range(len(block_out_channels)): - block_type = "CrossAttnDownBlock2D" if resolution in unet_params.attention_resolutions else "DownBlock2D" - down_block_types.append(block_type) - if i != len(block_out_channels) - 1: - resolution *= 2 - - up_block_types = [] - for i in range(len(block_out_channels)): - block_type = "CrossAttnUpBlock2D" if resolution in unet_params.attention_resolutions else "UpBlock2D" - up_block_types.append(block_type) - resolution //= 2 - - vae_scale_factor = 2 ** (len(vae_params.ch_mult) - 1) - - head_dim = unet_params.num_heads if "num_heads" in unet_params else None - use_linear_projection = ( - unet_params.use_linear_in_transformer if "use_linear_in_transformer" in unet_params else False - ) - if use_linear_projection: - # stable diffusion 2-base-512 and 2-768 - if head_dim is None: - head_dim = [5, 10, 20, 20] - - config = dict( - sample_size=image_size // vae_scale_factor, - in_channels=unet_params.in_channels, - out_channels=unet_params.out_channels, - down_block_types=tuple(down_block_types), - up_block_types=tuple(up_block_types), - block_out_channels=tuple(block_out_channels), - layers_per_block=unet_params.num_res_blocks, - cross_attention_dim=unet_params.context_dim, - attention_head_dim=head_dim, - use_linear_projection=use_linear_projection, - ) - - return config - - - def create_vae_diffusers_config(original_config, image_size: int): - """ - Creates a config for the diffusers based on the config of the LDM model. - """ - vae_params = original_config.model.params.first_stage_config.params.ddconfig - _ = original_config.model.params.first_stage_config.params.embed_dim - - block_out_channels = [vae_params.ch * mult for mult in vae_params.ch_mult] - down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels) - up_block_types = ["UpDecoderBlock2D"] * len(block_out_channels) - - config = dict( - sample_size=image_size, - in_channels=vae_params.in_channels, - out_channels=vae_params.out_ch, - down_block_types=tuple(down_block_types), - up_block_types=tuple(up_block_types), - block_out_channels=tuple(block_out_channels), - latent_channels=vae_params.z_channels, - layers_per_block=vae_params.num_res_blocks, - ) - return config - - - def create_diffusers_schedular(original_config): - schedular = DDIMScheduler( - num_train_timesteps=original_config.model.params.timesteps, - beta_start=original_config.model.params.linear_start, - beta_end=original_config.model.params.linear_end, - beta_schedule="scaled_linear", - ) - return schedular - - - def create_ldm_bert_config(original_config): - bert_params = original_config.model.parms.cond_stage_config.params - config = LDMBertConfig( - d_model=bert_params.n_embed, - encoder_layers=bert_params.n_layer, - encoder_ffn_dim=bert_params.n_embed * 4, - ) - return config - - - def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False): - """ - Takes a state dict and a config, and returns a converted checkpoint. - """ - - # extract state_dict for UNet - unet_state_dict = {} - keys = list(checkpoint.keys()) - - unet_key = "model.diffusion_model." 
- # at least a 100 parameters have to start with `model_ema` in order for the checkpoint to be EMA - if sum(k.startswith("model_ema") for k in keys) > 100: - print(f"Checkpoint {path} has both EMA and non-EMA weights.") - if extract_ema: - print( - "In this conversion only the EMA weights are extracted. If you want to instead extract the non-EMA" - " weights (useful to continue fine-tuning), please make sure to remove the `--extract_ema` flag." - ) - for key in keys: - if key.startswith("model.diffusion_model"): - flat_ema_key = "model_ema." + "".join(key.split(".")[1:]) - unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key) - else: - print( - "In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA" - " weights (usually better for inference), please make sure to add the `--extract_ema` flag." - ) - - for key in keys: - if key.startswith(unet_key): - unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(key) - - new_checkpoint = {} - - new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"] - new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"] - new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"] - new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"] - - new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"] - new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"] - - new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"] - new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"] - new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"] - new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"] - - # Retrieves the keys for the input blocks only - num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer}) - input_blocks = { - layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key] - for layer_id in range(num_input_blocks) - } - - # Retrieves the keys for the middle blocks only - num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer}) - middle_blocks = { - layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key] - for layer_id in range(num_middle_blocks) - } - - # Retrieves the keys for the output blocks only - num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer}) - output_blocks = { - layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key] - for layer_id in range(num_output_blocks) - } - - for i in range(1, num_input_blocks): - block_id = (i - 1) // (config["layers_per_block"] + 1) - layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1) - - resnets = [ - key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key - ] - attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key] - - if f"input_blocks.{i}.0.op.weight" in unet_state_dict: - new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop( - f"input_blocks.{i}.0.op.weight" - ) - new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop( - f"input_blocks.{i}.0.op.bias" - ) - - paths = renew_resnet_paths(resnets) - meta_path = {"old": f"input_blocks.{i}.0", "new": 
f"down_blocks.{block_id}.resnets.{layer_in_block_id}"} - assign_to_checkpoint( - paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config - ) - - if len(attentions): - paths = renew_attention_paths(attentions) - meta_path = {"old": f"input_blocks.{i}.1", "new": f"down_blocks.{block_id}.attentions.{layer_in_block_id}"} - assign_to_checkpoint( - paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config - ) - - resnet_0 = middle_blocks[0] - attentions = middle_blocks[1] - resnet_1 = middle_blocks[2] - - resnet_0_paths = renew_resnet_paths(resnet_0) - assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config) - - resnet_1_paths = renew_resnet_paths(resnet_1) - assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config) - - attentions_paths = renew_attention_paths(attentions) - meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"} - assign_to_checkpoint( - attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config - ) - - for i in range(num_output_blocks): - block_id = i // (config["layers_per_block"] + 1) - layer_in_block_id = i % (config["layers_per_block"] + 1) - output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]] - output_block_list = {} - - for layer in output_block_layers: - layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1) - if layer_id in output_block_list: - output_block_list[layer_id].append(layer_name) - else: - output_block_list[layer_id] = [layer_name] - - if len(output_block_list) > 1: - resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key] - attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key] - - resnet_0_paths = renew_resnet_paths(resnets) - paths = renew_resnet_paths(resnets) - - meta_path = {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"} - assign_to_checkpoint( - paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config - ) - - if ["conv.weight", "conv.bias"] in output_block_list.values(): - index = list(output_block_list.values()).index(["conv.weight", "conv.bias"]) - new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[ - f"output_blocks.{i}.{index}.conv.weight" - ] - new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[ - f"output_blocks.{i}.{index}.conv.bias" - ] - - # Clear attentions as they have been attributed above. - if len(attentions) == 2: - attentions = [] - - if len(attentions): - paths = renew_attention_paths(attentions) - meta_path = { - "old": f"output_blocks.{i}.1", - "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}", - } - assign_to_checkpoint( - paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config - ) - else: - resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1) - for path in resnet_0_paths: - old_path = ".".join(["output_blocks", str(i), path["old"]]) - new_path = ".".join(["up_blocks", str(block_id), "resnets", str(layer_in_block_id), path["new"]]) - - new_checkpoint[new_path] = unet_state_dict[old_path] - - return new_checkpoint - - - def convert_ldm_vae_checkpoint(checkpoint, config): - # extract state dict for VAE - vae_state_dict = {} - vae_key = "first_stage_model." 
- keys = list(checkpoint.keys()) - for key in keys: - if key.startswith(vae_key): - vae_state_dict[key.replace(vae_key, "")] = checkpoint.get(key) - - new_checkpoint = {} - - new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"] - new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"] - new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"] - new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"] - new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"] - new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"] - - new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"] - new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"] - new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"] - new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"] - new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"] - new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"] - - new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"] - new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"] - new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"] - new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"] - - # Retrieves the keys for the encoder down blocks only - num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer}) - down_blocks = { - layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks) - } - - # Retrieves the keys for the decoder up blocks only - num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer}) - up_blocks = { - layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks) - } - - for i in range(num_down_blocks): - resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key] - - if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict: - new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop( - f"encoder.down.{i}.downsample.conv.weight" - ) - new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop( - f"encoder.down.{i}.downsample.conv.bias" - ) - - paths = renew_vae_resnet_paths(resnets) - meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"} - assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - - mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key] - num_mid_res_blocks = 2 - for i in range(1, num_mid_res_blocks + 1): - resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key] - - paths = renew_vae_resnet_paths(resnets) - meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"} - assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - - mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key] - paths = renew_vae_attention_paths(mid_attentions) - meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"} - assign_to_checkpoint(paths, 
new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - conv_attn_to_linear(new_checkpoint) - - for i in range(num_up_blocks): - block_id = num_up_blocks - 1 - i - resnets = [ - key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key - ] - - if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict: - new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[ - f"decoder.up.{block_id}.upsample.conv.weight" - ] - new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[ - f"decoder.up.{block_id}.upsample.conv.bias" - ] - - paths = renew_vae_resnet_paths(resnets) - meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"} - assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - - mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key] - num_mid_res_blocks = 2 - for i in range(1, num_mid_res_blocks + 1): - resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key] - - paths = renew_vae_resnet_paths(resnets) - meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"} - assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - - mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key] - paths = renew_vae_attention_paths(mid_attentions) - meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"} - assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - conv_attn_to_linear(new_checkpoint) - return new_checkpoint - - - def convert_ldm_bert_checkpoint(checkpoint, config): - def _copy_attn_layer(hf_attn_layer, pt_attn_layer): - hf_attn_layer.q_proj.weight.data = pt_attn_layer.to_q.weight - hf_attn_layer.k_proj.weight.data = pt_attn_layer.to_k.weight - hf_attn_layer.v_proj.weight.data = pt_attn_layer.to_v.weight - - hf_attn_layer.out_proj.weight = pt_attn_layer.to_out.weight - hf_attn_layer.out_proj.bias = pt_attn_layer.to_out.bias - - def _copy_linear(hf_linear, pt_linear): - hf_linear.weight = pt_linear.weight - hf_linear.bias = pt_linear.bias - - def _copy_layer(hf_layer, pt_layer): - # copy layer norms - _copy_linear(hf_layer.self_attn_layer_norm, pt_layer[0][0]) - _copy_linear(hf_layer.final_layer_norm, pt_layer[1][0]) - - # copy attn - _copy_attn_layer(hf_layer.self_attn, pt_layer[0][1]) - - # copy MLP - pt_mlp = pt_layer[1][1] - _copy_linear(hf_layer.fc1, pt_mlp.net[0][0]) - _copy_linear(hf_layer.fc2, pt_mlp.net[2]) - - def _copy_layers(hf_layers, pt_layers): - for i, hf_layer in enumerate(hf_layers): - if i != 0: - i += i - pt_layer = pt_layers[i : i + 2] - _copy_layer(hf_layer, pt_layer) - - hf_model = LDMBertModel(config).eval() - - # copy embeds - hf_model.model.embed_tokens.weight = checkpoint.transformer.token_emb.weight - hf_model.model.embed_positions.weight.data = checkpoint.transformer.pos_emb.emb.weight - - # copy layer norm - _copy_linear(hf_model.model.layer_norm, checkpoint.transformer.norm) - - # copy hidden layers - _copy_layers(hf_model.model.layers, checkpoint.transformer.attn_layers.layers) - - _copy_linear(hf_model.to_logits, checkpoint.transformer.to_logits) - - return hf_model - - - def convert_ldm_clip_checkpoint(checkpoint): - text_model = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14") - - keys = list(checkpoint.keys()) - - text_model_dict = {} - - for 
key in keys: - if key.startswith("cond_stage_model.transformer"): - text_model_dict[key[len("cond_stage_model.transformer.") :]] = checkpoint[key] - - text_model.load_state_dict(text_model_dict) - - return text_model - - - def convert_paint_by_example_checkpoint(checkpoint): - config = CLIPVisionConfig.from_pretrained("openai/clip-vit-large-patch14") - model = PaintByExampleImageEncoder(config) - - keys = list(checkpoint.keys()) - - text_model_dict = {} - - for key in keys: - if key.startswith("cond_stage_model.transformer"): - text_model_dict[key[len("cond_stage_model.transformer.") :]] = checkpoint[key] - - # load clip vision - model.model.load_state_dict(text_model_dict) - - # load mapper - keys_mapper = { - k[len("cond_stage_model.mapper.res") :]: v - for k, v in checkpoint.items() - if k.startswith("cond_stage_model.mapper") - } - - MAPPING = { - "attn.c_qkv": ["attn1.to_q", "attn1.to_k", "attn1.to_v"], - "attn.c_proj": ["attn1.to_out.0"], - "ln_1": ["norm1"], - "ln_2": ["norm3"], - "mlp.c_fc": ["ff.net.0.proj"], - "mlp.c_proj": ["ff.net.2"], - } - - mapped_weights = {} - for key, value in keys_mapper.items(): - prefix = key[: len("blocks.i")] - suffix = key.split(prefix)[-1].split(".")[-1] - name = key.split(prefix)[-1].split(suffix)[0][1:-1] - mapped_names = MAPPING[name] - - num_splits = len(mapped_names) - for i, mapped_name in enumerate(mapped_names): - new_name = ".".join([prefix, mapped_name, suffix]) - shape = value.shape[0] // num_splits - mapped_weights[new_name] = value[i * shape : (i + 1) * shape] - - model.mapper.load_state_dict(mapped_weights) - - # load final layer norm - model.final_layer_norm.load_state_dict( - { - "bias": checkpoint["cond_stage_model.final_ln.bias"], - "weight": checkpoint["cond_stage_model.final_ln.weight"], - } + from diffusers.utils import DIFFUSERS_CACHE + from diffusers.pipelines.stable_diffusion.convert_from_ckpt import download_from_original_stable_diffusion_ckpt, download_controlnet_from_original_ckpt + + future = Future() + yield future + DownloadStatus.hook_download_tqdm(future) + + future.add_response(DownloadStatus(f"Reading {checkpoint_path}", 0, 1)) + index = 0 + def hook_save_pretrained(model, dirs_count, total): + old_save_pretrained = model.save_pretrained + def save_pretrained(self, save_directory, *args, **kwargs): + nonlocal index + dirs = [] + directory = save_directory + for _ in range(dirs_count): + dirs.append(os.path.basename(directory)) + directory = os.path.dirname(directory) + dirs.reverse() + future.add_response(DownloadStatus(f"Saving {os.path.join(*dirs)}", index, total)) + index += 1 + return old_save_pretrained(save_directory, *args, **kwargs) + model.save_pretrained = save_pretrained.__get__(model) + + if model_config in [ModelConfig.CONTROL_NET_1_5, ModelConfig.CONTROL_NET_2_1]: + pipe = download_controlnet_from_original_ckpt( + checkpoint_path, + original_config_file=model_config.original_config, + from_safetensors=checkpoint_path.endswith(".safetensors"), ) - - # load final proj - model.proj_out.load_state_dict( - { - "bias": checkpoint["proj_out.bias"], - "weight": checkpoint["proj_out.weight"], - } - ) - - # load uncond vector - model.uncond_vector.data = torch.nn.Parameter(checkpoint["learnable_vector"]) - return model - - - def convert_open_clip_checkpoint(checkpoint): - text_model = CLIPTextModel.from_pretrained("stabilityai/stable-diffusion-2", subfolder="text_encoder") - - # SKIP for now - need openclip -> HF conversion script here - # keys = list(checkpoint.keys()) - # - # text_model_dict = {} - # for key 
in keys: - # if key.startswith("cond_stage_model.model.transformer"): - # text_model_dict[key[len("cond_stage_model.model.transformer.") :]] = checkpoint[key] - # - # text_model.load_state_dict(text_model_dict) - - return text_model - #endregion - - prediction_type = None - image_size = None - scheduler_type = "pndm" - extract_ema = False - pipeline_type = None - dump_path = os.path.join(DIFFUSERS_CACHE, os.path.splitext(os.path.basename(checkpoint_path))[0]) - - checkpoint = torch.load(checkpoint_path) - global_step = checkpoint.get("global_step", None) - checkpoint = checkpoint["state_dict"] - - key_name = "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.attn2.to_k.weight" - - original_config = model_config.original_config - # if key_name in checkpoint and checkpoint[key_name].shape[-1] == 1024: - # # model_type = "v2" - # original_config = {'model': {'base_learning_rate': 0.0001, 'target': 'ldm.models.diffusion.ddpm.LatentDiffusion', 'params': {'parameterization': 'v', 'linear_start': 0.00085, 'linear_end': 0.012, 'num_timesteps_cond': 1, 'log_every_t': 200, 'timesteps': 1000, 'first_stage_key': 'jpg', 'cond_stage_key': 'txt', 'image_size': 64, 'channels': 4, 'cond_stage_trainable': False, 'conditioning_key': 'crossattn', 'monitor': 'val/loss_simple_ema', 'scale_factor': 0.18215, 'use_ema': False, 'unet_config': {'target': 'ldm.modules.diffusionmodules.openaimodel.UNetModel', 'params': {'use_checkpoint': True, 'use_fp16': True, 'image_size': 32, 'in_channels': 4, 'out_channels': 4, 'model_channels': 320, 'attention_resolutions': [4, 2, 1], 'num_res_blocks': 2, 'channel_mult': [1, 2, 4, 4], 'num_head_channels': 64, 'use_spatial_transformer': True, 'use_linear_in_transformer': True, 'transformer_depth': 1, 'context_dim': 1024, 'legacy': False}}, 'first_stage_config': {'target': 'ldm.models.autoencoder.AutoencoderKL', 'params': {'embed_dim': 4, 'monitor': 'val/rec_loss', 'ddconfig': {'double_z': True, 'z_channels': 4, 'resolution': 256, 'in_channels': 3, 'out_ch': 3, 'ch': 128, 'ch_mult': [1, 2, 4, 4], 'num_res_blocks': 2, 'attn_resolutions': [], 'dropout': 0.0}, 'lossconfig': {'target': 'torch.nn.Identity'}}}, 'cond_stage_config': {'target': 'ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder', 'params': {'freeze': True, 'layer': 'penultimate'}}}}} - # else: - # # model_type = "v1" - # original_config = {'model': {'base_learning_rate': 0.0001, 'target': 'ldm.models.diffusion.ddpm.LatentDiffusion', 'params': {'linear_start': 0.00085, 'linear_end': 0.012, 'num_timesteps_cond': 1, 'log_every_t': 200, 'timesteps': 1000, 'first_stage_key': 'jpg', 'cond_stage_key': 'txt', 'image_size': 64, 'channels': 4, 'cond_stage_trainable': False, 'conditioning_key': 'crossattn', 'monitor': 'val/loss_simple_ema', 'scale_factor': 0.18215, 'use_ema': False, 'scheduler_config': {'target': 'ldm.lr_scheduler.LambdaLinearScheduler', 'params': {'warm_up_steps': [10000], 'cycle_lengths': [10000000000000], 'f_start': [1e-06], 'f_max': [1.0], 'f_min': [1.0]}}, 'unet_config': {'target': 'ldm.modules.diffusionmodules.openaimodel.UNetModel', 'params': {'image_size': 32, 'in_channels': 4, 'out_channels': 4, 'model_channels': 320, 'attention_resolutions': [4, 2, 1], 'num_res_blocks': 2, 'channel_mult': [1, 2, 4, 4], 'num_heads': 8, 'use_spatial_transformer': True, 'transformer_depth': 1, 'context_dim': 768, 'use_checkpoint': True, 'legacy': False}}, 'first_stage_config': {'target': 'ldm.models.autoencoder.AutoencoderKL', 'params': {'embed_dim': 4, 'monitor': 'val/rec_loss', 'ddconfig': {'double_z': True, 
'z_channels': 4, 'resolution': 256, 'in_channels': 3, 'out_ch': 3, 'ch': 128, 'ch_mult': [1, 2, 4, 4], 'num_res_blocks': 2, 'attn_resolutions': [], 'dropout': 0.0}, 'lossconfig': {'target': 'torch.nn.Identity'}}}, 'cond_stage_config': {'target': 'ldm.modules.encoders.modules.FrozenCLIPEmbedder'}}}} - - class dotdict(dict): - __getattr__ = dict.get - __setattr__ = dict.__setitem__ - __delattr__ = dict.__delitem__ - @staticmethod - def deep(original): - return dotdict({ - k: (dotdict.deep(v) if isinstance(v, dict) else v) for k, v in original.items() - }) - - original_config = dotdict.deep(original_config) - - if ( - "parameterization" in original_config["model"]["params"] - and original_config["model"]["params"]["parameterization"] == "v" - and global_step is not None - ): - if prediction_type is None: - # NOTE: For stable diffusion 2 base it is recommended to pass `prediction_type=="epsilon"` - # as it relies on a brittle global step parameter here - prediction_type = "epsilon" if global_step == 875000 else "v_prediction" - if image_size is None: - # NOTE: For stable diffusion 2 base one has to pass `image_size==512` - # as it relies on a brittle global step parameter here - image_size = 512 if global_step == 875000 else 768 - else: - if prediction_type is None: - prediction_type = "epsilon" - if image_size is None: - image_size = 512 - - num_train_timesteps = original_config['model']['params']['timesteps'] - beta_start = original_config['model']['params']['linear_start'] - beta_end = original_config['model']['params']['linear_end'] - - scheduler = DDIMScheduler( - beta_end=beta_end, - beta_schedule="scaled_linear", - beta_start=beta_start, - num_train_timesteps=num_train_timesteps, - steps_offset=1, - clip_sample=False, - set_alpha_to_one=False, - prediction_type=prediction_type, - ) - if scheduler_type == "pndm": - config = dict(scheduler.config) - config["skip_prk_steps"] = True - scheduler = PNDMScheduler.from_config(config) - elif scheduler_type == "lms": - scheduler = LMSDiscreteScheduler.from_config(scheduler.config) - elif scheduler_type == "heun": - scheduler = HeunDiscreteScheduler.from_config(scheduler.config) - elif scheduler_type == "euler": - scheduler = EulerDiscreteScheduler.from_config(scheduler.config) - elif scheduler_type == "euler-ancestral": - scheduler = EulerAncestralDiscreteScheduler.from_config(scheduler.config) - elif scheduler_type == "dpm": - scheduler = DPMSolverMultistepScheduler.from_config(scheduler.config) - elif scheduler_type == "ddim": - scheduler = scheduler + if half_precision: + pipe.to(dtype=torch.float16) + index = 1 + hook_save_pretrained(pipe, 1, 2) else: - raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") - - # Convert the UNet2DConditionModel model. - unet_config = create_unet_diffusers_config(original_config, image_size=image_size) - unet = UNet2DConditionModel(**unet_config) - - converted_unet_checkpoint = convert_ldm_unet_checkpoint( - checkpoint, unet_config, path=checkpoint_path, extract_ema=extract_ema - ) - - unet.load_state_dict(converted_unet_checkpoint) - - # Convert the VAE model. - vae_config = create_vae_diffusers_config(original_config, image_size=image_size) - converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config) - - vae = AutoencoderKL(**vae_config) - vae.load_state_dict(converted_vae_checkpoint) - - # Convert the text model. 
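The branch that follows picks a text encoder from the last dotted segment of the LDM `cond_stage_config.target` string. A reduced sketch of that dispatch (the helper name and the returned descriptions are illustrative only, not code from the patch):

```python
# Sketch of the dispatch below: SD 2.x checkpoints use FrozenOpenCLIPEmbedder, SD 1.x uses
# FrozenCLIPEmbedder, and anything else falls through to the LDMBert text-to-image path.
def text_encoder_kind(cond_stage_target: str) -> str:
    model_type = cond_stage_target.split(".")[-1]
    return {
        "FrozenOpenCLIPEmbedder": "OpenCLIP text encoder (Stable Diffusion 2.x)",
        "FrozenCLIPEmbedder": "CLIP ViT-L/14 text encoder (Stable Diffusion 1.x)",
        "PaintByExample": "PaintByExample image encoder",
    }.get(model_type, "LDMBert (latent diffusion text-to-image)")

assert text_encoder_kind(
    "ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder"
).startswith("OpenCLIP")
```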
- model_type = pipeline_type - if model_type is None: - model_type = original_config.model.params.cond_stage_config.target.split(".")[-1] - - if model_type == "FrozenOpenCLIPEmbedder": - text_model = convert_open_clip_checkpoint(checkpoint) - tokenizer = CLIPTokenizer.from_pretrained("stabilityai/stable-diffusion-2", subfolder="tokenizer") - pipe = StableDiffusionPipeline( - vae=vae, - text_encoder=text_model, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - safety_checker=None, - feature_extractor=None, - requires_safety_checker=False, - ) - elif model_type == "PaintByExample": - vision_model = convert_paint_by_example_checkpoint(checkpoint) - tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14") - feature_extractor = AutoFeatureExtractor.from_pretrained("CompVis/stable-diffusion-safety-checker") - pipe = PaintByExamplePipeline( - vae=vae, - image_encoder=vision_model, - unet=unet, - scheduler=scheduler, - safety_checker=None, - feature_extractor=feature_extractor, - ) - elif model_type == "FrozenCLIPEmbedder": - text_model = convert_ldm_clip_checkpoint(checkpoint) - tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14") - safety_checker = StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker") - feature_extractor = AutoFeatureExtractor.from_pretrained("CompVis/stable-diffusion-safety-checker") - pipe = StableDiffusionPipeline( - vae=vae, - text_encoder=text_model, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - safety_checker=safety_checker, - feature_extractor=feature_extractor, + pipe = download_from_original_stable_diffusion_ckpt( + checkpoint_path, + original_config_file=model_config.original_config, + from_safetensors=checkpoint_path.endswith(".safetensors"), + pipeline_class=model_config.pipeline ) - else: - text_config = create_ldm_bert_config(original_config) - text_model = convert_ldm_bert_checkpoint(checkpoint, text_config) - tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased") - pipe = LDMTextToImagePipeline(vqvae=vae, bert=text_model, tokenizer=tokenizer, unet=unet, scheduler=scheduler) - - pipe.save_pretrained(dump_path) - return dump_path \ No newline at end of file + if half_precision: + pipe.to(torch_dtype=torch.float16) + models = [] + for name in pipe._get_signature_keys(pipe)[0]: + model = getattr(pipe, name, None) + if model is not None and hasattr(model, "save_pretrained"): + models.append(model) + for i, model in enumerate(models): + hook_save_pretrained(model, 2, len(models)) + dump_path = os.path.join(DIFFUSERS_CACHE, os.path.splitext(os.path.basename(checkpoint_path))[0]) + pipe.save_pretrained(dump_path, variant="fp16" if half_precision else None) + future.set_done() diff --git a/generator_process/actions/depth_to_image.py b/generator_process/actions/depth_to_image.py index 84a1f3c9..a2b06db8 100644 --- a/generator_process/actions/depth_to_image.py +++ b/generator_process/actions/depth_to_image.py @@ -5,18 +5,16 @@ from numpy.typing import NDArray import numpy as np import random -from .prompt_to_image import Scheduler, Optimizations, StepPreviewMode, ImageGenerationResult, _configure_model_padding, model_snapshot_folder, load_pipe -from ..models import Pipeline -from .detect_seamless import SeamlessAxes - +from .prompt_to_image import Checkpoint, Scheduler, Optimizations, StepPreviewMode, ImageGenerationResult, _configure_model_padding +from ...api.models.seamless_axes import SeamlessAxes +from ..future import Future def depth_to_image( self, - 
pipeline: Pipeline, - model: str, + model: str | Checkpoint, - scheduler: Scheduler, + scheduler: str | Scheduler, optimizations: Optimizations, @@ -39,362 +37,364 @@ def depth_to_image( step_preview_mode: StepPreviewMode, **kwargs -) -> Generator[NDArray, None, None]: - match pipeline: - case Pipeline.STABLE_DIFFUSION: - import diffusers - import torch - import PIL.Image - import PIL.ImageOps - - class GeneratorPipeline(diffusers.StableDiffusionInpaintPipeline): - def prepare_depth(self, depth, image, dtype, device): - device = torch.device('cpu' if device.type == 'mps' else device.type) - if depth is None: - from transformers import DPTFeatureExtractor, DPTForDepthEstimation - import contextlib - feature_extractor = DPTFeatureExtractor.from_pretrained("Intel/dpt-large") - depth_estimator = DPTForDepthEstimation.from_pretrained("Intel/dpt-large") - depth_estimator = depth_estimator.to(device) - - pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values - pixel_values = pixel_values.to(device=device) - # The DPT-Hybrid model uses batch-norm layers which are not compatible with fp16. - # So we use `torch.autocast` here for half precision inference. - context_manger = torch.autocast("cuda", dtype=dtype) if device.type == "cuda" else contextlib.nullcontext() - with context_manger: - depth_map = depth_estimator(pixel_values).predicted_depth - depth_map = torch.nn.functional.interpolate( - depth_map.unsqueeze(1), - size=(height // self.vae_scale_factor, width // self.vae_scale_factor), - mode="bicubic", - align_corners=False, - ) - - depth_min = torch.amin(depth_map, dim=[1, 2, 3], keepdim=True) - depth_max = torch.amax(depth_map, dim=[1, 2, 3], keepdim=True) - depth_map = 2.0 * (depth_map - depth_min) / (depth_max - depth_min) - 1.0 - depth_map = depth_map.to(device) - return depth_map - else: - if isinstance(depth, PIL.Image.Image): - depth = np.array(depth.convert("L")) - depth = depth.astype(np.float32) / 255.0 - depth = depth[None, None] - depth = torch.from_numpy(depth) - return depth - - def prepare_depth_latents( - self, depth, batch_size, height, width, dtype, device, generator, do_classifier_free_guidance - ): - # resize the mask to latents shape as we concatenate the mask to the latents - # we do that before converting to dtype to avoid breaking in case we're using cpu_offload - # and half precision - depth = torch.nn.functional.interpolate( - depth, size=(height // self.vae_scale_factor, width // self.vae_scale_factor) - ) - depth = depth.to(device=device, dtype=dtype) - - # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method - depth = depth.repeat(batch_size, 1, 1, 1) - depth = torch.cat([depth] * 2) if do_classifier_free_guidance else depth - return depth - - def prepare_img2img_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None, image=None, timestep=None): - shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
- ) - - if latents is None: - rand_device = "cpu" if device.type == "mps" else device - - if isinstance(generator, list): - shape = (1,) + shape[1:] - latents = [ - torch.randn(shape, generator=generator[i], device=rand_device, dtype=dtype) - for i in range(batch_size) - ] - latents = torch.cat(latents, dim=0).to(device) - else: - latents = torch.randn(shape, generator=generator, device=rand_device, dtype=dtype).to(device) - else: - if latents.shape != shape: - raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") - latents = latents.to(device) - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * self.scheduler.init_noise_sigma - - if image is not None: - image = image.to(device=device, dtype=dtype) - if isinstance(generator, list): - image_latents = [ - self.vae.encode(image[0:1]).latent_dist.sample(generator[i]) for i in range(batch_size) - ] - image_latents = torch.cat(image_latents, dim=0) - else: - image_latents = self.vae.encode(image).latent_dist.sample(generator) - image_latents = torch.nn.functional.interpolate( - image_latents, size=(height // self.vae_scale_factor, width // self.vae_scale_factor) - ) - image_latents = 0.18215 * image_latents - rand_device = "cpu" if device.type == "mps" else device - shape = image_latents.shape - if isinstance(generator, list): - shape = (1,) + shape[1:] - noise = [ - torch.randn(shape, generator=generator[i], device=rand_device, dtype=dtype) for i in - range(batch_size) - ] - noise = torch.cat(noise, dim=0).to(device) - else: - noise = torch.randn(shape, generator=generator, device=rand_device, dtype=dtype).to(device) - latents = self.scheduler.add_noise(image_latents, noise, timestep) - - return latents - - - def get_timesteps(self, num_inference_steps, strength, device): - # get the original timestep using init_timestep - offset = self.scheduler.config.get("steps_offset", 0) - init_timestep = int(num_inference_steps * strength) + offset - init_timestep = min(init_timestep, num_inference_steps) - - t_start = max(num_inference_steps - init_timestep + offset, 0) - timesteps = self.scheduler.timesteps[t_start:] - - return timesteps, num_inference_steps - t_start - - @torch.no_grad() - def __call__( - self, - prompt: Union[str, List[str]], - depth_image: Union[torch.FloatTensor, PIL.Image.Image], - image: Optional[Union[torch.FloatTensor, PIL.Image.Image]] = None, - strength: float = 0.8, - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 50, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, - callback_steps: Optional[int] = 1, - **kwargs, - ): - - # 0. Default height and width to unet - height = height or self.unet.config.sample_size * self.vae_scale_factor - width = width or self.unet.config.sample_size * self.vae_scale_factor - - # 1. Check inputs - self.check_inputs(prompt, height, width, callback_steps) - - # 2. 
Define call parameters - batch_size = 1 if isinstance(prompt, str) else len(prompt) - device = self._execution_device - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - # 3. Encode input prompt - text_embeddings = self._encode_prompt( - prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt - ) - - # 4. Prepare the depth image - depth = self.prepare_depth(depth_image, image, text_embeddings.dtype, device) - - if image is not None and isinstance(image, PIL.Image.Image): - image = diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess(image) - - # 5. set timesteps - self.scheduler.set_timesteps(num_inference_steps, device=device) - timesteps = self.scheduler.timesteps - if image is not None: - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device) - - # 6. Prepare latent variables - num_channels_latents = self.vae.config.latent_channels - if image is not None: - latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) - latents = self.prepare_img2img_latents( - batch_size * num_images_per_prompt, - num_channels_latents, - height, - width, - text_embeddings.dtype, - device, - generator, - latents, - image, - latent_timestep - ) - else: - latents = self.prepare_latents( - batch_size * num_images_per_prompt, - num_channels_latents, - height, - width, - text_embeddings.dtype, - device, - generator, - latents, - ) - - # 7. Prepare mask latent variables - depth = self.prepare_depth_latents( - depth, - batch_size * num_images_per_prompt, - height, - width, - text_embeddings.dtype, - device, - generator, - do_classifier_free_guidance, - ) - - # 8. Check that sizes of mask, masked image and latents match - num_channels_depth = depth.shape[1] - if num_channels_latents + num_channels_depth != self.unet.config.in_channels: - raise ValueError( - f"Select a depth model, such as 'stabilityai/stable-diffusion-2-depth'" - ) - - # 9. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 10. 
Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - # NOTE: Modified to support disabling CFG - if do_classifier_free_guidance and (i / len(timesteps)) >= kwargs['cfg_end']: - do_classifier_free_guidance = False - text_embeddings = text_embeddings[text_embeddings.size(0) // 2:] - depth = depth[depth.size(0) // 2:] - # expand the latents if we are doing classifier free guidance - latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents - - # concat latents, mask, masked_image_latents in the channel dimension - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - latent_model_input = torch.cat([latent_model_input, depth], dim=1) - - # predict the noise residual - noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample - - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - # NOTE: Modified to yield the latents instead of calling a callback. - yield ImageGenerationResult.step_preview(self, kwargs['step_preview_mode'], width, height, latents, generator, i) - - # 11. Post-processing - image = self.decode_latents(latents) - - # TODO: Add UI to enable this. - # 12. Run safety checker - # image, has_nsfw_concept = self.run_safety_checker(image, device, text_embeddings.dtype) - - # Offload last model to CPU - if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: - self.final_offload_hook.offload() - - # NOTE: Modified to yield the decoded image as a numpy array. - yield ImageGenerationResult( - [np.asarray(PIL.ImageOps.flip(image).convert('RGBA'), dtype=np.float32) / 255. - for i, image in enumerate(self.numpy_to_pil(image))], - [gen.initial_seed() for gen in generator] if isinstance(generator, list) else [generator.initial_seed()], - num_inference_steps, - True - ) +) -> Generator[Future, None, None]: + future = Future() + yield future + + import diffusers + import torch + import PIL.Image + import PIL.ImageOps + + class DreamTexturesDepth2ImgPipeline(diffusers.StableDiffusionInpaintPipeline): + def prepare_depth(self, depth, image, dtype, device): + device = torch.device('cpu' if device.type == 'mps' else device.type) + if depth is None: + from transformers import DPTFeatureExtractor, DPTForDepthEstimation + import contextlib + feature_extractor = DPTFeatureExtractor.from_pretrained("Intel/dpt-large") + depth_estimator = DPTForDepthEstimation.from_pretrained("Intel/dpt-large") + depth_estimator = depth_estimator.to(device) + + pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values + pixel_values = pixel_values.to(device=device) + # The DPT-Hybrid model uses batch-norm layers which are not compatible with fp16. + # So we use `torch.autocast` here for half precision inference. 
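The `prepare_depth` method being added here falls back to monocular depth estimation with Intel/dpt-large when the caller supplies no depth map. Condensed into a standalone sketch (same model, interpolation and normalization as the code in this hunk; the function name and the CPU/fp32 assumption are mine):

```python
# Standalone sketch of the depth-estimation fallback in prepare_depth():
# estimate depth with Intel/dpt-large, resize to the latent resolution,
# and normalize to [-1, 1]. Function name and CPU/fp32 usage are assumptions.
import PIL.Image
import torch
from transformers import DPTFeatureExtractor, DPTForDepthEstimation

def estimate_depth(image: PIL.Image.Image, latent_height: int, latent_width: int) -> torch.Tensor:
    feature_extractor = DPTFeatureExtractor.from_pretrained("Intel/dpt-large")
    depth_estimator = DPTForDepthEstimation.from_pretrained("Intel/dpt-large")

    pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
    with torch.no_grad():
        depth_map = depth_estimator(pixel_values).predicted_depth  # shape (1, H', W')

    # Match the latent grid the UNet is conditioned on.
    depth_map = torch.nn.functional.interpolate(
        depth_map.unsqueeze(1), size=(latent_height, latent_width),
        mode="bicubic", align_corners=False,
    )

    # Rescale per image to [-1, 1], as the pipeline does.
    depth_min = torch.amin(depth_map, dim=[1, 2, 3], keepdim=True)
    depth_max = torch.amax(depth_map, dim=[1, 2, 3], keepdim=True)
    return 2.0 * (depth_map - depth_min) / (depth_max - depth_min) - 1.0
```

On CUDA the patch additionally wraps the forward pass in `torch.autocast`, as the comment above and the lines that follow show, since the DPT model's batch-norm layers are not compatible with plain fp16 inference.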
+ context_manger = torch.autocast("cuda", dtype=dtype) if device.type == "cuda" else contextlib.nullcontext() + with context_manger: + depth_map = depth_estimator(pixel_values).predicted_depth + depth_map = torch.nn.functional.interpolate( + depth_map.unsqueeze(1), + size=(height // self.vae_scale_factor, width // self.vae_scale_factor), + mode="bicubic", + align_corners=False, + ) + + depth_min = torch.amin(depth_map, dim=[1, 2, 3], keepdim=True) + depth_max = torch.amax(depth_map, dim=[1, 2, 3], keepdim=True) + depth_map = 2.0 * (depth_map - depth_min) / (depth_max - depth_min) - 1.0 + depth_map = depth_map.to(device) + return depth_map + else: + if isinstance(depth, PIL.Image.Image): + depth = np.array(depth.convert("L")) + depth = depth.astype(np.float32) / 255.0 + depth = depth[None, None] + depth = torch.from_numpy(depth) + return depth + + def prepare_depth_latents( + self, depth, batch_size, height, width, dtype, device, generator, do_classifier_free_guidance + ): + # resize the mask to latents shape as we concatenate the mask to the latents + # we do that before converting to dtype to avoid breaking in case we're using cpu_offload + # and half precision + depth = torch.nn.functional.interpolate( + depth, size=(height // self.vae_scale_factor, width // self.vae_scale_factor) + ) + depth = depth.to(device=device, dtype=dtype) + + # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method + depth = depth.repeat(batch_size, 1, 1, 1) + depth = torch.cat([depth] * 2) if do_classifier_free_guidance else depth + return depth + + def prepare_img2img_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None, image=None, timestep=None): + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + if latents is None: + rand_device = "cpu" if device.type == "mps" else device + + if isinstance(generator, list): + shape = (1,) + shape[1:] + latents = [ + torch.randn(shape, generator=generator[i], device=rand_device, dtype=dtype) + for i in range(batch_size) + ] + latents = torch.cat(latents, dim=0).to(device) + else: + latents = torch.randn(shape, generator=generator, device=rand_device, dtype=dtype).to(device) + else: + if latents.shape != shape: + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + + if image is not None: + image = image.to(device=device, dtype=dtype) + if isinstance(generator, list): + image_latents = [ + self.vae.encode(image[0:1]).latent_dist.sample(generator[i]) for i in range(batch_size) + ] + image_latents = torch.cat(image_latents, dim=0) + else: + image_latents = self.vae.encode(image).latent_dist.sample(generator) + image_latents = torch.nn.functional.interpolate( + image_latents, size=(height // self.vae_scale_factor, width // self.vae_scale_factor) + ) + image_latents = 0.18215 * image_latents + rand_device = "cpu" if device.type == "mps" else device + shape = image_latents.shape + if isinstance(generator, list): + shape = (1,) + shape[1:] + noise = [ + torch.randn(shape, generator=generator[i], device=rand_device, dtype=dtype) for i in + range(batch_size) + ] + noise = torch.cat(noise, dim=0).to(device) + else: + noise = torch.randn(shape, generator=generator, device=rand_device, dtype=dtype).to(device) + latents = self.scheduler.add_noise(image_latents, noise, timestep) + + return latents + + + def get_timesteps(self, num_inference_steps, strength, device): + # get the original timestep using init_timestep + offset = self.scheduler.config.get("steps_offset", 0) + init_timestep = int(num_inference_steps * strength) + offset + init_timestep = min(init_timestep, num_inference_steps) + + t_start = max(num_inference_steps - init_timestep + offset, 0) + timesteps = self.scheduler.timesteps[t_start:] + + return timesteps, num_inference_steps - t_start + + @torch.no_grad() + def __call__( + self, + prompt: Union[str, List[str]], + depth_image: Union[torch.FloatTensor, PIL.Image.Image], + image: Optional[Union[torch.FloatTensor, PIL.Image.Image]] = None, + strength: float = 0.8, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: Optional[int] = 1, + **kwargs, + ): - if optimizations.cpu_only: - device = "cpu" + # 0. Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + + # 1. Check inputs + self.check_inputs(prompt, height, width, strength, callback_steps) + + # 2. 
Define call parameters + batch_size = 1 if isinstance(prompt, str) else len(prompt) + device = self._execution_device + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. Encode input prompt + text_embeddings = self._encode_prompt( + prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt + ) + + # 4. Prepare the depth image + depth = self.prepare_depth(depth_image, image, text_embeddings.dtype, device) + + if image is not None and isinstance(image, PIL.Image.Image): + image = diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess(image) + + # 5. set timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + if image is not None: + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device) + + # 6. Prepare latent variables + num_channels_latents = self.vae.config.latent_channels + if image is not None: + latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) + latents = self.prepare_img2img_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + text_embeddings.dtype, + device, + generator, + latents, + image, + latent_timestep + ) else: - device = self.choose_device() - - # StableDiffusionPipeline w/ caching - pipe = load_pipe(self, "depth", GeneratorPipeline, model, optimizations, scheduler, device) - - # Optimizations - pipe = optimizations.apply(pipe, device) - - # RNG - batch_size = len(prompt) if isinstance(prompt, list) else 1 - generator = [] - for _ in range(batch_size): - gen = torch.Generator(device="cpu" if device in ("mps", "dml") else device) # MPS and DML do not support the `Generator` API - generator.append(gen.manual_seed(random.randrange(0, np.iinfo(np.uint32).max) if seed is None else seed)) - if batch_size == 1: - # Some schedulers don't handle a list of generators: https://github.com/huggingface/diffusers/issues/1909 - generator = generator[0] - - # Init Image - # FIXME: The `unet.config.sample_size` of the depth model is `32`, not `64`. For now, this will be hardcoded to `512`. - height = height or 512 - width = width or 512 - rounded_size = ( - int(8 * (width // 8)), - int(8 * (height // 8)), + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + text_embeddings.dtype, + device, + generator, + latents, + )[0] + + # 7. 
Prepare mask latent variables + depth = self.prepare_depth_latents( + depth, + batch_size * num_images_per_prompt, + height, + width, + text_embeddings.dtype, + device, + generator, + do_classifier_free_guidance, ) - depth_image = PIL.ImageOps.flip(PIL.Image.fromarray(np.uint8(depth * 255)).convert('L')).resize(rounded_size) if depth is not None else None - init_image = None if image is None else (PIL.Image.open(image) if isinstance(image, str) else PIL.Image.fromarray(image.astype(np.uint8))).convert('RGB').resize(rounded_size) - - # Seamless - if seamless_axes == SeamlessAxes.AUTO: - init_sa = None if init_image is None else self.detect_seamless(np.array(init_image) / 255) - depth_sa = None if depth_image is None else self.detect_seamless(np.array(depth_image.convert('RGB')) / 255) - if init_sa is not None and depth_sa is not None: - seamless_axes = SeamlessAxes((init_sa.x and depth_sa.x, init_sa.y and depth_sa.y)) - elif init_sa is not None: - seamless_axes = init_sa - elif depth_sa is not None: - seamless_axes = depth_sa - _configure_model_padding(pipe.unet, seamless_axes) - _configure_model_padding(pipe.vae, seamless_axes) - - # Inference - with torch.inference_mode() if device not in ('mps', "dml") else nullcontext(): - yield from pipe( - prompt=prompt, - depth_image=depth_image, - image=init_image, - strength=strength, - width=rounded_size[0], - height=rounded_size[1], - num_inference_steps=steps, - guidance_scale=cfg_scale, - negative_prompt=negative_prompt if use_negative_prompt else None, - num_images_per_prompt=1, - eta=0.0, - generator=generator, - latents=None, - output_type="pil", - return_dict=True, - callback=None, - callback_steps=1, - step_preview_mode=step_preview_mode, - cfg_end=optimizations.cfg_end + + # 8. Check that sizes of mask, masked image and latents match + num_channels_depth = depth.shape[1] + if num_channels_latents + num_channels_depth != self.unet.config.in_channels: + raise ValueError( + f"Select a depth model, such as 'stabilityai/stable-diffusion-2-depth'" ) - case Pipeline.STABILITY_SDK: - import stability_sdk - raise NotImplementedError() - case _: - raise Exception(f"Unsupported pipeline {pipeline}.") \ No newline at end of file + + # 9. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 10. 
Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + + # concat latents, mask, masked_image_latents in the channel dimension + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + latent_model_input = torch.cat([latent_model_input, depth], dim=1) + + # predict the noise residual + noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + callback(i, t, latents) + + if not output_type == "latent": + condition_kwargs = {} + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, **condition_kwargs)[0] + image, has_nsfw_concept = self.run_safety_checker(image, device, text_embeddings.dtype) + else: + image = latents + has_nsfw_concept = None + + if has_nsfw_concept is None: + do_denormalize = [True] * image.shape[0] + else: + do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] + + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) + + # Offload last model to CPU + if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: + self.final_offload_hook.offload() + + if not return_dict: + return (image, has_nsfw_concept) + + return diffusers.pipelines.stable_diffusion.StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) + + device = self.choose_device(optimizations) + + # StableDiffusionPipeline w/ caching + pipe = self.load_model(DreamTexturesDepth2ImgPipeline, model, optimizations, scheduler) + + # Optimizations + pipe = optimizations.apply(pipe, device) + + # RNG + batch_size = len(prompt) if isinstance(prompt, list) else 1 + generator = [] + for _ in range(batch_size): + gen = torch.Generator(device="cpu" if device in ("mps", "dml") else device) # MPS and DML do not support the `Generator` API + generator.append(gen.manual_seed(random.randrange(0, np.iinfo(np.uint32).max) if seed is None else seed)) + if batch_size == 1: + # Some schedulers don't handle a list of generators: https://github.com/huggingface/diffusers/issues/1909 + generator = generator[0] + + # Init Image + # FIXME: The `unet.config.sample_size` of the depth model is `32`, not `64`. For now, this will be hardcoded to `512`. 
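A small note on the size handling that follows: the requested width and height are rounded down to multiples of 8 so they map cleanly onto the VAE's latent grid (Stable Diffusion's `vae_scale_factor` is 8). As a sketch, with an illustrative helper name:

```python
# Round a requested render size down to the latent grid used by the VAE.
# For Stable Diffusion the scale factor is 8, matching the `8 * (x // 8)` below.
def round_to_latent_grid(width: int, height: int, factor: int = 8) -> tuple[int, int]:
    return factor * (width // factor), factor * (height // factor)

assert round_to_latent_grid(513, 769) == (512, 768)
```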
+ height = height or 512 + width = width or 512 + rounded_size = ( + int(8 * (width // 8)), + int(8 * (height // 8)), + ) + depth_image = PIL.ImageOps.flip(PIL.Image.fromarray(np.uint8(depth * 255)).convert('L')).resize(rounded_size) if depth is not None else None + init_image = None if image is None else (PIL.Image.open(image) if isinstance(image, str) else PIL.Image.fromarray(image.astype(np.uint8))).convert('RGB').resize(rounded_size) + + # Seamless + if seamless_axes == SeamlessAxes.AUTO: + init_sa = None if init_image is None else self.detect_seamless(np.array(init_image) / 255) + depth_sa = None if depth_image is None else self.detect_seamless(np.array(depth_image.convert('RGB')) / 255) + if init_sa is not None and depth_sa is not None: + seamless_axes = SeamlessAxes((init_sa.x and depth_sa.x, init_sa.y and depth_sa.y)) + elif init_sa is not None: + seamless_axes = init_sa + elif depth_sa is not None: + seamless_axes = depth_sa + _configure_model_padding(pipe.unet, seamless_axes) + _configure_model_padding(pipe.vae, seamless_axes) + + # Inference + with torch.inference_mode() if device not in ('mps', "dml") else nullcontext(): + def callback(step, timestep, latents): + if future.check_cancelled(): + raise InterruptedError() + future.add_response(ImageGenerationResult.step_preview(self, step_preview_mode, width, height, latents, generator, step)) + try: + result = pipe( + prompt=prompt, + negative_prompt=negative_prompt if use_negative_prompt else None, + depth_image=depth_image, + image=init_image, + strength=strength, + width=rounded_size[0], + height=rounded_size[1], + num_inference_steps=steps, + guidance_scale=cfg_scale, + generator=generator, + callback=callback + ) + + future.add_response(ImageGenerationResult( + [np.asarray(PIL.ImageOps.flip(image).convert('RGBA'), dtype=np.float32) / 255. + for image in result.images], + [gen.initial_seed() for gen in generator] if isinstance(generator, list) else [generator.initial_seed()], + steps, + True + )) + except InterruptedError: + pass + + future.set_done() \ No newline at end of file diff --git a/generator_process/actions/detect_seamless/__init__.py b/generator_process/actions/detect_seamless/__init__.py index 221e55d5..48cba657 100644 --- a/generator_process/actions/detect_seamless/__init__.py +++ b/generator_process/actions/detect_seamless/__init__.py @@ -3,81 +3,7 @@ import numpy as np from numpy.typing import NDArray - -class SeamlessAxes(Enum): - """Unified handling of seamless axes. - Can be converted from str (id or text) or bool tuple/list (x, y). - Each enum is equal to their respective convertible values. 
- Special cases: - AUTO: None - OFF: False, empty str - BOTH: True - """ - - AUTO = 'auto', 'Auto-detect', None, None - OFF = 'off', 'Off', False, False - HORIZONTAL = 'x', 'X', True, False - VERTICAL = 'y', 'Y', False, True - BOTH = 'xy', 'Both', True, True - - def __init__(self, id, text, x, y): - self.id = id - self.text = text - self.x = x - self.y = y - - def __eq__(self, other): - if isinstance(other, type(self)): - return self is other - if isinstance(other, str): - return self.id == other or self.text == other or (other == '' and self is self.OFF) - if isinstance(other, (tuple, list)) and len(other) == 2: - return self.x == other[0] and self.y == other[1] - if other is True and self is self.BOTH: - return True - if other is False and self is self.OFF: - return True - if other is None and self is self.AUTO: - return True - return False - - def __and__(self, other): - return SeamlessAxes((self.x and other.x, self.y and other.y)) - - def __or__(self, other): - return SeamlessAxes((self.x or other.x, self.y or other.y)) - - def __xor__(self, other): - return SeamlessAxes((self.x != other.x, self.y != other.y)) - - def __invert__(self): - return SeamlessAxes((not self.x, not self.y)) - - @classmethod - def _missing_(cls, value): - if isinstance(value, str): - if value == '': - return cls.OFF - for e in cls: - if e.id == value or e.text == value: - return e - raise ValueError(f'no {cls.__name__} with id {repr(id)}') - elif isinstance(value, (tuple, list)) and len(value) == 2: - for e in cls: - if e.x == value[0] and e.y == value[1]: - return e - raise ValueError(f'no {cls.__name__} with x {value[0]} and y {value[1]}') - elif value is True: - return cls.BOTH - elif value is False: - return cls.OFF - elif value is None: - return cls.AUTO - raise TypeError(f'expected str, bool, tuple[bool, bool], or None, got {repr(value)}') - - def bpy_enum(self, *args): - return self.id, self.text, *args - +from ....api.models.seamless_axes import SeamlessAxes def detect_seamless(self, image: NDArray) -> SeamlessAxes: import os diff --git a/generator_process/actions/huggingface_hub.py b/generator_process/actions/huggingface_hub.py index ebaba892..13b78217 100644 --- a/generator_process/actions/huggingface_hub.py +++ b/generator_process/actions/huggingface_hub.py @@ -15,39 +15,8 @@ import json import enum from ..future import Future +from ..models import ModelType -class ModelType(enum.IntEnum): - """ - Inferred model type from the U-Net `in_channels`. - """ - UNKNOWN = 0 - PROMPT_TO_IMAGE = 4 - DEPTH = 5 - UPSCALING = 7 - INPAINTING = 9 - - CONTROL_NET = -1 - - @classmethod - def _missing_(cls, _): - return cls.UNKNOWN - - def recommended_model(self) -> str: - """Provides a recommended model for a given task. - - This method has a bias towards the latest version of official Stability AI models. 
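Context for the ModelType enum being relocated in this hunk: its values are the U-Net's `in_channels`, so a cached diffusers snapshot can be classified by reading that single field. A rough sketch of the idea (the `unet/config.json` path, the function name and the error handling are assumptions, not the add-on's actual `detect_model_type`):

```python
# Read the U-Net's in_channels from a diffusers snapshot to classify the model:
# 4 = prompt-to-image, 5 = depth, 7 = upscaling, 9 = inpainting, else unknown.
import json
import os

def infer_in_channels(snapshot_folder: str) -> int:
    config_path = os.path.join(snapshot_folder, "unet", "config.json")
    try:
        with open(config_path) as config_file:
            return int(json.load(config_file).get("in_channels", 0))
    except (FileNotFoundError, ValueError):
        return 0  # maps to ModelType.UNKNOWN
```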
- """ - match self: - case ModelType.PROMPT_TO_IMAGE: - return "stabilityai/stable-diffusion-2-1" - case ModelType.DEPTH: - return "stabilityai/stable-diffusion-2-depth" - case ModelType.UPSCALING: - return "stabilityai/stable-diffusion-x4-upscaler" - case ModelType.INPAINTING: - return "stabilityai/stable-diffusion-2-inpainting" - case _: - return "stabilityai/stable-diffusion-2-1" @dataclass class Model: @@ -111,7 +80,7 @@ def _map_model(file): storage_folder = os.path.join(cache_dir, file) model_type = ModelType.UNKNOWN - if os.path.exists(os.path.join(storage_folder, 'model_index.json')): + if os.path.exists(os.path.join(storage_folder, 'model_index.json')) or os.path.exists(os.path.join(storage_folder, 'config.json')): snapshot_folder = storage_folder model_type = detect_model_type(snapshot_folder) else: @@ -154,69 +123,82 @@ class DownloadStatus: index: int total: int + @classmethod + def hook_download_tqdm(cls, future): + from huggingface_hub import utils, file_download + progresses = set() + + class future_tqdm(utils.tqdm): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.progress() + + def update(self, n=1): + ret = super().update(n=n) + self.progress() + return ret + + def progress(self): + nonlocal progresses + progresses.add(self) + ratio = self.n / self.total + count = 0 + for tqdm in progresses: + r = tqdm.n / tqdm.total + if r == 1: + continue + count += 1 + if tqdm != self and ratio < r: + # only show download status of most complete file + return + future.add_response(cls(f"{count} file{'' if count == 1 else 's'}: {self.desc}", self.n, self.total)) + file_download.tqdm = future_tqdm + def hf_snapshot_download( self, model: str, token: str, - revision: str | None = None + variant: str | None = None, + resume_download=True ): - from huggingface_hub import utils + from huggingface_hub import snapshot_download, repo_info + from diffusers import StableDiffusionPipeline future = Future() yield future + DownloadStatus.hook_download_tqdm(future) - class future_tqdm(utils.tqdm): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - future.add_response(DownloadStatus(self.desc, 0, self.total)) + info = repo_info(model, token=token) + files = [file.rfilename for file in info.siblings] - def update(self, n=1): - future.add_response(DownloadStatus(self.desc, self.last_print_n + n, self.total)) - return super().update(n=n) - - from huggingface_hub import file_download - file_download.tqdm = future_tqdm - from huggingface_hub import _snapshot_download - - from diffusers import StableDiffusionPipeline - from diffusers.utils import DIFFUSERS_CACHE, WEIGHTS_NAME, CONFIG_NAME, ONNX_WEIGHTS_NAME - from diffusers.schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME - - try: - config_dict = StableDiffusionPipeline.load_config( + if "model_index.json" in files: + StableDiffusionPipeline.download( model, - cache_dir=DIFFUSERS_CACHE, - resume_download=True, - force_download=False, - use_auth_token=token + use_auth_token=token, + variant=variant, + resume_download=resume_download, ) - folder_names = [k for k in config_dict.keys() if not k.startswith("_")] - allow_patterns = [os.path.join(k, "*") for k in folder_names] - allow_patterns += [WEIGHTS_NAME, SCHEDULER_CONFIG_NAME, CONFIG_NAME, ONNX_WEIGHTS_NAME, StableDiffusionPipeline.config_name] - except: - allow_patterns = None - - # make sure we don't download flax, safetensors, or ckpt weights. 
- ignore_patterns = ["*.msgpack", "*.safetensors", "*.ckpt"] + elif "config.json" in files: + # individual model, such as controlnet or vae - try: - _snapshot_download.snapshot_download( - model, - cache_dir=DIFFUSERS_CACHE, - token=token, - revision=revision, - resume_download=True, - allow_patterns=allow_patterns, - ignore_patterns=ignore_patterns - ) - except utils._errors.RevisionNotFoundError: - _snapshot_download.snapshot_download( + fp16_weights = ["diffusion_pytorch_model.fp16.safetensors", "diffusion_pytorch_model.fp16.bin"] + fp32_weights = ["diffusion_pytorch_model.safetensors", "diffusion_pytorch_model.bin"] + if variant == "fp16": + weights_names = fp16_weights + fp32_weights + else: + weights_names = fp32_weights + fp16_weights + + weights = next((name for name in weights_names if name in files), None) + if weights is None: + raise FileNotFoundError(f"Can't find appropriate weights in {model}") + + snapshot_download( model, - cache_dir=DIFFUSERS_CACHE, token=token, - resume_download=True, - allow_patterns=allow_patterns, - ignore_patterns=ignore_patterns + resume_download=resume_download, + allow_patterns=["config.json", weights] ) + else: + raise ValueError(f"{model} doesn't appear to be a pipeline or model") future.set_done() \ No newline at end of file diff --git a/generator_process/actions/image_to_image.py b/generator_process/actions/image_to_image.py index 59373eb0..8a60a4f1 100644 --- a/generator_process/actions/image_to_image.py +++ b/generator_process/actions/image_to_image.py @@ -5,18 +5,17 @@ from numpy.typing import NDArray import numpy as np import random -from .prompt_to_image import Scheduler, Optimizations, StepPreviewMode, ImageGenerationResult, _configure_model_padding, model_snapshot_folder, load_pipe -from ..models import Pipeline -from .detect_seamless import SeamlessAxes +from .prompt_to_image import Checkpoint, Scheduler, Optimizations, StepPreviewMode, ImageGenerationResult, _configure_model_padding +from ...api.models.seamless_axes import SeamlessAxes +from ..future import Future def image_to_image( self, - pipeline: Pipeline, - model: str, + model: str | Checkpoint, - scheduler: Scheduler, + scheduler: str | Scheduler, optimizations: Optimizations, @@ -41,206 +40,75 @@ def image_to_image( key: str | None = None, **kwargs -) -> Generator[ImageGenerationResult, None, None]: - match pipeline: - case Pipeline.STABLE_DIFFUSION: - import diffusers - import torch - from PIL import Image, ImageOps - import PIL.Image - - # Mostly copied from `diffusers.StableDiffusionImg2ImgPipeline`, with slight modifications to yield the latents at each step. - class GeneratorPipeline(diffusers.StableDiffusionImg2ImgPipeline): - @torch.no_grad() - def __call__( - self, - prompt: Union[str, List[str]], - image: Union[torch.FloatTensor, PIL.Image.Image], - strength: float = 0.8, - num_inference_steps: Optional[int] = 50, - guidance_scale: Optional[float] = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: Optional[float] = 0.0, - generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, - callback_steps: Optional[int] = 1, - **kwargs, - ): - # 1. Check inputs - self.check_inputs(prompt, strength, callback_steps) - - # 2. 
Define call parameters - batch_size = 1 if isinstance(prompt, str) else len(prompt) - device = self._execution_device - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - # 3. Encode input prompt - text_embeddings = self._encode_prompt( - prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt - ) - - # 4. Preprocess image - image = diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess(image) - - # 5. set timesteps - self.scheduler.set_timesteps(num_inference_steps, device=device) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device) - latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) - - # 6. Prepare latent variables - latents = self.prepare_latents( - image, latent_timestep, batch_size, num_images_per_prompt, text_embeddings.dtype, device, generator - ) - - # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 8. Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - # NOTE: Modified to support disabling CFG - if do_classifier_free_guidance and (i / len(timesteps)) >= kwargs['cfg_end']: - do_classifier_free_guidance = False - text_embeddings = text_embeddings[text_embeddings.size(0) // 2:] - # expand the latents if we are doing classifier free guidance - latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - # predict the noise residual - noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample - - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - # NOTE: Modified to yield the latents instead of calling a callback. - yield ImageGenerationResult.step_preview(self, kwargs['step_preview_mode'], width, height, latents, generator, i) - - # 9. Post-processing - image = self.decode_latents(latents) - - # TODO: Add UI to enable this - # 10. Run safety checker - # image, has_nsfw_concept = self.run_safety_checker(image, device, text_embeddings.dtype) - - # Offload last model to CPU - if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: - self.final_offload_hook.offload() - - # NOTE: Modified to yield the decoded image as a numpy array. - yield ImageGenerationResult( - [np.asarray(ImageOps.flip(image).convert('RGBA'), dtype=np.float32) / 255. 
- for i, image in enumerate(self.numpy_to_pil(image))], - [gen.initial_seed() for gen in generator] if isinstance(generator, list) else [generator.initial_seed()], - num_inference_steps, - True - ) - - if optimizations.cpu_only: - device = "cpu" - else: - device = self.choose_device() - - # StableDiffusionPipeline w/ caching - pipe = load_pipe(self, "modify", GeneratorPipeline, model, optimizations, scheduler, device) - - # Optimizations - pipe = optimizations.apply(pipe, device) - - # RNG - batch_size = len(prompt) if isinstance(prompt, list) else 1 - generator = [] - for _ in range(batch_size): - gen = torch.Generator(device="cpu" if device in ("mps", "dml") else device) # MPS and DML do not support the `Generator` API - generator.append(gen.manual_seed(random.randrange(0, np.iinfo(np.uint32).max) if seed is None else seed)) - if batch_size == 1: - # Some schedulers don't handle a list of generators: https://github.com/huggingface/diffusers/issues/1909 - generator = generator[0] - - # Init Image - init_image = Image.fromarray(image).convert('RGB') - - if fit: - height = height or pipe.unet.config.sample_size * pipe.vae_scale_factor - width = width or pipe.unet.config.sample_size * pipe.vae_scale_factor - init_image = init_image.resize((width, height)) - else: - width = init_image.width - height = init_image.height - - # Seamless - if seamless_axes == SeamlessAxes.AUTO: - seamless_axes = self.detect_seamless(np.array(init_image) / 255) - _configure_model_padding(pipe.unet, seamless_axes) - _configure_model_padding(pipe.vae, seamless_axes) - - # Inference - with torch.inference_mode() if device not in ('mps', "dml") else nullcontext(): - yield from pipe( - prompt=prompt, - image=[init_image] * batch_size, - strength=strength, - num_inference_steps=steps, - guidance_scale=cfg_scale, - negative_prompt=negative_prompt if use_negative_prompt else None, - num_images_per_prompt=1, - eta=0.0, - generator=generator, - output_type="pil", - return_dict=True, - callback=None, - callback_steps=1, - step_preview_mode=step_preview_mode, - cfg_end=optimizations.cfg_end - ) - case Pipeline.STABILITY_SDK: - import stability_sdk.client - import stability_sdk.interfaces.gooseai.generation.generation_pb2 - from PIL import Image, ImageOps - import io - - if key is None: - raise ValueError("DreamStudio key not provided. 
Enter your key in the add-on preferences.") - client = stability_sdk.client.StabilityInference(key=key, engine=model) - - if seed is None: - seed = random.randrange(0, np.iinfo(np.uint32).max) - - answers = client.generate( +) -> Generator[Future, None, None]: + future = Future() + yield future + + import diffusers + import torch + from PIL import Image, ImageOps + import PIL.Image + + device = self.choose_device(optimizations) + + # Stable Diffusion pipeline w/ caching + pipe = self.load_model(diffusers.AutoPipelineForImage2Image, model, optimizations, scheduler) + + # Optimizations + pipe = optimizations.apply(pipe, device) + + # RNG + batch_size = len(prompt) if isinstance(prompt, list) else 1 + generator = [] + for _ in range(batch_size): + gen = torch.Generator(device="cpu" if device in ("mps", "dml") else device) # MPS and DML do not support the `Generator` API + generator.append(gen.manual_seed(random.randrange(0, np.iinfo(np.uint32).max) if seed is None else seed)) + if batch_size == 1: + # Some schedulers don't handle a list of generators: https://github.com/huggingface/diffusers/issues/1909 + generator = generator[0] + + # Init Image + init_image = Image.fromarray(image).convert('RGB') + + if fit: + height = height or pipe.unet.config.sample_size * pipe.vae_scale_factor + width = width or pipe.unet.config.sample_size * pipe.vae_scale_factor + init_image = init_image.resize((width, height)) + else: + width = init_image.width + height = init_image.height + + # Seamless + if seamless_axes == SeamlessAxes.AUTO: + seamless_axes = self.detect_seamless(np.array(init_image) / 255) + _configure_model_padding(pipe.unet, seamless_axes) + _configure_model_padding(pipe.vae, seamless_axes) + + # Inference + with torch.inference_mode() if device not in ('mps', "dml") else nullcontext(): + def callback(step, timestep, latents): + if future.check_cancelled(): + raise InterruptedError() + future.add_response(ImageGenerationResult.step_preview(self, step_preview_mode, width, height, latents, generator, step)) + try: + result = pipe( prompt=prompt, - width=width, - height=height, - cfg_scale=cfg_scale, - sampler=scheduler.stability_sdk(), - steps=steps, - seed=seed, - init_image=(Image.open(image) if isinstance(image, str) else Image.fromarray(image)).convert('RGB'), - start_schedule=strength, + negative_prompt=negative_prompt if use_negative_prompt else None, + image=[init_image] * batch_size, + strength=strength, + num_inference_steps=steps, + guidance_scale=cfg_scale, + generator=generator, + callback=callback ) - for answer in answers: - for artifact in answer.artifacts: - if artifact.finish_reason == stability_sdk.interfaces.gooseai.generation.generation_pb2.FILTER: - raise ValueError("Your request activated DreamStudio's safety filter. Please modify your prompt and try again.") - if artifact.type == stability_sdk.interfaces.gooseai.generation.generation_pb2.ARTIFACT_IMAGE: - image = Image.open(io.BytesIO(artifact.binary)) - yield ImageGenerationResult( - [np.asarray(ImageOps.flip(image).convert('RGBA'), dtype=np.float32) / 255.], - [seed], - steps, - True - ) - case _: - raise Exception(f"Unsupported pipeline {pipeline}.") \ No newline at end of file + future.add_response(ImageGenerationResult( + [np.asarray(ImageOps.flip(image).convert('RGBA'), dtype=np.float32) / 255. 
+ for image in result.images], + [gen.initial_seed() for gen in generator] if isinstance(generator, list) else [generator.initial_seed()], + steps, + True + )) + except InterruptedError: + pass + + future.set_done() \ No newline at end of file diff --git a/generator_process/actions/inpaint.py b/generator_process/actions/inpaint.py index a3977064..b163db58 100644 --- a/generator_process/actions/inpaint.py +++ b/generator_process/actions/inpaint.py @@ -4,16 +4,16 @@ from numpy.typing import NDArray import numpy as np import random -from .prompt_to_image import Pipeline, Scheduler, Optimizations, StepPreviewMode, ImageGenerationResult, _configure_model_padding, model_snapshot_folder, load_pipe -from .detect_seamless import SeamlessAxes +from .prompt_to_image import Checkpoint, Scheduler, Optimizations, StepPreviewMode, ImageGenerationResult, _configure_model_padding +from ...api.models.seamless_axes import SeamlessAxes +from ..future import Future def inpaint( self, - pipeline: Pipeline, - model: str, + model: str | Checkpoint, - scheduler: Scheduler, + scheduler: str | Scheduler, optimizations: Optimizations, @@ -45,257 +45,84 @@ def inpaint( **kwargs ) -> Generator[NDArray, None, None]: - match pipeline: - case Pipeline.STABLE_DIFFUSION: - import diffusers - import torch - from PIL import Image, ImageOps - import PIL.Image + future = Future() + yield future - # Mostly copied from `diffusers.StableDiffusionInpaintPipeline`, with slight modifications to yield the latents at each step. - class GeneratorPipeline(diffusers.StableDiffusionInpaintPipeline): - @torch.no_grad() - def __call__( - self, - prompt: Union[str, List[str]], - image: Union[torch.FloatTensor, PIL.Image.Image], - mask_image: Union[torch.FloatTensor, PIL.Image.Image], - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 50, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, - callback_steps: Optional[int] = 1, - **kwargs, - ): - # 0. Default height and width to unet - height = height or self.unet.config.sample_size * self.vae_scale_factor - width = width or self.unet.config.sample_size * self.vae_scale_factor - - # 1. Check inputs - self.check_inputs(prompt, height, width, callback_steps) - - # 2. Define call parameters - batch_size = 1 if isinstance(prompt, str) else len(prompt) - device = self._execution_device - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - # 3. Encode input prompt - text_embeddings = self._encode_prompt( - prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt - ) - - # 4. Preprocess mask and image - mask, masked_image = diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint.prepare_mask_and_masked_image(image, mask_image) - - # 5. set timesteps - self.scheduler.set_timesteps(num_inference_steps, device=device) - timesteps = self.scheduler.timesteps - - # 6. 
Prepare latent variables - num_channels_latents = self.vae.config.latent_channels - latents = self.prepare_latents( - batch_size * num_images_per_prompt, - num_channels_latents, - height, - width, - text_embeddings.dtype, - device, - generator, - latents, - ) - - # 7. Prepare mask latent variables - mask, masked_image_latents = self.prepare_mask_latents( - mask, - masked_image, - batch_size * num_images_per_prompt, - height, - width, - text_embeddings.dtype, - device, - generator, - do_classifier_free_guidance, - ) - - # 8. Check that sizes of mask, masked image and latents match - num_channels_mask = mask.shape[1] - num_channels_masked_image = masked_image_latents.shape[1] - if num_channels_latents + num_channels_mask + num_channels_masked_image != self.unet.config.in_channels: - raise ValueError( - f"Select an inpainting model, such as 'stabilityai/stable-diffusion-2-inpainting'" - ) - - # 9. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 10. Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - # NOTE: Modified to support disabling CFG - if do_classifier_free_guidance and (i / len(timesteps)) >= kwargs['cfg_end']: - do_classifier_free_guidance = False - text_embeddings = text_embeddings[text_embeddings.size(0) // 2:] - mask = mask[mask.size(0) // 2:] - masked_image_latents = masked_image_latents[masked_image_latents.size(0) // 2:] - # expand the latents if we are doing classifier free guidance - latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents - - # concat latents, mask, masked_image_latents in the channel dimension - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1) - - # predict the noise residual - noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample - - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - # NOTE: Modified to yield the latents instead of calling a callback. - yield ImageGenerationResult.step_preview(self, kwargs['step_preview_mode'], width, height, latents, generator, i) - - # 11. Post-processing - image = self.decode_latents(latents) - - # TODO: Add UI to enable this - # 10. Run safety checker - # image, has_nsfw_concept = self.run_safety_checker(image, device, text_embeddings.dtype) - - # Offload last model to CPU - if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: - self.final_offload_hook.offload() - - # NOTE: Modified to yield the decoded image as a numpy array. - yield ImageGenerationResult( - [np.asarray(ImageOps.flip(image).convert('RGBA'), dtype=np.float32) / 255. 
- for i, image in enumerate(self.numpy_to_pil(image))], - [gen.initial_seed() for gen in generator] if isinstance(generator, list) else [generator.initial_seed()], - num_inference_steps, - True - ) - - if optimizations.cpu_only: - device = "cpu" - else: - device = self.choose_device() - - # StableDiffusionPipeline w/ caching - pipe = load_pipe(self, "inpaint", GeneratorPipeline, model, optimizations, scheduler, device) - - # Optimizations - pipe = optimizations.apply(pipe, device) - - # RNG - batch_size = len(prompt) if isinstance(prompt, list) else 1 - generator = [] - for _ in range(batch_size): - gen = torch.Generator(device="cpu" if device in ("mps", "dml") else device) # MPS and DML do not support the `Generator` API - generator.append(gen.manual_seed(random.randrange(0, np.iinfo(np.uint32).max) if seed is None else seed)) - if batch_size == 1: - # Some schedulers don't handle a list of generators: https://github.com/huggingface/diffusers/issues/1909 - generator = generator[0] - - # Init Image - init_image = Image.fromarray(image) - - # Seamless - if seamless_axes == SeamlessAxes.AUTO: - seamless_axes = self.detect_seamless(np.array(init_image) / 255) - _configure_model_padding(pipe.unet, seamless_axes) - _configure_model_padding(pipe.vae, seamless_axes) - - # Inference - with torch.inference_mode() if device not in ('mps', "dml") else nullcontext(): - match inpaint_mask_src: - case 'alpha': - mask_image = ImageOps.invert(init_image.getchannel('A')) - case 'prompt': - from transformers import AutoProcessor, CLIPSegForImageSegmentation - - processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined") - clipseg = CLIPSegForImageSegmentation.from_pretrained("CIDAS/clipseg-rd64-refined") - inputs = processor(text=[text_mask], images=[init_image.convert('RGB')], return_tensors="pt", padding=True) - outputs = clipseg(**inputs) - mask_image = Image.fromarray(np.uint8((1 - torch.sigmoid(outputs.logits).lt(text_mask_confidence).int().detach().numpy()) * 255), 'L').resize(init_image.size) - - yield from pipe( - prompt=prompt, - image=[init_image.convert('RGB')] * batch_size, - mask_image=[mask_image] * batch_size, - strength=strength, - height=init_image.size[1] if fit else height, - width=init_image.size[0] if fit else width, - num_inference_steps=steps, - guidance_scale=cfg_scale, - negative_prompt=negative_prompt if use_negative_prompt else None, - num_images_per_prompt=1, - eta=0.0, - generator=generator, - latents=None, - output_type="pil", - return_dict=True, - callback=None, - callback_steps=1, - step_preview_mode=step_preview_mode, - cfg_end=optimizations.cfg_end - ) - case Pipeline.STABILITY_SDK: - import stability_sdk.client - import stability_sdk.interfaces.gooseai.generation.generation_pb2 - from PIL import Image, ImageOps - import io - - if key is None: - raise ValueError("DreamStudio key not provided. 
Enter your key in the add-on preferences.") - client = stability_sdk.client.StabilityInference(key=key, engine=model) - - if seed is None: - seed = random.randrange(0, np.iinfo(np.uint32).max) - - init_image = Image.open(image) if isinstance(image, str) else Image.fromarray(image) - - answers = client.generate( + import diffusers + import torch + from PIL import Image, ImageOps + import PIL.Image + + device = self.choose_device(optimizations) + + # StableDiffusionPipeline w/ caching + pipe = self.load_model(diffusers.AutoPipelineForInpainting, model, optimizations, scheduler) + height = height or pipe.unet.config.sample_size * pipe.vae_scale_factor + width = width or pipe.unet.config.sample_size * pipe.vae_scale_factor + + # Optimizations + pipe = optimizations.apply(pipe, device) + + # RNG + batch_size = len(prompt) if isinstance(prompt, list) else 1 + generator = [] + for _ in range(batch_size): + gen = torch.Generator(device="cpu" if device in ("mps", "dml") else device) # MPS and DML do not support the `Generator` API + generator.append(gen.manual_seed(random.randrange(0, np.iinfo(np.uint32).max) if seed is None else seed)) + if batch_size == 1: + # Some schedulers don't handle a list of generators: https://github.com/huggingface/diffusers/issues/1909 + generator = generator[0] + + # Init Image + init_image = Image.fromarray(image) + + # Seamless + if seamless_axes == SeamlessAxes.AUTO: + seamless_axes = self.detect_seamless(np.array(init_image) / 255) + _configure_model_padding(pipe.unet, seamless_axes) + _configure_model_padding(pipe.vae, seamless_axes) + + # Inference + with torch.inference_mode() if device not in ('mps', "dml") else nullcontext(): + match inpaint_mask_src: + case 'alpha': + mask_image = ImageOps.invert(init_image.getchannel('A')) + case 'prompt': + from transformers import AutoProcessor, CLIPSegForImageSegmentation + + processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined") + clipseg = CLIPSegForImageSegmentation.from_pretrained("CIDAS/clipseg-rd64-refined") + inputs = processor(text=[text_mask], images=[init_image.convert('RGB')], return_tensors="pt", padding=True) + outputs = clipseg(**inputs) + mask_image = Image.fromarray(np.uint8((1 - torch.sigmoid(outputs.logits).lt(text_mask_confidence).int().detach().numpy()) * 255), 'L').resize(init_image.size) + + def callback(step, timestep, latents): + if future.check_cancelled(): + raise InterruptedError() + future.add_response(ImageGenerationResult.step_preview(self, step_preview_mode, width, height, latents, generator, step)) + try: + result = pipe( prompt=prompt, - width=width or 512, - height=height or 512, - cfg_scale=cfg_scale, - sampler=scheduler.stability_sdk(), - steps=steps, - seed=seed, - init_image=init_image.convert('RGB'), - mask_image=init_image.getchannel('A'), - start_schedule=strength, + negative_prompt=negative_prompt if use_negative_prompt else None, + image=[init_image.convert('RGB')] * batch_size, + mask_image=[mask_image] * batch_size, + strength=strength, + height=init_image.size[1] if fit else height, + width=init_image.size[0] if fit else width, + num_inference_steps=steps, + guidance_scale=cfg_scale, + generator=generator, + callback=callback ) - for answer in answers: - for artifact in answer.artifacts: - if artifact.finish_reason == stability_sdk.interfaces.gooseai.generation.generation_pb2.FILTER: - raise ValueError("Your request activated DreamStudio's safety filter. 
Please modify your prompt and try again.") - if artifact.type == stability_sdk.interfaces.gooseai.generation.generation_pb2.ARTIFACT_IMAGE: - image = Image.open(io.BytesIO(artifact.binary)) - yield ImageGenerationResult( - [np.asarray(ImageOps.flip(image).convert('RGBA'), dtype=np.float32) / 255.], - [seed], - steps, - True - ) - - case _: - raise Exception(f"Unsupported pipeline {pipeline}.") \ No newline at end of file + + future.add_response(ImageGenerationResult( + [np.asarray(ImageOps.flip(image).convert('RGBA'), dtype=np.float32) / 255. + for image in result.images], + [gen.initial_seed() for gen in generator] if isinstance(generator, list) else [generator.initial_seed()], + steps, + True + )) + except InterruptedError: + pass + + future.set_done() \ No newline at end of file diff --git a/generator_process/actions/load_model.py b/generator_process/actions/load_model.py new file mode 100644 index 00000000..b930fa4a --- /dev/null +++ b/generator_process/actions/load_model.py @@ -0,0 +1,239 @@ +import gc +import logging +import os +from ..models import Checkpoint, ModelConfig, Scheduler + +logger = logging.getLogger(__name__) + + +def revision_paths(model, config="model_index.json"): + from diffusers.utils import DIFFUSERS_CACHE + + is_repo = "/" in model + if os.path.exists(os.path.join(model, config)): + is_repo = False + elif not is_repo and os.path.exists(os.path.join(DIFFUSERS_CACHE, model, config)): + model = os.path.join(DIFFUSERS_CACHE, model) + elif not is_repo: + raise ValueError(f"{model} is not a valid repo, imported checkpoint, or path") + + if not is_repo: + return {"main": model} + + model_path = os.path.join(DIFFUSERS_CACHE, "--".join(["models", *model.split("/")])) + refs_path = os.path.join(model_path, "refs") + revisions = {} + if not os.path.isdir(refs_path): + return revisions + for ref in os.listdir(refs_path): + with open(os.path.join(refs_path, ref)) as f: + commit_hash = f.read() + snapshot_path = os.path.join(model_path, "snapshots", commit_hash) + if os.path.isdir(snapshot_path): + revisions[ref] = snapshot_path + return revisions + + +def cache_check(*, exists_callback=None): + def decorator(func): + def wrapper(cache, model, *args, **kwargs): + if model in cache: + r = cache[model] + if exists_callback is not None: + r = cache[model] = exists_callback(cache, model, r, *args, **kwargs) + else: + r = cache[model] = func(cache, model, *args, **kwargs) + return r + return wrapper + return decorator + + +@cache_check() +def _load_controlnet_model(cache, model, half_precision): + from diffusers import ControlNetModel + import torch + + if isinstance(model, str) and os.path.isfile(model): + model = Checkpoint(model, None) + + if isinstance(model, Checkpoint): + control_net_model = ControlNetModel.from_single_file( + model.path, + config_file=model.config.original_config if isinstance(model.config, ModelConfig) else model.config, + ) + if half_precision: + control_net_model.to(torch.float16) + return control_net_model + + revisions = revision_paths(model, "config.json") + if "main" not in revisions: + # controlnet models shouldn't have a fp16 revision to worry about + raise FileNotFoundError(f"{model} does not contain a main revision") + + fp16_weights = ["diffusion_pytorch_model.fp16.safetensors", "diffusion_pytorch_model.fp16.bin"] + fp32_weights = ["diffusion_pytorch_model.safetensors", "diffusion_pytorch_model.bin"] + if half_precision: + weights_names = fp16_weights + fp32_weights + else: + weights_names = fp32_weights + fp16_weights + + weights = next((name 
for name in weights_names if os.path.isfile(os.path.join(revisions["main"], name))), None) + if weights is None: + raise FileNotFoundError(f"Can't find appropriate weights in {model}") + half_weights = weights in fp16_weights + if not half_precision and half_weights: + logger.warning(f"Can't load fp32 weights for model {model}, attempting to load fp16 instead") + + return ControlNetModel.from_pretrained( + revisions["main"], + torch_dtype=torch.float16 if half_precision else None, + variant="fp16" if half_weights else None + ) + + +def _load_checkpoint(model_class, checkpoint, dtype, **kwargs): + from diffusers.pipelines.stable_diffusion.convert_from_ckpt import download_from_original_stable_diffusion_ckpt + + if isinstance(checkpoint, Checkpoint): + model = checkpoint.path + config = checkpoint.config + else: + model = checkpoint + config = ModelConfig.AUTO_DETECT + + if not os.path.exists(model): + raise FileNotFoundError(f"Can't locate {model}") + + config_file = config.original_config if isinstance(config, ModelConfig) else config + if hasattr(model_class, "from_single_file"): + return model_class.from_single_file( + model, + torch_dtype=dtype, + original_config_file=config_file, + **kwargs + ) + else: + # auto pipelines won't support from_single_file() https://github.com/huggingface/diffusers/issues/4367 + from_pipe = hasattr(model_class, "from_pipe") + if from_pipe: + pipeline_class = config.pipeline if isinstance(config, ModelConfig) else None + else: + pipeline_class = model_class + pipe = download_from_original_stable_diffusion_ckpt( + model, + from_safetensors=model.endswith(".safetensors"), + original_config_file=config_file, + pipeline_class=pipeline_class, + controlnet=kwargs.get("controlnet", None) + ) + if dtype is not None: + pipe.to(torch_dtype=dtype) + if from_pipe: + pipe = model_class.from_pipe(pipe, **kwargs) + return pipe + + +def _convert_pipe(cache, model, pipe, model_class, half_precision, scheduler, **kwargs): + if model_class.__name__ not in { + # some tasks are not supported by auto pipeline + 'DreamTexturesDepth2ImgPipeline', + 'StableDiffusionUpscalePipeline' + }: + pipe = model_class.from_pipe(pipe, **kwargs) + scheduler.create(pipe) + return pipe + + +@cache_check(exists_callback=_convert_pipe) +def _load_pipeline(cache, model, model_class, half_precision, scheduler, **kwargs): + import torch + + dtype = torch.float16 if half_precision else None + + if isinstance(model, Checkpoint) or os.path.splitext(model)[1] in [".ckpt", ".safetensors"]: + pipe = _load_checkpoint(model_class, model, dtype, **kwargs) + scheduler.create(pipe) + return pipe + + revisions = revision_paths(model) + strategies = [] + if "main" in revisions: + strategies.append({"model_path": revisions["main"], "variant": "fp16" if half_precision else None}) + if not half_precision: + # fp16 variant can automatically use fp32 files, but fp32 won't automatically use fp16 files + strategies.append({"model_path": revisions["main"], "variant": "fp16", "_warn_precision_fallback": True}) + if "fp16" in revisions: + strategies.append({"model_path": revisions["fp16"], "_warn_precision_fallback": not half_precision}) + + if len(strategies) == 0: + raise FileNotFoundError(f"{model} does not contain a main or fp16 revision") + + exc = None + for strat in strategies: + if strat.pop("_warn_precision_fallback", False): + logger.warning(f"Can't load fp32 weights for model {model}, attempting to load fp16 instead") + try: + pipe = model_class.from_pretrained(strat.pop("model_path"), torch_dtype=dtype, 
safety_checker=None, requires_safety_checker=False, **strat, **kwargs) + pipe.scheduler = scheduler.create(pipe) + return pipe + except Exception as e: + if exc is None: + exc = e + raise exc + + +def load_model(self, model_class, model, optimizations, scheduler, controlnet=None, sdxl_refiner_model=None, **kwargs): + import torch + from diffusers import StableDiffusionXLPipeline, AutoPipelineForImage2Image + from diffusers.pipelines.controlnet.multicontrolnet import MultiControlNetModel + + device = self.choose_device(optimizations) + half_precision = optimizations.can_use_half(device) + invalidation_properties = (device, half_precision, optimizations.cpu_offloading(device), controlnet is not None) + + # determine models to be removed from cache + if not hasattr(self, "_pipe") or self._pipe is None or self._pipe[0] != invalidation_properties: + model_cache = {} + self._pipe = (invalidation_properties, model_cache) + gc.collect() + torch.cuda.empty_cache() + else: + model_cache = self._pipe[1] + expected_models = {model} + if sdxl_refiner_model is not None: + expected_models.add(sdxl_refiner_model) + if controlnet is not None: + expected_models.update(name for name in controlnet) + clear_models = set(model_cache).difference(expected_models) + for name in clear_models: + model_cache.pop(name) + for pipe in model_cache.values(): + if isinstance(getattr(pipe, "controlnet", None), MultiControlNetModel): + # make sure no longer needed ControlNetModels are cleared + # the MultiControlNetModel container will be remade + pipe.controlnet = None + if len(clear_models) > 0: + gc.collect() + torch.cuda.empty_cache() + + # load or obtain models from cache + if controlnet is not None: + kwargs["controlnet"] = MultiControlNetModel([ + _load_controlnet_model(model_cache, name, half_precision) for name in controlnet + ]) + if not isinstance(scheduler, Scheduler): + try: + scheduler = Scheduler[scheduler] + except KeyError: + raise ValueError(f"scheduler expected one of {[s.name for s in Scheduler]}, got {repr(scheduler)}") + pipe = _load_pipeline(model_cache, model, model_class, half_precision, scheduler, **kwargs) + if isinstance(pipe, StableDiffusionXLPipeline) and sdxl_refiner_model is not None: + return pipe, _load_pipeline(model_cache, sdxl_refiner_model, AutoPipelineForImage2Image, half_precision, scheduler, **kwargs) + elif sdxl_refiner_model is not None: + if model_cache.pop(sdxl_refiner_model, None) is not None: + # refiner was previously used and left enabled but is not compatible with the now selected model + gc.collect() + torch.cuda.empty_cache() + # the caller expects a tuple since refiner was defined + return pipe, None + return pipe diff --git a/generator_process/actions/outpaint.py b/generator_process/actions/outpaint.py index 5e33060f..b0f3d95c 100644 --- a/generator_process/actions/outpaint.py +++ b/generator_process/actions/outpaint.py @@ -1,7 +1,8 @@ from typing import Tuple, Generator from numpy.typing import NDArray import numpy as np -from .prompt_to_image import ImageGenerationResult, StepPreviewMode +from .prompt_to_image import ImageGenerationResult +from ..future import Future def outpaint( self, @@ -17,6 +18,9 @@ def outpaint( ) -> Generator[ImageGenerationResult, None, None]: from PIL import Image, ImageOps + future = Future() + yield future + init_image = Image.fromarray(image) width = width or 512 height = height or 512 @@ -54,7 +58,7 @@ def outpaint( ) ) - def process(step: ImageGenerationResult): + def process(_, step: ImageGenerationResult): for i, result_image in
enumerate(step.images): image = outpaint_bounds.copy() image.paste( @@ -62,12 +66,19 @@ def process(step: ImageGenerationResult): offset_origin ) step.images[i] = np.asarray(ImageOps.flip(image).convert('RGBA'), dtype=np.float32) / 255. - return step + future.add_response(step) - for step in self.inpaint( + inpaint_generator = self.inpaint( image=np.array(inpaint_tile), width=width, height=height, **kwargs - ): - yield process(step) \ No newline at end of file + ) + inpaint_future = next(inpaint_generator) + inpaint_future.check_cancelled = future.check_cancelled + inpaint_future.add_response_callback(process) + inpaint_future.add_exception_callback(future.set_exception) + for _ in inpaint_generator: + pass + + future.set_done() diff --git a/generator_process/actions/prompt_to_image.py b/generator_process/actions/prompt_to_image.py index 87fce1f9..d1405acd 100644 --- a/generator_process/actions/prompt_to_image.py +++ b/generator_process/actions/prompt_to_image.py @@ -1,454 +1,20 @@ -from typing import Annotated, Union, _AnnotatedAlias, Generator, Callable, List, Optional, Any -import enum -import functools -import math -import os -import sys -from dataclasses import dataclass +from typing import Generator from contextlib import nullcontext -from numpy.typing import NDArray import numpy as np import random -from .detect_seamless import SeamlessAxes -from ..models.upscale_tiler import tiled_decode_latents - -from ..models import Pipeline - -class CachedPipeline: - """A pipeline that has been cached for subsequent runs.""" - - pipeline: Any - """The diffusers pipeline to re-use""" - - invalidation_properties: tuple - """Values that, when changed, will invalid this cached pipeline""" - - snapshot_folder: str - """The snapshot folder containing the model""" - - def __init__(self, pipeline: Any, invalidation_properties: tuple, snapshot_folder: str): - self.pipeline = pipeline - self.invalidation_properties = invalidation_properties - self.snapshot_folder = snapshot_folder - - def is_valid(self, properties: tuple): - return properties == self.invalidation_properties - -def load_pipe(self, action, generator_pipeline, model, optimizations, scheduler, device, **kwargs): - """ - Use a cached pipeline, or create the pipeline class and cache it. - - The cached pipeline will be invalidated if the model or use_cpu_offload options change. - """ - import torch - import gc - - invalidation_properties = ( - action, model, device, - optimizations.can_use_cpu_offload(device), - optimizations.can_use("half_precision", device), - ) - cached_pipe: CachedPipeline = self._cached_pipe if hasattr(self, "_cached_pipe") else None - if cached_pipe is not None and cached_pipe.is_valid(invalidation_properties): - pipe = cached_pipe.pipeline - else: - # Release the cached pipe before loading the new one. 
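The `load_model()` action added above replaces this per-action `load_pipe` cache with a single model cache keyed on an "invalidation properties" tuple (device, half precision, CPU offloading, ControlNet use): when any of those change, every cached pipeline is discarded and reloaded. A minimal sketch of that caching pattern only, with hypothetical names (`_ModelCache`, `load`) that are not part of the add-on:

```python
# Sketch of an invalidation-keyed cache in the style of load_model() above.
# _ModelCache and load() are illustrative stand-ins, not the add-on's API.
from typing import Any, Callable

class _ModelCache:
    def __init__(self):
        self._key: tuple | None = None      # e.g. (device, half_precision, cpu_offloading, has_controlnet)
        self._entries: dict[Any, Any] = {}  # model id -> loaded pipeline

    def load(self, key: tuple, model: Any, loader: Callable[[Any], Any]) -> Any:
        if key != self._key:
            # device/precision/offloading changed: every cached pipeline is stale
            self._entries.clear()
            self._key = key
        if model not in self._entries:
            self._entries[model] = loader(model)
        return self._entries[model]

cache = _ModelCache()
key = ("cuda", True, "off", False)
pipe = cache.load(key, "stabilityai/stable-diffusion-2-1", lambda m: f"<pipeline for {m}>")
```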
- if cached_pipe is not None: - del self._cached_pipe - del cached_pipe - gc.collect() - - revision = "fp16" if optimizations.can_use_half(device) else None - snapshot_folder = model_snapshot_folder(model, revision) - pipe = generator_pipeline.from_pretrained( - snapshot_folder, - revision=revision, - torch_dtype=torch.float16 if optimizations.can_use_half(device) else torch.float32, - **kwargs - ) - if optimizations.can_use_cpu_offload(device) == "off": - pipe = pipe.to(device) - setattr(self, "_cached_pipe", CachedPipeline(pipe, invalidation_properties, snapshot_folder)) - cached_pipe = self._cached_pipe - if scheduler is not None: - if 'scheduler' in os.listdir(cached_pipe.snapshot_folder): - pipe.scheduler = scheduler.create(pipe, { - 'model_path': cached_pipe.snapshot_folder, - 'subfolder': 'scheduler', - }) - else: - pipe.scheduler = scheduler.create(pipe, None) - return pipe - -class Scheduler(enum.Enum): - DDIM = "DDIM" - DDPM = "DDPM" - DEIS_MULTISTEP = "DEIS Multistep" - DPM_SOLVER_MULTISTEP = "DPM Solver Multistep" - DPM_SOLVER_SINGLESTEP = "DPM Solver Singlestep" - EULER_DISCRETE = "Euler Discrete" - EULER_ANCESTRAL_DISCRETE = "Euler Ancestral Discrete" - HEUN_DISCRETE = "Heun Discrete" - KDPM2_DISCRETE = "KDPM2 Discrete" # Non-functional on mps - KDPM2_ANCESTRAL_DISCRETE = "KDPM2 Ancestral Discrete" - LMS_DISCRETE = "LMS Discrete" - PNDM = "PNDM" - - def create(self, pipeline, pretrained): - import diffusers - def scheduler_class(): - match self: - case Scheduler.DDIM: - return diffusers.schedulers.DDIMScheduler - case Scheduler.DDPM: - return diffusers.schedulers.DDPMScheduler - case Scheduler.DEIS_MULTISTEP: - return diffusers.schedulers.DEISMultistepScheduler - case Scheduler.DPM_SOLVER_MULTISTEP: - return diffusers.schedulers.DPMSolverMultistepScheduler - case Scheduler.DPM_SOLVER_SINGLESTEP: - return diffusers.schedulers.DPMSolverSinglestepScheduler - case Scheduler.EULER_DISCRETE: - return diffusers.schedulers.EulerDiscreteScheduler - case Scheduler.EULER_ANCESTRAL_DISCRETE: - return diffusers.schedulers.EulerAncestralDiscreteScheduler - case Scheduler.HEUN_DISCRETE: - return diffusers.schedulers.HeunDiscreteScheduler - case Scheduler.KDPM2_DISCRETE: - return diffusers.schedulers.KDPM2DiscreteScheduler - case Scheduler.KDPM2_ANCESTRAL_DISCRETE: - return diffusers.schedulers.KDPM2AncestralDiscreteScheduler - case Scheduler.LMS_DISCRETE: - return diffusers.schedulers.LMSDiscreteScheduler - case Scheduler.PNDM: - return diffusers.schedulers.PNDMScheduler - if pretrained is not None: - return scheduler_class().from_pretrained(pretrained['model_path'], subfolder=pretrained['subfolder']) - else: - return scheduler_class().from_config(pipeline.scheduler.config) - - def stability_sdk(self): - import stability_sdk.interfaces.gooseai.generation.generation_pb2 - match self: - case Scheduler.LMS_DISCRETE: - return stability_sdk.interfaces.gooseai.generation.generation_pb2.SAMPLER_K_LMS - case Scheduler.DDIM: - return stability_sdk.interfaces.gooseai.generation.generation_pb2.SAMPLER_DDIM - case Scheduler.DDPM: - return stability_sdk.interfaces.gooseai.generation.generation_pb2.SAMPLER_DDPM - case Scheduler.EULER_DISCRETE: - return stability_sdk.interfaces.gooseai.generation.generation_pb2.SAMPLER_K_EULER - case Scheduler.EULER_ANCESTRAL_DISCRETE: - return stability_sdk.interfaces.gooseai.generation.generation_pb2.SAMPLER_K_EULER_ANCESTRAL - case _: - raise ValueError(f"{self} cannot be used with DreamStudio.") - -@dataclass(eq=True) -class Optimizations: - attention_slicing: bool = 
True - attention_slice_size: Union[str, int] = "auto" - cudnn_benchmark: Annotated[bool, "cuda"] = False - tf32: Annotated[bool, "cuda"] = False - amp: Annotated[bool, "cuda"] = False - half_precision: Annotated[bool, {"cuda", "dml"}] = True - cpu_offload: Annotated[str, {"cuda", "dml"}] = "off" - channels_last_memory_format: bool = False - sdp_attention: bool = True - batch_size: int = 1 - vae_slicing: bool = True - vae_tiling: str = "off" - vae_tile_size: int = 512 - vae_tile_blend: int = 64 - cfg_end: float = 1.0 - - cpu_only: bool = False - - @staticmethod - def infer_device() -> str: - if sys.platform == "darwin": - return "mps" - elif Pipeline.directml_available(): - return "dml" - else: - return "cuda" - - @classmethod - def device_supports(cls, property, device) -> bool: - annotation = cls.__annotations__.get(property, None) - if isinstance(annotation, _AnnotatedAlias): - opt_dev = annotation.__metadata__[0] - if isinstance(opt_dev, str): - return opt_dev == device - return device in opt_dev - return annotation is not None - - def can_use(self, property, device) -> bool: - return self.device_supports(property, device) and getattr(self, property) - - def can_use_half(self, device): - if self.half_precision and device == "cuda": - import torch - name = torch.cuda.get_device_name() - return not ("GTX 1650" in name or "GTX 1660" in name) - return self.can_use("half_precision", device) - - def can_use_cpu_offload(self, device): - return self.cpu_offload if self.device_supports("cpu_offload", device) else "off" - - def apply(self, pipeline, device): - """ - Apply the optimizations to a diffusers pipeline. - - All exceptions are ignored to make this more general purpose across different pipelines. - """ - import torch - - torch.backends.cudnn.benchmark = self.can_use("cudnn_benchmark", device) - torch.backends.cuda.matmul.allow_tf32 = self.can_use("tf32", device) - - try: - if self.can_use("sdp_attention", device): - from diffusers.models.cross_attention import AttnProcessor2_0 - pipeline.unet.set_attn_processor(AttnProcessor2_0()) - elif self.can_use("attention_slicing", device): - pipeline.enable_attention_slicing(self.attention_slice_size) - else: - pipeline.disable_attention_slicing() # will also disable AttnProcessor2_0 - except: pass - - try: - if pipeline.device != pipeline._execution_device: - pass # pipeline is already offloaded, offloading again can cause `pipeline._execution_device` to be incorrect - elif self.can_use_cpu_offload(device) == "model": - # adapted from diffusers.StableDiffusionPipeline.enable_model_cpu_offload() to allow DirectML device and unimplemented pipelines - from accelerate import cpu_offload_with_hook - - hook = None - models = [pipeline.text_encoder, pipeline.unet, pipeline.vae] - if hasattr(pipeline, "controlnet"): - models.append(pipeline.controlnet) - for cpu_offloaded_model in models: - _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) - - # FIXME: due to the safety checker not running it prevents the VAE from being offloaded, uncomment when safety checker is enabled - # if pipeline.safety_checker is not None: - # _, hook = cpu_offload_with_hook(pipeline.safety_checker, device, prev_module_hook=hook) - - # We'll offload the last model manually. 
- pipeline.final_offload_hook = hook - elif self.can_use_cpu_offload(device) == "submodule": - # adapted from diffusers.StableDiffusionPipeline.enable_sequential_cpu_offload() to allow DirectML device and unimplemented pipelines - from accelerate import cpu_offload - - models = [pipeline.text_encoder, pipeline.unet, pipeline.vae] - if hasattr(pipeline, "controlnet"): - models.append(pipeline.controlnet) - for cpu_offloaded_model in models: - cpu_offload(cpu_offloaded_model, device) - - if pipeline.safety_checker is not None: - cpu_offload(pipeline.safety_checker.vision_model, device, offload_buffers=True) - except: pass - - try: - if self.can_use("channels_last_memory_format", device): - pipeline.unet.to(memory_format=torch.channels_last) - else: - pipeline.unet.to(memory_format=torch.contiguous_format) - except: pass - - try: - if self.can_use("vae_slicing", device): - # Not many pipelines implement the enable_vae_slicing()/disable_vae_slicing() - # methods but all they do is forward their call to the vae anyway. - pipeline.vae.enable_slicing() - else: - pipeline.vae.disable_slicing() - except: pass - - try: - if self.vae_tiling != "off": - if not isinstance(pipeline.decode_latents, functools.partial): - pipeline.decode_latents = functools.partial(tiled_decode_latents.__get__(pipeline), pre_patch=pipeline.decode_latents) - pipeline.decode_latents.keywords['optimizations'] = self - elif self.vae_tiling == "off" and isinstance(pipeline.decode_latents, functools.partial): - pipeline.decode_latents = pipeline.decode_latents.keywords["pre_patch"] - except: pass - - from .. import directml_patches - if device == "dml": - directml_patches.enable(pipeline) - else: - directml_patches.disable(pipeline) - - return pipeline - -class StepPreviewMode(enum.Enum): - NONE = "None" - FAST = "Fast" - FAST_BATCH = "Fast (Batch Tiled)" - ACCURATE = "Accurate" - ACCURATE_BATCH = "Accurate (Batch Tiled)" - -@dataclass -class ImageGenerationResult: - images: List[NDArray] - seeds: List[int] - step: int - final: bool - - @staticmethod - def step_preview(pipe, mode, width, height, latents, generator, iteration): - from PIL import Image, ImageOps - seeds = [gen.initial_seed() for gen in generator] if isinstance(generator, list) else [generator.initial_seed()] - match mode: - case StepPreviewMode.FAST: - return ImageGenerationResult( - [np.asarray(ImageOps.flip(Image.fromarray(approximate_decoded_latents(latents[-1:]))).resize((width, height), Image.Resampling.NEAREST).convert('RGBA'), dtype=np.float32) / 255.], - seeds[-1:], - iteration, - False - ) - case StepPreviewMode.FAST_BATCH: - return ImageGenerationResult( - [ - np.asarray(ImageOps.flip(Image.fromarray(approximate_decoded_latents(latents[i:i + 1]))).resize((width, height), Image.Resampling.NEAREST).convert('RGBA'), - dtype=np.float32) / 255. - for i in range(latents.size(0)) - ], - seeds, - iteration, - False - ) - case StepPreviewMode.ACCURATE: - return ImageGenerationResult( - [np.asarray(ImageOps.flip(pipe.numpy_to_pil(pipe.decode_latents(latents[-1:]))[0]).convert('RGBA'), - dtype=np.float32) / 255.], - seeds[-1:], - iteration, - False - ) - case StepPreviewMode.ACCURATE_BATCH: - return ImageGenerationResult( - [ - np.asarray(ImageOps.flip(image).convert('RGBA'), dtype=np.float32) / 255. 
- for image in pipe.numpy_to_pil(pipe.decode_latents(latents)) - ], - seeds, - iteration, - False - ) - return ImageGenerationResult( - [], - seeds, - iteration, - False - ) - - def tile_images(self): - images = self.images - if len(images) == 0: - return None - elif len(images) == 1: - return images[0] - width = images[0].shape[1] - height = images[0].shape[0] - tiles_x = math.ceil(math.sqrt(len(images))) - tiles_y = math.ceil(len(images) / tiles_x) - tiles = np.zeros((height * tiles_y, width * tiles_x, 4), dtype=np.float32) - bottom_offset = (tiles_x*tiles_y-len(images)) * width // 2 - for i, image in enumerate(images): - x = i % tiles_x - y = tiles_y - 1 - int((i - x) / tiles_x) - x *= width - y *= height - if y == 0: - x += bottom_offset - tiles[y: y + height, x: x + width] = image - return tiles - -def choose_device(self) -> str: - """ - Automatically select which PyTorch device to use. - """ - import torch - if torch.cuda.is_available(): - return "cuda" - elif torch.backends.mps.is_available(): - return "mps" - if Pipeline.directml_available(): - import torch_directml - if torch_directml.is_available(): - torch.utils.rename_privateuse1_backend("dml") - return "dml" - return "cpu" - -def approximate_decoded_latents(latents): - """ - Approximate the decoded latents without using the VAE. - """ - import torch - # origingally adapted from code by @erucipe and @keturn here: - # https://discuss.huggingface.co/t/decoding-latents-to-rgb-without-upscaling/23204/7 - - # these updated numbers for v1.5 are from @torridgristle - v1_5_latent_rgb_factors = torch.tensor([ - # R G B - [ 0.3444, 0.1385, 0.0670], # L1 - [ 0.1247, 0.4027, 0.1494], # L2 - [-0.3192, 0.2513, 0.2103], # L3 - [-0.1307, -0.1874, -0.7445] # L4 - ], dtype=latents.dtype, device=latents.device) - - latent_image = latents[0].permute(1, 2, 0) @ v1_5_latent_rgb_factors - latents_ubyte = (((latent_image + 1) / 2) - .clamp(0, 1) # change scale from -1..1 to 0..1 - .mul(0xFF) # to 0..255 - .byte()).cpu() - - return latents_ubyte.numpy() - -def model_snapshot_folder(model, preferred_revision: str | None = None): - """ Try to find the preferred revision, but fallback to another revision if necessary. 
""" - import diffusers - storage_folder = os.path.join(diffusers.utils.DIFFUSERS_CACHE, model) - if not os.path.exists(os.path.join(storage_folder, "refs")): - storage_folder = os.path.join(diffusers.utils.hub_utils.old_diffusers_cache, model) - if os.path.exists(os.path.join(storage_folder, 'model_index.json')): # converted model - snapshot_folder = storage_folder - else: # hub model - revisions = {} - for revision in os.listdir(os.path.join(storage_folder, "refs")): - ref_path = os.path.join(storage_folder, "refs", revision) - with open(ref_path) as f: - commit_hash = f.read() - - snapshot_folder = os.path.join(storage_folder, "snapshots", commit_hash) - if len(os.listdir(snapshot_folder)) > 1: - revisions[revision] = snapshot_folder - - if len(revisions) == 0: - return None - elif preferred_revision in revisions: - revision = preferred_revision - elif preferred_revision in [None, "fp16"] and "main" in revisions: - revision = "main" - elif preferred_revision in [None, "main"] and "fp16" in revisions: - revision = "fp16" - else: - revision = next(iter(revisions.keys())) - snapshot_folder = revisions[revision] - - return snapshot_folder +from ...api.models.seamless_axes import SeamlessAxes +from ...api.models.step_preview_mode import StepPreviewMode +from ..models import Checkpoint, Optimizations, Scheduler +from ..models.image_generation_result import ImageGenerationResult +from ..future import Future def prompt_to_image( self, - pipeline: Pipeline, - model: str, + model: str | Checkpoint, - scheduler: Scheduler, + scheduler: str | Scheduler, optimizations: Optimizations, @@ -471,201 +37,97 @@ def prompt_to_image( # Stability SDK key: str | None = None, - **kwargs -) -> Generator[ImageGenerationResult, None, None]: - match pipeline: - case Pipeline.STABLE_DIFFUSION: - import diffusers - import torch - from PIL import Image, ImageOps - - # Mostly copied from `diffusers.StableDiffusionPipeline`, with slight modifications to yield the latents at each step. - class GeneratorPipeline(diffusers.StableDiffusionPipeline): - @torch.no_grad() - def __call__( - self, - prompt: Union[str, List[str]], - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 50, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, - callback_steps: Optional[int] = 1, - **kwargs, - ): - # 0. Default height and width to unet - height = height or self.unet.config.sample_size * self.vae_scale_factor - width = width or self.unet.config.sample_size * self.vae_scale_factor - - # 1. Check inputs. Raise error if not correct - self.check_inputs(prompt, height, width, callback_steps) - - # 2. Define call parameters - batch_size = 1 if isinstance(prompt, str) else len(prompt) - device = self._execution_device - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - # 3. 
Encode input prompt - text_embeddings = self._encode_prompt( - prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt - ) - - # 4. Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps, device=device) - timesteps = self.scheduler.timesteps - - # 5. Prepare latent variables - num_channels_latents = self.unet.in_channels - latents = self.prepare_latents( - batch_size * num_images_per_prompt, - num_channels_latents, - height, - width, - text_embeddings.dtype, - device, - generator, - latents, - ) - - # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 7. Denoising loop - for i, t in enumerate(self.progress_bar(timesteps)): - # NOTE: Modified to support disabling CFG - if do_classifier_free_guidance and (i / len(timesteps)) >= kwargs['cfg_end']: - do_classifier_free_guidance = False - text_embeddings = text_embeddings[text_embeddings.size(0) // 2:] - - # expand the latents if we are doing classifier free guidance - latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - # predict the noise residual - noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + sdxl_refiner_model: str | Checkpoint | None = None, - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample - - # NOTE: Modified to yield the latents instead of calling a callback. - yield ImageGenerationResult.step_preview(self, kwargs['step_preview_mode'], width, height, latents, generator, i) - - # 8. Post-processing - image = self.decode_latents(latents) - - # TODO: Add UI to enable this. - # 9. Run safety checker - # image, has_nsfw_concept = self.run_safety_checker(image, device, text_embeddings.dtype) - - # Offload last model to CPU - if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: - self.final_offload_hook.offload() - - # NOTE: Modified to yield the decoded image as a numpy array. - yield ImageGenerationResult( - [np.asarray(ImageOps.flip(image).convert('RGBA'), dtype=np.float32) / 255. 
- for i, image in enumerate(self.numpy_to_pil(image))], - [gen.initial_seed() for gen in generator] if isinstance(generator, list) else [generator.initial_seed()], - num_inference_steps, - True - ) - - if optimizations.cpu_only: - device = "cpu" - else: - device = self.choose_device() - - # StableDiffusionPipeline w/ caching - pipe = load_pipe(self, "prompt", GeneratorPipeline, model, optimizations, scheduler, device) + **kwargs +) -> Generator[Future, None, None]: + future = Future() + yield future - # Optimizations - pipe = optimizations.apply(pipe, device) + import diffusers + import torch + from PIL import Image, ImageOps - # RNG - batch_size = len(prompt) if isinstance(prompt, list) else 1 - generator = [] - for _ in range(batch_size): - gen = torch.Generator(device="cpu" if device in ("mps", "dml") else device) # MPS and DML do not support the `Generator` API - generator.append(gen.manual_seed(random.randrange(0, np.iinfo(np.uint32).max) if seed is None else seed)) - if batch_size == 1: - # Some schedulers don't handle a list of generators: https://github.com/huggingface/diffusers/issues/1909 - generator = generator[0] - - # Seamless - _configure_model_padding(pipe.unet, seamless_axes) - _configure_model_padding(pipe.vae, seamless_axes) + device = self.choose_device(optimizations) - # Inference - with torch.inference_mode() if device not in ('mps', "dml") else nullcontext(): - yield from pipe( + # Stable Diffusion pipeline w/ caching + if device == "cuda" and (optimizations.cpu_offloading(device) or torch.cuda.mem_get_info()[1] > 20 * 1024**3 * (1 if optimizations.can_use_half(device) else 2)): + pipe, refiner = self.load_model(diffusers.AutoPipelineForText2Image, model, optimizations, scheduler, sdxl_refiner_model=sdxl_refiner_model) + else: + pipe = self.load_model(diffusers.AutoPipelineForText2Image, model, optimizations, scheduler) + refiner = None + height = height or pipe.unet.config.sample_size * pipe.vae_scale_factor + width = width or pipe.unet.config.sample_size * pipe.vae_scale_factor + + # Optimizations + pipe = optimizations.apply(pipe, device) + + # RNG + batch_size = len(prompt) if isinstance(prompt, list) else 1 + generator = [] + for _ in range(batch_size): + gen = torch.Generator(device="cpu" if device in ("mps", "dml") else device) # MPS and DML do not support the `Generator` API + generator.append(gen.manual_seed(random.randrange(0, np.iinfo(np.uint32).max) if seed is None else seed)) + if batch_size == 1: + # Some schedulers don't handle a list of generators: https://github.com/huggingface/diffusers/issues/1909 + generator = generator[0] + + # Seamless + _configure_model_padding(pipe.unet, seamless_axes) + _configure_model_padding(pipe.vae, seamless_axes) + + # Inference + with torch.inference_mode() if device not in ('mps', "dml") else nullcontext(): + is_sdxl = isinstance(pipe, diffusers.StableDiffusionXLPipeline) + output_type = "latent" if is_sdxl and sdxl_refiner_model is not None else "pil" + def callback(step, timestep, latents): + if future.check_cancelled(): + raise InterruptedError() + future.add_response(ImageGenerationResult.step_preview(self, step_preview_mode, width, height, latents, generator, step)) + try: + result = pipe( + prompt=prompt, + height=height, + width=width, + num_inference_steps=steps, + guidance_scale=cfg_scale, + negative_prompt=negative_prompt if use_negative_prompt else None, + num_images_per_prompt=1, + eta=0.0, + generator=generator, + latents=None, + output_type=output_type, + return_dict=True, + 
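+                        # `callback` fires on every scheduler step (`callback_steps=1` below): the closure defined above pushes a step preview through `future` and raises InterruptedError once the frontend has requested cancellation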
callback=callback, + callback_steps=1, + #cfg_end=optimizations.cfg_end + ) + if is_sdxl and sdxl_refiner_model is not None and refiner is None: + # allow load_model() to garbage collect pipe + pipe = None + refiner = self.load_model(diffusers.AutoPipelineForImage2Image, sdxl_refiner_model, optimizations, scheduler) + if refiner is not None: + refiner = optimizations.apply(refiner, device) + result = refiner( prompt=prompt, - height=height, - width=width, - num_inference_steps=steps, - guidance_scale=cfg_scale, - negative_prompt=negative_prompt if use_negative_prompt else None, - num_images_per_prompt=1, - eta=0.0, - generator=generator, - latents=None, - output_type="pil", - return_dict=True, - callback=None, + negative_prompt=[""], + callback=callback, callback_steps=1, - step_preview_mode=step_preview_mode, - cfg_end=optimizations.cfg_end + num_inference_steps=steps, + image=result.images ) - case Pipeline.STABILITY_SDK: - import stability_sdk.client - import stability_sdk.interfaces.gooseai.generation.generation_pb2 - from PIL import Image, ImageOps - import io - - if key is None: - raise ValueError("DreamStudio key not provided. Enter your key in the add-on preferences.") - client = stability_sdk.client.StabilityInference(key=key, engine=model) - - if seed is None: - seed = random.randrange(0, np.iinfo(np.uint32).max) - - answers = client.generate( - prompt=prompt, - width=width or 512, - height=height or 512, - cfg_scale=cfg_scale, - sampler=scheduler.stability_sdk(), - steps=steps, - seed=seed - ) - for answer in answers: - for artifact in answer.artifacts: - if artifact.finish_reason == stability_sdk.interfaces.gooseai.generation.generation_pb2.FILTER: - raise ValueError("Your request activated DreamStudio's safety filter. Please modify your prompt and try again.") - if artifact.type == stability_sdk.interfaces.gooseai.generation.generation_pb2.ARTIFACT_IMAGE: - image = Image.open(io.BytesIO(artifact.binary)) - yield ImageGenerationResult( - [np.asarray(ImageOps.flip(image).convert('RGBA'), dtype=np.float32) / 255.], - [seed], - steps, - True - ) - case _: - raise Exception(f"Unsupported pipeline {pipeline}.") + + future.add_response(ImageGenerationResult( + [np.asarray(ImageOps.flip(image).convert('RGBA'), dtype=np.float32) / 255. 
+ for image in result.images], + [gen.initial_seed() for gen in generator] if isinstance(generator, list) else [generator.initial_seed()], + steps, + True + )) + except InterruptedError: + pass + + future.set_done() def _conv_forward_asymmetric(self, input, weight, bias): import torch.nn as nn diff --git a/generator_process/actions/upscale.py b/generator_process/actions/upscale.py index cf034627..b70d8ca8 100644 --- a/generator_process/actions/upscale.py +++ b/generator_process/actions/upscale.py @@ -1,22 +1,17 @@ import numpy as np from .prompt_to_image import Optimizations, Scheduler, StepPreviewMode, _configure_model_padding -from .detect_seamless import SeamlessAxes +from ...api.models.seamless_axes import SeamlessAxes import random -from dataclasses import dataclass from numpy.typing import NDArray -from ..models.upscale_tiler import UpscaleTiler - -@dataclass -class ImageUpscaleResult: - image: NDArray | None - tile: int - total: int - final: bool - +from ..models import Checkpoint, Optimizations, Scheduler, UpscaleTiler, ImageGenerationResult +from ..future import Future +from contextlib import nullcontext def upscale( self, image: NDArray, + + model: str | Checkpoint, prompt: str, steps: int, @@ -34,69 +29,74 @@ def upscale( **kwargs ): + future = Future() + yield future + from PIL import Image, ImageOps import torch import diffusers - if optimizations.cpu_only: - device = "cpu" - else: - device = self.choose_device() + device = self.choose_device(optimizations) - pipe = diffusers.StableDiffusionUpscalePipeline.from_pretrained( - "stabilityai/stable-diffusion-x4-upscaler", - revision="fp16" if optimizations.can_use_half(device) else None, - torch_dtype=torch.float16 if optimizations.can_use_half(device) else torch.float32 - ) - pipe.scheduler = scheduler.create(pipe, None) - # vae would automatically be made float32 within the pipeline, but it fails to convert after offloading is enabled - pipe.vae.to(dtype=torch.float32) - if optimizations.can_use_cpu_offload(device) == "off": - pipe = pipe.to(device) - pipe = optimizations.apply(pipe, device) + pipe = self.load_model(diffusers.StableDiffusionUpscalePipeline, model, optimizations, scheduler) - generator = torch.Generator(device="cpu" if device in ("mps", "dml") else device) # MPS and DML do not support the `Generator` API - if seed is None: - seed = random.randrange(0, np.iinfo(np.uint32).max) + # Optimizations + pipe = optimizations.apply(pipe, device) - if image.shape[2] == 4: - image = image[:, :, :3] + # RNG + batch_size = len(prompt) if isinstance(prompt, list) else 1 + generator = [] + for _ in range(batch_size): + gen = torch.Generator(device="cpu" if device in ("mps", "dml") else device) # MPS and DML do not support the `Generator` API + generator.append(gen.manual_seed(random.randrange(0, np.iinfo(np.uint32).max) if seed is None else seed)) + if batch_size == 1: + # Some schedulers don't handle a list of generators: https://github.com/huggingface/diffusers/issues/1909 + generator = generator[0] + + # Seamless tiler = UpscaleTiler(image, 4, tile_size, blend, seamless_axes) _configure_model_padding(pipe.unet, seamless_axes & ~tiler.seamless_axes) _configure_model_padding(pipe.vae, seamless_axes & ~tiler.seamless_axes) + + if image.shape[2] == 4: + image = image[:, :, :3] for i in range(0, len(tiler), optimizations.batch_size): + if future.check_cancelled(): + future.set_done() + return batch_size = min(len(tiler)-i, optimizations.batch_size) ids = list(range(i, i+batch_size)) - low_res_tiles = 
[Image.fromarray(tiler[id]) for id in ids] - high_res_tiles = pipe( - prompt=[prompt] * batch_size, - image=low_res_tiles, - num_inference_steps=steps, - generator=generator.manual_seed(seed), - guidance_scale=cfg_scale, - ).images + low_res_tiles = [Image.fromarray(tiler[id]).convert('RGB') for id in ids] + # Inference + with torch.inference_mode() if device not in ('mps', "dml") else nullcontext(): + high_res_tiles = pipe( + prompt=[prompt[0] if isinstance(prompt, list) else prompt] * batch_size, + image=low_res_tiles, + num_inference_steps=steps, + generator=generator, + guidance_scale=cfg_scale, + ).images - # not implemented in diffusers.StableDiffusionUpscalePipeline - # Offload last model to CPU - if hasattr(pipe, "final_offload_hook") and pipe.final_offload_hook is not None: - pipe.final_offload_hook.offload() for id, tile in zip(ids, high_res_tiles): - tiler[id] = np.array(tile) + tiler[id] = np.array(tile.convert('RGBA')) step = None if step_preview_mode != StepPreviewMode.NONE: step = Image.fromarray(tiler.combined().astype(np.uint8)) - yield ImageUpscaleResult( - (np.asarray(ImageOps.flip(step).convert('RGBA'), dtype=np.float32) / 255.) if step is not None else None, - i + batch_size, - len(tiler), - (i + batch_size) == len(tiler) - ) + future.add_response(ImageGenerationResult( + [(np.asarray(ImageOps.flip(step).convert('RGBA'), dtype=np.float32) / 255.)], + [seed], + i + batch_size, + (i + batch_size) == len(tiler), + total=len(tiler) + )) if step_preview_mode == StepPreviewMode.NONE: final = Image.fromarray(tiler.combined().astype(np.uint8)) - yield ImageUpscaleResult( - np.asarray(ImageOps.flip(final).convert('RGBA'), dtype=np.float32) / 255., - len(tiler), + future.add_response(ImageGenerationResult( + [np.asarray(ImageOps.flip(final).convert('RGBA'), dtype=np.float32) / 255.], + [seed], len(tiler), - True - ) + True, + total=len(tiler) + )) + future.set_done() diff --git a/generator_process/actor.py b/generator_process/actor.py index 3fabfc8f..4cf32b0e 100644 --- a/generator_process/actor.py +++ b/generator_process/actor.py @@ -6,6 +6,7 @@ from typing import Type, TypeVar, Generator import site import sys +import os from ..absolute_path import absolute_path from .future import Future @@ -13,6 +14,12 @@ def _load_dependencies(): site.addsitedir(absolute_path(".python_dependencies")) deps = sys.path.pop(-1) sys.path.insert(0, deps) + if sys.platform == 'win32': + # fix for ImportError: DLL load failed while importing cv2: The specified module could not be found. + # cv2 needs python3.dll, which is stored in Blender's root directory instead of its python directory. 
+ python3_path = os.path.abspath(os.path.join(sys.executable, "..\\..\\..\\..\\python3.dll")) + if os.path.exists(python3_path): + os.add_dll_directory(os.path.dirname(python3_path)) if current_process().name == "__actor__": _load_dependencies() @@ -112,6 +119,7 @@ def start(self: T) -> T: self.process = get_context('spawn').Process(target=_start_backend, args=(self.__class__, self._message_queue, self._response_queue), name="__actor__", daemon=True) self.process.start() case ActorContext.BACKEND: + os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" self._backend_loop() return self @@ -163,6 +171,12 @@ def _receive(self, message: Message): if extra_message == Message.CANCEL: break if isinstance(res, Future): + def check_cancelled(): + try: + return self._message_queue.get(block=False) == Message.CANCEL + except: + return False + res.check_cancelled = check_cancelled res.add_response_callback(lambda _, res: self._response_queue.put(res)) res.add_exception_callback(lambda _, e: self._response_queue.put(RuntimeError(repr(e)))) res.add_done_callback(lambda _: None) diff --git a/generator_process/future.py b/generator_process/future.py index fe3f563b..d7e302f2 100644 --- a/generator_process/future.py +++ b/generator_process/future.py @@ -15,6 +15,7 @@ class Future: _done_event: threading.Event done: bool = False cancelled: bool = False + check_cancelled: Callable[[], bool] = lambda: False call_done_on_exception: bool = True def __init__(self): diff --git a/generator_process/models/__init__.py b/generator_process/models/__init__.py index 36631dd9..ee9622a9 100644 --- a/generator_process/models/__init__.py +++ b/generator_process/models/__init__.py @@ -1,2 +1,7 @@ -from .pipeline import * -from .fix_it_error import * \ No newline at end of file +from .checkpoint import * +from .image_generation_result import * +from .model_config import * +from .model_type import * +from .optimizations import * +from .scheduler import * +from .upscale_tiler import * \ No newline at end of file diff --git a/generator_process/models/checkpoint.py b/generator_process/models/checkpoint.py new file mode 100644 index 00000000..d8fe1a4e --- /dev/null +++ b/generator_process/models/checkpoint.py @@ -0,0 +1,9 @@ +from dataclasses import dataclass + +from .model_config import ModelConfig + + +@dataclass(frozen=True) +class Checkpoint: + path: str + config: ModelConfig | str | None diff --git a/generator_process/models/fix_it_error.py b/generator_process/models/fix_it_error.py deleted file mode 100644 index 5a19ee1f..00000000 --- a/generator_process/models/fix_it_error.py +++ /dev/null @@ -1,14 +0,0 @@ -from typing import Callable, Any - -class FixItError(Exception): - """An exception with a solution. - - Call the `draw` method to render the UI elements responsible for resolving this error. 
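Together, the `actor.py` and `future.py` changes above define the protocol every rewritten action now follows: yield a `Future` first, push intermediate results through `add_response()`, poll `check_cancelled()` between steps, and finish with `set_done()`; the actor forwards responses and exceptions back over its response queue. A minimal, self-contained sketch of that flow, using a simplified `_Future` stand-in (only the methods visible in this diff) rather than the add-on's real `generator_process.future.Future`:

```python
# Simplified stand-in mirroring only add_response, add_response_callback,
# set_done and check_cancelled as used in this diff.
from typing import Any, Callable, Generator

class _Future:
    def __init__(self):
        self._callbacks: list[Callable[["_Future", Any], None]] = []
        self.done = False
        self.check_cancelled: Callable[[], bool] = lambda: False

    def add_response_callback(self, cb):
        self._callbacks.append(cb)

    def add_response(self, response):
        for cb in self._callbacks:
            cb(self, response)

    def set_done(self):
        self.done = True

def toy_action(steps: int) -> Generator[_Future, None, None]:
    future = _Future()
    yield future                      # hand the Future to the caller before doing any work
    for i in range(steps):
        if future.check_cancelled():  # the frontend can cancel between steps
            break
        future.add_response(f"preview for step {i}")
    future.set_done()

# Consumer side: take the Future from the generator, register callbacks, then drain it,
# the same way outpaint() drives inpaint() above.
gen = toy_action(3)
future = next(gen)
future.add_response_callback(lambda _, res: print(res))
for _ in gen:
    pass
assert future.done
```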
- """ - def __init__(self, message, fix_it: Callable[[Any, Any], None]): - super().__init__(message) - - self._fix_it = fix_it - - def draw(self, context, layout): - self._fix_it(context, layout) \ No newline at end of file diff --git a/generator_process/models/image_generation_result.py b/generator_process/models/image_generation_result.py new file mode 100644 index 00000000..bdff8bfc --- /dev/null +++ b/generator_process/models/image_generation_result.py @@ -0,0 +1,87 @@ +from typing import List +import math +from dataclasses import dataclass +from numpy.typing import NDArray +import numpy as np +from ...api.models.step_preview_mode import StepPreviewMode + +@dataclass +class ImageGenerationResult: + images: List[NDArray] + seeds: List[int] + step: int + final: bool + total: int | None = None + + @staticmethod + def step_preview(pipe, mode, width, height, latents, generator, iteration): + from PIL import Image, ImageOps + seeds = [gen.initial_seed() for gen in generator] if isinstance(generator, list) else [generator.initial_seed()] + match mode: + case StepPreviewMode.FAST: + return ImageGenerationResult( + [np.asarray(ImageOps.flip(Image.fromarray(approximate_decoded_latents(latents[-1:]))).resize((width, height), Image.Resampling.NEAREST).convert('RGBA'), dtype=np.float32) / 255.], + seeds[-1:], + iteration, + False + ) + case StepPreviewMode.FAST_BATCH: + return ImageGenerationResult( + [ + np.asarray(ImageOps.flip(Image.fromarray(approximate_decoded_latents(latents[i:i + 1]))).resize((width, height), Image.Resampling.NEAREST).convert('RGBA'), + dtype=np.float32) / 255. + for i in range(latents.size(0)) + ], + seeds, + iteration, + False + ) + case StepPreviewMode.ACCURATE: + return ImageGenerationResult( + [np.asarray(ImageOps.flip(pipe.numpy_to_pil(pipe.decode_latents(latents[-1:]))[0]).convert('RGBA'), + dtype=np.float32) / 255.], + seeds[-1:], + iteration, + False + ) + case StepPreviewMode.ACCURATE_BATCH: + return ImageGenerationResult( + [ + np.asarray(ImageOps.flip(image).convert('RGBA'), dtype=np.float32) / 255. + for image in pipe.numpy_to_pil(pipe.decode_latents(latents)) + ], + seeds, + iteration, + False + ) + return ImageGenerationResult( + [], + seeds, + iteration, + False + ) + +def approximate_decoded_latents(latents): + """ + Approximate the decoded latents without using the VAE. 
+ """ + import torch + # origingally adapted from code by @erucipe and @keturn here: + # https://discuss.huggingface.co/t/decoding-latents-to-rgb-without-upscaling/23204/7 + + # these updated numbers for v1.5 are from @torridgristle + v1_5_latent_rgb_factors = torch.tensor([ + # R G B + [ 0.3444, 0.1385, 0.0670], # L1 + [ 0.1247, 0.4027, 0.1494], # L2 + [-0.3192, 0.2513, 0.2103], # L3 + [-0.1307, -0.1874, -0.7445] # L4 + ], dtype=latents.dtype, device=latents.device) + + latent_image = latents[0].permute(1, 2, 0) @ v1_5_latent_rgb_factors + latents_ubyte = (((latent_image + 1) / 2) + .clamp(0, 1) # change scale from -1..1 to 0..1 + .mul(0xFF) # to 0..255 + .byte()).cpu() + + return latents_ubyte.numpy() \ No newline at end of file diff --git a/generator_process/models/model_config.py b/generator_process/models/model_config.py new file mode 100644 index 00000000..c0f2c8fa --- /dev/null +++ b/generator_process/models/model_config.py @@ -0,0 +1,60 @@ +import enum + +from ...absolute_path import absolute_path + + +class ModelConfig(enum.Enum): + AUTO_DETECT = "auto-detect" + STABLE_DIFFUSION_1 = "v1" + STABLE_DIFFUSION_2_BASE = "v2 (512, epsilon)" + STABLE_DIFFUSION_2 = "v2 (768, v_prediction)" + STABLE_DIFFUSION_2_DEPTH = "v2 (depth)" + STABLE_DIFFUSION_2_INPAINTING = "v2 (inpainting)" + STABLE_DIFFUSION_XL_BASE = "XL (base)" + STABLE_DIFFUSION_XL_REFINER = "XL (refiner)" + CONTROL_NET_1_5 = "1.5 (ControlNet)" + CONTROL_NET_2_1 = "2.1 (ControlNet)" + + @property + def original_config(self): + match self: + case ModelConfig.AUTO_DETECT: + return None + case ModelConfig.STABLE_DIFFUSION_1: + return absolute_path("sd_configs/v1-inference.yaml") + case ModelConfig.STABLE_DIFFUSION_2_BASE: + return absolute_path("sd_configs/v2-inference.yaml") + case ModelConfig.STABLE_DIFFUSION_2: + return absolute_path("sd_configs/v2-inference-v.yaml") + case ModelConfig.STABLE_DIFFUSION_2_DEPTH: + return absolute_path("sd_configs/v2-midas-inference.yaml") + case ModelConfig.STABLE_DIFFUSION_2_INPAINTING: + return absolute_path("sd_configs/v2-inpainting-inference.yaml") + case ModelConfig.STABLE_DIFFUSION_XL_BASE: + return absolute_path("sd_configs/sd_xl_base.yaml") + case ModelConfig.STABLE_DIFFUSION_XL_REFINER: + return absolute_path("sd_configs/sd_xl_refiner.yaml") + case ModelConfig.CONTROL_NET_1_5: + return absolute_path("sd_configs/cldm_v15.yaml") + case ModelConfig.CONTROL_NET_2_1: + return absolute_path("sd_configs/cldm_v21.yaml") + + @property + def pipeline(self): + # allows for saving with correct _class_name in model_index.json and necessary for some models to import + import diffusers + match self: + case ModelConfig.AUTO_DETECT: + return None + case ModelConfig.STABLE_DIFFUSION_2_DEPTH: + return diffusers.StableDiffusionDepth2ImgPipeline + case ModelConfig.STABLE_DIFFUSION_2_INPAINTING: + return diffusers.StableDiffusionInpaintPipeline + case ModelConfig.STABLE_DIFFUSION_XL_BASE: + return diffusers.StableDiffusionXLPipeline + case ModelConfig.STABLE_DIFFUSION_XL_REFINER: + return diffusers.StableDiffusionXLImg2ImgPipeline + case ModelConfig.CONTROL_NET_1_5 | ModelConfig.CONTROL_NET_2_1: + return diffusers.ControlNetModel + case _: + return diffusers.StableDiffusionPipeline diff --git a/generator_process/models/model_type.py b/generator_process/models/model_type.py new file mode 100644 index 00000000..39639917 --- /dev/null +++ b/generator_process/models/model_type.py @@ -0,0 +1,90 @@ +import enum + +from ...api.models.task import * +from .model_config import ModelConfig + + +class 
ModelType(enum.IntEnum): + """ + Inferred model type from the U-Net `in_channels`. + """ + UNKNOWN = 0 + PROMPT_TO_IMAGE = 4 + DEPTH = 5 + UPSCALING = 7 + INPAINTING = 9 + + CONTROL_NET = -1 + UNSPECIFIED_CHECKPOINT = -2 + + @classmethod + def _missing_(cls, _): + return cls.UNKNOWN + + def recommended_model(self) -> str: + """Provides a recommended model for a given task. + + This method has a bias towards the latest version of official Stability AI models. + """ + match self: + case ModelType.PROMPT_TO_IMAGE: + return "stabilityai/stable-diffusion-2-1" + case ModelType.DEPTH: + return "stabilityai/stable-diffusion-2-depth" + case ModelType.UPSCALING: + return "stabilityai/stable-diffusion-x4-upscaler" + case ModelType.INPAINTING: + return "stabilityai/stable-diffusion-2-inpainting" + case _: + return "stabilityai/stable-diffusion-2-1" + + def matches_task(self, task: Task) -> bool: + """Indicates if the model type is correct for a given `Task`. + + If not, an error should be shown to the user to select a different model. + """ + if self == ModelType.UNSPECIFIED_CHECKPOINT: + return True + match task: + case PromptToImage(): + return self == ModelType.PROMPT_TO_IMAGE + case Inpaint(): + return self == ModelType.INPAINTING + case DepthToImage(): + return self == ModelType.DEPTH + case Outpaint(): + return self == ModelType.INPAINTING + case ImageToImage(): + return self == ModelType.PROMPT_TO_IMAGE + case _: + return False + + @staticmethod + def from_task(task: Task) -> 'ModelType | None': + match task: + case PromptToImage(): + return ModelType.PROMPT_TO_IMAGE + case Inpaint(): + return ModelType.INPAINTING + case DepthToImage(): + return ModelType.DEPTH + case Outpaint(): + return ModelType.INPAINTING + case ImageToImage(): + return ModelType.PROMPT_TO_IMAGE + case _: + return None + + @staticmethod + def from_config(config: ModelConfig): + match config: + case ModelConfig.AUTO_DETECT: + return ModelType.UNSPECIFIED_CHECKPOINT + case ModelConfig.STABLE_DIFFUSION_2_DEPTH: + return ModelType.DEPTH + case ModelConfig.STABLE_DIFFUSION_2_INPAINTING: + return ModelType.INPAINTING + case ModelConfig.CONTROL_NET_1_5 | ModelConfig.CONTROL_NET_2_1: + return ModelType.CONTROL_NET + case _: + return ModelType.PROMPT_TO_IMAGE diff --git a/generator_process/models/optimizations.py b/generator_process/models/optimizations.py new file mode 100644 index 00000000..903c8c72 --- /dev/null +++ b/generator_process/models/optimizations.py @@ -0,0 +1,173 @@ +from enum import Enum +from typing import Annotated, Union, _AnnotatedAlias +import functools +import os +import sys +from dataclasses import dataclass + +from .upscale_tiler import tiled_decode_latents + + +class CPUOffload(Enum): + OFF = "off" + MODEL = "model" + SUBMODULE = "submodule" + + def __bool__(self): + return self != CPUOffload.OFF + + +@dataclass(eq=True) +class Optimizations: + attention_slicing: bool = True + attention_slice_size: Union[str, int] = "auto" + cudnn_benchmark: Annotated[bool, "cuda"] = False + tf32: Annotated[bool, "cuda"] = False + amp: Annotated[bool, "cuda"] = False + half_precision: Annotated[bool, {"cuda", "privateuseone"}] = True + cpu_offload: Annotated[str, {"cuda", "privateuseone"}] = CPUOffload.OFF + channels_last_memory_format: bool = False + sdp_attention: Annotated[bool, {"cpu", "cuda", "mps"}] = True + batch_size: int = 1 + vae_slicing: bool = True + vae_tiling: str = "off" + vae_tile_size: int = 512 + vae_tile_blend: int = 64 + cfg_end: float = 1.0 + + cpu_only: bool = False + + @staticmethod + def infer_device() 
-> str: + from ...absolute_path import absolute_path + if sys.platform == "darwin": + return "mps" + elif os.path.exists(absolute_path(".python_dependencies/torch_directml")): + return "privateuseone" + else: + return "cuda" + + @classmethod + def device_supports(cls, property, device) -> bool: + annotation = cls.__annotations__.get(property, None) + if isinstance(annotation, _AnnotatedAlias): + opt_dev = annotation.__metadata__[0] + if isinstance(opt_dev, str): + return opt_dev == device + return device in opt_dev + return annotation is not None + + def can_use(self, property, device) -> bool: + return self.device_supports(property, device) and getattr(self, property) + + def can_use_half(self, device): + if self.half_precision and device == "cuda": + import torch + name = torch.cuda.get_device_name() + return not ("GTX 1650" in name or "GTX 1660" in name) + return self.can_use("half_precision", device) + + def cpu_offloading(self, device): + return self.cpu_offload if self.device_supports("cpu_offload", device) else CPUOffload.OFF + + def apply(self, pipeline, device): + """ + Apply the optimizations to a diffusers pipeline. + + All exceptions are ignored to make this more general purpose across different pipelines. + """ + import torch + + if not self.cpu_offloading(device): + pipeline = pipeline.to(device) + + torch.backends.cudnn.benchmark = self.can_use("cudnn_benchmark", device) + torch.backends.cuda.matmul.allow_tf32 = self.can_use("tf32", device) + + try: + if self.can_use("sdp_attention", device): + from diffusers.models.attention_processor import AttnProcessor2_0 + pipeline.unet.set_attn_processor(AttnProcessor2_0()) + elif self.can_use("attention_slicing", device): + pipeline.enable_attention_slicing(self.attention_slice_size) + else: + pipeline.disable_attention_slicing() # will also disable AttnProcessor2_0 + except: pass + + try: + if pipeline.device != pipeline._execution_device: + pass # pipeline is already offloaded, offloading again can cause `pipeline._execution_device` to be incorrect + elif self.cpu_offloading(device) == CPUOffload.MODEL: + # adapted from diffusers.StableDiffusionPipeline.enable_model_cpu_offload() to allow DirectML device and unimplemented pipelines + from accelerate import cpu_offload_with_hook + + hook = None + models = [] + # text_encoder can be None in SDXL Pipeline but not text_encoder_2 + if pipeline.text_encoder is not None: + models.append(pipeline.text_encoder) + if hasattr(pipeline, "text_encoder_2"): + models.append(pipeline.text_encoder_2) + models.extend([pipeline.unet, pipeline.vae]) + if hasattr(pipeline, "controlnet"): + models.append(pipeline.controlnet) + for cpu_offloaded_model in models: + _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) + + if getattr(pipeline, "safety_checker", None) is not None: + _, hook = cpu_offload_with_hook(pipeline.safety_checker, device, prev_module_hook=hook) + + # We'll offload the last model manually. 
+ pipeline.final_offload_hook = hook + elif self.cpu_offloading(device) == CPUOffload.SUBMODULE: + # adapted from diffusers.StableDiffusionPipeline.enable_sequential_cpu_offload() to allow DirectML device and unimplemented pipelines + from accelerate import cpu_offload + + models = [] + # text_encoder can be None in SDXL Pipeline but not text_encoder_2 + if pipeline.text_encoder is not None: + models.append(pipeline.text_encoder) + if hasattr(pipeline, "text_encoder_2"): + models.append(pipeline.text_encoder_2) + models.extend([pipeline.unet, pipeline.vae]) + if hasattr(pipeline, "controlnet"): + models.append(pipeline.controlnet) + for cpu_offloaded_model in models: + cpu_offload(cpu_offloaded_model, device) + + if getattr(pipeline, "safety_checker", None) is not None: + cpu_offload(pipeline.safety_checker, device, offload_buffers=True) + except: pass + + try: + if self.can_use("channels_last_memory_format", device): + pipeline.unet.to(memory_format=torch.channels_last) + else: + pipeline.unet.to(memory_format=torch.contiguous_format) + except: pass + + try: + if self.can_use("vae_slicing", device): + # Not many pipelines implement the enable_vae_slicing()/disable_vae_slicing() + # methods but all they do is forward their call to the vae anyway. + pipeline.vae.enable_slicing() + else: + pipeline.vae.disable_slicing() + except: pass + + try: + if self.vae_tiling != "off": + if not isinstance(pipeline.vae.decode, functools.partial): + pipeline.vae.decode = functools.partial(tiled_decode_latents.__get__(pipeline), pre_patch=pipeline.vae.decode) + pipeline.vae.decode.keywords['optimizations'] = self + elif self.vae_tiling == "off" and isinstance(pipeline.vae.decode, functools.partial): + pipeline.vae.decode = pipeline.vae.decode.keywords["pre_patch"] + except: pass + + from .. 
import directml_patches + if device == "privateuseone": + directml_patches.enable(pipeline) + else: + directml_patches.disable(pipeline) + + return pipeline \ No newline at end of file diff --git a/generator_process/models/pipeline.py b/generator_process/models/pipeline.py deleted file mode 100644 index 59474112..00000000 --- a/generator_process/models/pipeline.py +++ /dev/null @@ -1,72 +0,0 @@ -import enum -import os - -class Pipeline(enum.IntEnum): - STABLE_DIFFUSION = 0 - - STABILITY_SDK = 1 - - @staticmethod - def local_available(): - from ...absolute_path import absolute_path - return os.path.exists(absolute_path(".python_dependencies/diffusers")) - - @staticmethod - def directml_available(): - from ...absolute_path import absolute_path - return os.path.exists(absolute_path(".python_dependencies/torch_directml")) - - def __str__(self): - return self.name - - def model(self): - return True - - def init_img_actions(self): - match self: - case Pipeline.STABLE_DIFFUSION: - return ['modify', 'inpaint', 'outpaint'] - case Pipeline.STABILITY_SDK: - return ['modify', 'inpaint'] - - def inpaint_mask_sources(self): - match self: - case Pipeline.STABLE_DIFFUSION: - return ['alpha', 'prompt'] - case Pipeline.STABILITY_SDK: - return ['alpha'] - - def color_correction(self): - match self: - case Pipeline.STABLE_DIFFUSION: - return True - case Pipeline.STABILITY_SDK: - return False - - def negative_prompts(self): - match self: - case Pipeline.STABLE_DIFFUSION: - return True - case Pipeline.STABILITY_SDK: - return False - - def seamless(self): - match self: - case Pipeline.STABLE_DIFFUSION: - return True - case Pipeline.STABILITY_SDK: - return False - - def upscaling(self): - match self: - case Pipeline.STABLE_DIFFUSION: - return True - case Pipeline.STABILITY_SDK: - return False - - def depth(self): - match self: - case Pipeline.STABLE_DIFFUSION: - return True - case Pipeline.STABILITY_SDK: - return False \ No newline at end of file diff --git a/generator_process/models/scheduler.py b/generator_process/models/scheduler.py new file mode 100644 index 00000000..20602d60 --- /dev/null +++ b/generator_process/models/scheduler.py @@ -0,0 +1,57 @@ +import enum + +class Scheduler(enum.Enum): + DDIM = "DDIM" + DDPM = "DDPM" + DEIS_MULTISTEP = "DEIS Multistep" + DPM_SOLVER_MULTISTEP = "DPM Solver Multistep" + DPM_SOLVER_MULTISTEP_KARRAS = "DPM Solver Multistep Karras" + DPM_SOLVER_SINGLESTEP = "DPM Solver Singlestep" + DPM_SOLVER_SINGLESTEP_KARRAS = "DPM Solver Singlestep Karras" + EULER_DISCRETE = "Euler Discrete" + EULER_DISCRETE_KARRAS = "Euler Discrete Karras" + EULER_ANCESTRAL_DISCRETE = "Euler Ancestral Discrete" + HEUN_DISCRETE = "Heun Discrete" + HEUN_DISCRETE_KARRAS = "Heun Discrete Karras" + KDPM2_DISCRETE = "KDPM2 Discrete" # Non-functional on mps + KDPM2_ANCESTRAL_DISCRETE = "KDPM2 Ancestral Discrete" + LMS_DISCRETE = "LMS Discrete" + LMS_DISCRETE_KARRAS = "LMS Discrete Karras" + PNDM = "PNDM" + UNIPC_MULTISTEP = "UniPC Multistep" + + def create(self, pipeline): + import diffusers + def scheduler_class(): + match self: + case Scheduler.DDIM: + return diffusers.schedulers.DDIMScheduler + case Scheduler.DDPM: + return diffusers.schedulers.DDPMScheduler + case Scheduler.DEIS_MULTISTEP: + return diffusers.schedulers.DEISMultistepScheduler + case Scheduler.DPM_SOLVER_MULTISTEP | Scheduler.DPM_SOLVER_MULTISTEP_KARRAS: + return diffusers.schedulers.DPMSolverMultistepScheduler + case Scheduler.DPM_SOLVER_SINGLESTEP | Scheduler.DPM_SOLVER_SINGLESTEP_KARRAS: + return 
diffusers.schedulers.DPMSolverSinglestepScheduler + case Scheduler.EULER_DISCRETE | Scheduler.EULER_DISCRETE_KARRAS: + return diffusers.schedulers.EulerDiscreteScheduler + case Scheduler.EULER_ANCESTRAL_DISCRETE: + return diffusers.schedulers.EulerAncestralDiscreteScheduler + case Scheduler.HEUN_DISCRETE | Scheduler.HEUN_DISCRETE_KARRAS: + return diffusers.schedulers.HeunDiscreteScheduler + case Scheduler.KDPM2_DISCRETE: + return diffusers.schedulers.KDPM2DiscreteScheduler + case Scheduler.KDPM2_ANCESTRAL_DISCRETE: + return diffusers.schedulers.KDPM2AncestralDiscreteScheduler + case Scheduler.LMS_DISCRETE | Scheduler.LMS_DISCRETE_KARRAS: + return diffusers.schedulers.LMSDiscreteScheduler + case Scheduler.PNDM: + return diffusers.schedulers.PNDMScheduler + case Scheduler.UNIPC_MULTISTEP: + return diffusers.schedulers.UniPCMultistepScheduler + original_config = getattr(pipeline.scheduler, "_original_config", pipeline.scheduler.config) + scheduler = scheduler_class().from_config(original_config, use_karras_sigmas=self.name.endswith("KARRAS")) + scheduler._original_config = original_config + pipeline.scheduler = scheduler + return scheduler diff --git a/generator_process/models/upscale_tiler.py b/generator_process/models/upscale_tiler.py index 608f3933..bf561170 100644 --- a/generator_process/models/upscale_tiler.py +++ b/generator_process/models/upscale_tiler.py @@ -163,7 +163,7 @@ def __len__(self): return len(self.x_tiles) * len(self.y_tiles) -def tiled_decode_latents(self, latents, *, pre_patch, optimizations): +def tiled_decode_latents(self, latents, return_dict=False, *, pre_patch, optimizations): # not all pipelines (namely upscale) have the vae_scale_factor attribute vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) default_size = self.unet.config.sample_size * vae_scale_factor @@ -196,10 +196,16 @@ def tiled_decode_latents(self, latents, *, pre_patch, optimizations): configure_model_padding(self.vae, seamless_axes & ~tiler.seamless_axes) for id, tile in tiler: - tiler[id] = pre_patch(tile.permute(2, 0, 1).unsqueeze(0)).squeeze(0) - images.append(np.expand_dims(tiler.combined(), 0)) + tiler[id] = pre_patch(tile.permute(2, 0, 1).unsqueeze(0)).sample.squeeze(0).permute(1, 2, 0).cpu().numpy() + images.append(np.expand_dims(tiler.combined(), 0).transpose(0, 3, 1, 2)) configure_model_padding(self.vae, seamless_axes) - return np.concatenate(images) + images = np.concatenate(images) + import torch + images = torch.from_numpy(images) + if not return_dict: + return (images,) + from diffusers.models.vae import DecoderOutput + return DecoderOutput(images) def configure_model_padding(model, seamless_axes): import torch.nn as nn diff --git a/operators/dream_texture.py b/operators/dream_texture.py index 62e6953d..aec1a2a5 100644 --- a/operators/dream_texture.py +++ b/operators/dream_texture.py @@ -1,17 +1,15 @@ import bpy import hashlib import numpy as np -import math -from typing import Literal +from typing import List, Literal from .notify_result import NotifyResult -from ..preferences import StableDiffusionPreferences from ..pil_to_image import * from ..prompt_engineering import * from ..generator_process import Generator -from ..generator_process.actions.prompt_to_image import ImageGenerationResult, Pipeline -from ..generator_process.actions.huggingface_hub import ModelType +from .. 
import api import time +import math def bpy_image(name, width, height, pixels, existing_image): if existing_image is not None and (existing_image.size[0] != width or existing_image.size[1] != height): @@ -53,22 +51,28 @@ class DreamTexture(bpy.types.Operator): @classmethod def poll(cls, context): try: - context.scene.dream_textures_prompt.validate(context) + prompt = context.scene.dream_textures_prompt + backend: api.Backend = prompt.get_backend() + backend.validate(prompt.generate_args(context)) except: return False return Generator.shared().can_use() def execute(self, context): + screen = context.screen + scene = context.scene + prompt = scene.dream_textures_prompt + backend: api.Backend = prompt.get_backend() + history_template = {prop: getattr(context.scene.dream_textures_prompt, prop) for prop in context.scene.dream_textures_prompt.__annotations__.keys()} history_template["iterations"] = 1 history_template["random_seed"] = False + is_file_batch = context.scene.dream_textures_prompt.prompt_structure == file_batch_structure.id file_batch_lines = [] - file_batch_lines_negative = [] if is_file_batch: context.scene.dream_textures_prompt.iterations = 1 file_batch_lines = [line.body for line in context.scene.dream_textures_prompt_file.lines if len(line.body.strip()) > 0] - file_batch_lines_negative = [""] * len(file_batch_lines) history_template["prompt_structure"] = custom_structure.id node_tree = context.material.node_tree if hasattr(context, 'material') and hasattr(context.material, 'node_tree') else None @@ -77,45 +81,58 @@ def execute(self, context): screen = context.screen scene = context.scene - generated_args = scene.dream_textures_prompt.generate_args() + generated_args = scene.dream_textures_prompt.generate_args(context) context.scene.seamless_result.update_args(generated_args) context.scene.seamless_result.update_args(history_template, as_id=True) - init_image = None - if generated_args['use_init_img']: - init_image = get_source_image(context, generated_args['init_img_src']) + # Setup the progress indicator + bpy.types.Scene.dream_textures_progress = bpy.props.IntProperty(name="", default=0, min=0, max=generated_args.steps) + scene.dream_textures_info = "Starting..." + + # Get any init images + try: + init_image = get_source_image(context, prompt.init_img_src) + except ValueError: + init_image = None if init_image is not None: init_image = np.flipud( (np.array(init_image.pixels) * 255) .astype(np.uint8) .reshape((init_image.size[1], init_image.size[0], init_image.channels)) ) + + control_images = None + if len(prompt.control_nets) > 0: + control_images = [ + np.flipud( + np.array(net.control_image.pixels) + .reshape((net.control_image.size[1], net.control_image.size[0], net.control_image.channels)) + ) + for net in prompt.control_nets + ] - # Setup the progress indicator - bpy.types.Scene.dream_textures_progress = bpy.props.IntProperty(name="", default=0, min=0, max=generated_args['steps']) - scene.dream_textures_info = "Starting..." 
- + # Callbacks last_data_block = None execution_start = time.time() - def step_callback(_, step_image: ImageGenerationResult): + def step_callback(progress: List[api.GenerationResult]) -> bool: nonlocal last_data_block scene.dream_textures_last_execution_time = f"{time.time() - execution_start:.2f} seconds" - if step_image.final: - return - scene.dream_textures_progress = step_image.step + scene.dream_textures_progress = progress[-1].progress for area in context.screen.areas: for region in area.regions: if region.type == "UI": region.tag_redraw() - if len(step_image.images) > 0: - image = step_image.tile_images() - last_data_block = bpy_image(f"Step {step_image.step}/{generated_args['steps']}", image.shape[1], image.shape[0], image.ravel(), last_data_block) - for area in screen.areas: - if area.type == 'IMAGE_EDITOR' and not area.spaces.active.use_image_pin: - area.spaces.active.image = last_data_block + image = api.GenerationResult.tile_images(progress) + if image is None: + return CancelGenerator.should_continue + last_data_block = bpy_image(f"Step {progress[-1].progress}/{progress[-1].total}", image.shape[1], image.shape[0], image.ravel(), last_data_block) + for area in screen.areas: + if area.type == 'IMAGE_EDITOR' and not area.spaces.active.use_image_pin: + area.spaces.active.image = last_data_block + return CancelGenerator.should_continue iteration = 0 - iteration_limit = len(file_batch_lines) if is_file_batch else generated_args['iterations'] + iteration_limit = len(file_batch_lines) if is_file_batch else generated_args.iterations iteration_square = math.ceil(math.sqrt(iteration_limit)) node_pad = np.array((20, 20)) node_size = np.array((240, 277)) + node_pad @@ -123,147 +140,76 @@ def step_callback(_, step_image: ImageGenerationResult): # keep image nodes grid centered but don't go beyond top and left sides of nodes editor node_anchor = node_tree_center + node_size * 0.5 * (-iteration_square, (iteration_limit-1) // iteration_square + 1) node_anchor = np.array((np.maximum(node_tree_top_left[0], node_anchor[0]), np.minimum(node_tree_top_left[1], node_anchor[1]))) + node_pad * (0.5, -0.5) - def done_callback(future): - nonlocal last_data_block - nonlocal iteration - if hasattr(gen, '_active_generation_future'): - del gen._active_generation_future - result: ImageGenerationResult = future.result(last_only=True) - for i, result_image in enumerate(result.images): - seed = result.seeds[i] - prompt_string = context.scene.dream_textures_prompt.prompt_structure_token_subject - seed_str_length = len(str(seed)) - trim_aware_name = (prompt_string[:54 - seed_str_length] + '..') if len(prompt_string) > 54 else prompt_string - name_with_trimmed_prompt = f"{trim_aware_name} ({seed})" - image = bpy_image(name_with_trimmed_prompt, result_image.shape[1], result_image.shape[0], result_image.ravel(), last_data_block) - last_data_block = None - if node_tree is not None: - nodes = node_tree.nodes - texture_node = nodes.new("ShaderNodeTexImage") - texture_node.image = image - texture_node.location = node_anchor + node_size * ((iteration % iteration_square), -(iteration // iteration_square)) - nodes.active = texture_node - for area in screen.areas: - if area.type == 'IMAGE_EDITOR' and not area.spaces.active.use_image_pin: - area.spaces.active.image = image - scene.dream_textures_prompt.seed = str(seed) # update property in case seed was sourced randomly or from hash - # create a hash from the Blender image datablock to use as unique ID of said image and store it in the prompt history - # and as custom 
property of the image. Needs to be a string because the int from the hash function is too large - image_hash = hashlib.sha256((np.array(image.pixels) * 255).tobytes()).hexdigest() - image['dream_textures_hash'] = image_hash - scene.dream_textures_prompt.hash = image_hash - history_entry = context.scene.dream_textures_history.add() - for key, value in history_template.items(): - match key: - case 'control_nets': - for net in value: - n = history_entry.control_nets.add() - for prop in n.__annotations__.keys(): - setattr(n, prop, getattr(net, prop)) - case _: - setattr(history_entry, key, value) - history_entry.seed = str(seed) - history_entry.hash = image_hash - history_entry.width = result_image.shape[1] - history_entry.height = result_image.shape[0] - if is_file_batch: - history_entry.prompt_structure_token_subject = file_batch_lines[iteration] - iteration += 1 - if iteration < iteration_limit and not future.cancelled: - generate_next() - else: + + def callback(results: List[api.GenerationResult] | Exception): + if isinstance(results, Exception): scene.dream_textures_info = "" scene.dream_textures_progress = 0 - - def exception_callback(_, exception): - scene.dream_textures_info = "" - scene.dream_textures_progress = 0 - if hasattr(gen, '_active_generation_future'): - del gen._active_generation_future - eval('bpy.ops.' + NotifyResult.bl_idname)('INVOKE_DEFAULT', exception=repr(exception)) - raise exception - - original_prompt = generated_args["prompt"] - original_negative_prompt = generated_args["negative_prompt"] - gen = Generator.shared() - def generate_next(): - batch_size = min(generated_args["optimizations"].batch_size, iteration_limit-iteration) - if generated_args['pipeline'] == Pipeline.STABILITY_SDK: - # Stability SDK is able to accept a list of prompts, but I can - # only ever get it to generate multiple of the first one. - batch_size = 1 - if is_file_batch: - generated_args["prompt"] = file_batch_lines[iteration: iteration+batch_size] - generated_args["negative_prompt"] = file_batch_lines_negative[iteration: iteration+batch_size] - else: - generated_args["prompt"] = [original_prompt] * batch_size - generated_args["negative_prompt"] = [original_negative_prompt] * batch_size - if len(generated_args['control_net']) > 0: - f = gen.control_net( - image=init_image, - inpaint=generated_args['init_img_action'] == 'inpaint', - **generated_args - ) - elif init_image is not None: - match generated_args['init_img_action']: - case 'modify': - models = list(filter( - lambda m: m.model_base == generated_args['model'], - context.preferences.addons[StableDiffusionPreferences.bl_idname].preferences.installed_models - )) - supports_depth = generated_args['pipeline'].depth() and len(models) > 0 and ModelType[models[0].model_type] == ModelType.DEPTH - def require_depth(): - if not supports_depth: - raise ValueError("Selected pipeline and model do not support depth conditioning. 
Please select a different model, such as 'stable-diffusion-2-depth' or change the 'Image Type' to 'Color'.") - match generated_args['modify_action_source_type']: - case 'color': - f = gen.image_to_image( - image=init_image, - **generated_args - ) - case 'depth_generated': - require_depth() - f = gen.depth_to_image( - image=init_image, - depth=None, - **generated_args, - ) - case 'depth_map': - require_depth() - f = gen.depth_to_image( - image=init_image, - depth=np.array(scene.init_depth.pixels) - .astype(np.float32) - .reshape((scene.init_depth.size[1], scene.init_depth.size[0], scene.init_depth.channels)), - **generated_args, - ) - case 'depth': - require_depth() - f = gen.depth_to_image( - image=None, - depth=np.flipud(init_image.astype(np.float32) / 255.), - **generated_args, - ) - case 'inpaint': - f = gen.inpaint( - image=init_image, - **generated_args - ) - case 'outpaint': - f = gen.outpaint( - image=init_image, - **generated_args - ) + CancelGenerator.should_continue = None + if not isinstance(results, InterruptedError): # this is a user-initiated cancellation + eval('bpy.ops.' + NotifyResult.bl_idname)('INVOKE_DEFAULT', exception=repr(results)) + raise results else: - f = gen.prompt_to_image( - **generated_args, - ) - gen._active_generation_future = f - f.call_done_on_exception = False - f.add_response_callback(step_callback) - f.add_exception_callback(exception_callback) - f.add_done_callback(done_callback) + nonlocal last_data_block + nonlocal iteration + for result in results: + if result.image is None or result.seed is None: + continue + + # Create a trimmed image name + prompt_string = context.scene.dream_textures_prompt.prompt_structure_token_subject + seed_str_length = len(str(result.seed)) + trim_aware_name = (prompt_string[:54 - seed_str_length] + '..') if len(prompt_string) > 54 else prompt_string + name_with_trimmed_prompt = f"{trim_aware_name} ({result.seed})" + image = bpy_image(name_with_trimmed_prompt, result.image.shape[1], result.image.shape[0], result.image.ravel(), last_data_block) + last_data_block = None + if node_tree is not None: + nodes = node_tree.nodes + texture_node = nodes.new("ShaderNodeTexImage") + texture_node.image = image + texture_node.location = node_anchor + node_size * ((iteration % iteration_square), -(iteration // iteration_square)) + nodes.active = texture_node + for area in screen.areas: + if area.type == 'IMAGE_EDITOR' and not area.spaces.active.use_image_pin: + area.spaces.active.image = image + scene.dream_textures_prompt.seed = str(result.seed) # update property in case seed was sourced randomly or from hash + # create a hash from the Blender image datablock to use as unique ID of said image and store it in the prompt history + # and as custom property of the image. 
Needs to be a string because the int from the hash function is too large + image_hash = hashlib.sha256((np.array(image.pixels) * 255).tobytes()).hexdigest() + image['dream_textures_hash'] = image_hash + scene.dream_textures_prompt.hash = image_hash + history_entry = context.scene.dream_textures_history.add() + for key, value in history_template.items(): + match key: + case 'control_nets': + for net in value: + n = history_entry.control_nets.add() + for prop in n.__annotations__.keys(): + setattr(n, prop, getattr(net, prop)) + case _: + setattr(history_entry, key, value) + history_entry.seed = str(result.seed) + history_entry.hash = image_hash + history_entry.width = result.image.shape[1] + history_entry.height = result.image.shape[0] + if is_file_batch: + history_entry.prompt_structure_token_subject = file_batch_lines[iteration] + iteration += 1 + if iteration < iteration_limit: + generate_next() + else: + scene.dream_textures_info = "" + scene.dream_textures_progress = 0 + CancelGenerator.should_continue = None + + # Call the backend + CancelGenerator.should_continue = True # reset global cancellation state + def generate_next(): + args = prompt.generate_args(context, iteration=iteration, init_image=init_image, control_images=control_images) + backend.generate(args, step_callback=step_callback, callback=callback) + generate_next() + return {"FINISHED"} def kill_generator(context=bpy.context): @@ -271,6 +217,7 @@ def kill_generator(context=bpy.context): try: context.scene.dream_textures_info = "" context.scene.dream_textures_progress = 0 + CancelGenerator.should_continue = None except: pass @@ -290,14 +237,12 @@ class CancelGenerator(bpy.types.Operator): bl_description = "Stops the generator without reloading everything next time" bl_options = {'REGISTER'} + should_continue = None + @classmethod def poll(cls, context): - gen = Generator.shared() - return hasattr(gen, "_active_generation_future") and gen._active_generation_future is not None and not gen._active_generation_future.cancelled and not gen._active_generation_future.done + return cls.should_continue is not None def execute(self, context): - gen = Generator.shared() - gen._active_generation_future.cancel() - context.scene.dream_textures_info = "" - context.scene.dream_textures_progress = 0 + CancelGenerator.should_continue = False return {'FINISHED'} diff --git a/operators/project.py b/operators/project.py index d5a7292d..fc275617 100644 --- a/operators/project.py +++ b/operators/project.py @@ -6,23 +6,24 @@ from bpy_extras import view3d_utils import mathutils import numpy as np +from typing import List from .view_history import ImportPromptFile -from ..property_groups.dream_prompt import pipeline_options from .open_latest_version import OpenLatestVersion, is_force_show_download, new_version_available from ..ui.panels.dream_texture import advanced_panel, create_panel, prompt_panel, size_panel from .dream_texture import CancelGenerator, ReleaseGenerator from .notify_result import NotifyResult -from ..preferences import StableDiffusionPreferences from ..generator_process import Generator -from ..generator_process.models import Pipeline, FixItError -from ..generator_process.actions.huggingface_hub import ModelType +from ..generator_process.models import ModelType +from ..api.models import FixItError import tempfile from ..engine.annotations.depth import render_depth_map +from .. 
import api + framebuffer_arguments = [ ('depth', 'Depth', 'Only provide the scene depth as input'), ('color', 'Depth and Color', 'Provide the scene depth and color as input'), @@ -96,9 +97,8 @@ def draw(self, context): elif new_version_available(): layout.operator(OpenLatestVersion.bl_idname, icon="IMPORT") - layout.prop(context.scene.dream_textures_project_prompt, "pipeline") - if Pipeline[context.scene.dream_textures_project_prompt.pipeline].model(): - layout.prop(context.scene.dream_textures_project_prompt, 'model') + layout.prop(context.scene.dream_textures_project_prompt, "backend") + layout.prop(context.scene.dream_textures_project_prompt, 'model') yield DREAM_PT_dream_panel_projection @@ -137,34 +137,40 @@ def draw(self, context): for obj in context.selected_objects: col.prop_search(obj.data.uv_layers, "active", obj.data, "uv_layers", text=f"{obj.name} Target UVs") - row = layout.row() + row = layout.row(align=True) row.scale_y = 1.5 + if CancelGenerator.poll(context): + row.operator(CancelGenerator.bl_idname, icon="SNAP_FACE", text="") if context.scene.dream_textures_progress <= 0: if context.scene.dream_textures_info != "": - row.label(text=context.scene.dream_textures_info, icon="INFO") + disabled_row = row.row(align=True) + disabled_row.operator(ProjectDreamTexture.bl_idname, text=context.scene.dream_textures_info, icon="INFO") + disabled_row.enabled = False else: - r = row.row() + r = row.row(align=True) r.operator(ProjectDreamTexture.bl_idname, icon="MOD_UVPROJECT") - r.enabled = Pipeline[context.scene.dream_textures_project_prompt.pipeline].depth() and context.object is not None and context.object.mode == 'EDIT' + r.enabled = context.object is not None and context.object.mode == 'EDIT' else: - disabled_row = row.row() + disabled_row = row.row(align=True) disabled_row.use_property_split = True disabled_row.prop(context.scene, 'dream_textures_progress', slider=True) disabled_row.enabled = False - if CancelGenerator.poll(context): - row.operator(CancelGenerator.bl_idname, icon="CANCEL", text="") row.operator(ReleaseGenerator.bl_idname, icon="X", text="") # Validation try: - prompt.validate(context, task=None if context.scene.dream_textures_project_use_control_net else ModelType.DEPTH) _validate_projection(context) + prompt = context.scene.dream_textures_project_prompt + backend: api.Backend = prompt.get_backend() + args = prompt.generate_args(context) + args.task = api.task.PromptToImage() if context.scene.dream_textures_project_use_control_net else api.task.DepthToImage(None, None, 0) + backend.validate(args) except FixItError as e: error_box = layout.box() error_box.use_property_split = False for i, line in enumerate(e.args[0].split('\n')): error_box.label(text=line, icon="ERROR" if i == 0 else "NONE") - e.draw(context, error_box) + e._draw(context.scene.dream_textures_project_prompt, context, error_box) except Exception as e: print(e) return ActionsPanel @@ -235,8 +241,12 @@ class ProjectDreamTexture(bpy.types.Operator): @classmethod def poll(cls, context): try: - context.scene.dream_textures_project_prompt.validate(context, task=None if context.scene.dream_textures_project_use_control_net else ModelType.DEPTH) _validate_projection(context) + prompt = context.scene.dream_textures_project_prompt + backend: api.Backend = prompt.get_backend() + args = prompt.generate_args(context) + args.task = api.task.PromptToImage() if context.scene.dream_textures_project_use_control_net else api.task.DepthToImage(None, None, 0) + backend.validate(args) except: return False return 
Generator.shared().can_use() @@ -354,90 +364,75 @@ def vert_to_uv(v): main_thread=True ) - gen = Generator.shared() - texture = None - def on_response(_, response): + def step_callback(progress: List[api.GenerationResult]) -> bool: nonlocal texture - if response.final: - return - context.scene.dream_textures_progress = response.step + context.scene.dream_textures_progress = progress[-1].progress + image = api.GenerationResult.tile_images(progress) if texture is None: - texture = bpy.data.images.new(name="Step", width=response.images[0].shape[1], height=response.images[0].shape[0]) - texture.name = f"Step {response.step}/{context.scene.dream_textures_project_prompt.steps}" - texture.pixels[:] = response.images[0].ravel() + texture = bpy.data.images.new(name="Step", width=image.shape[1], height=image.shape[0]) + texture.name = f"Step {progress[-1].progress}/{progress[-1].total}" + texture.pixels[:] = image.ravel() texture.update() image_texture_node.image = texture - - def on_done(future): - nonlocal texture - if hasattr(gen, '_active_generation_future'): - del gen._active_generation_future - context.scene.dream_textures_info = "" - context.scene.dream_textures_progress = 0 - generated = future.result() - prompt_subject = context.scene.dream_textures_project_prompt.prompt_structure_token_subject - seed = generated[0].seeds[0] - seed_str_length = len(str(seed)) - trim_aware_name = (prompt_subject[:54 - seed_str_length] + '..') if len(prompt_subject) > 54 else prompt_subject - name_with_trimmed_prompt = f"{trim_aware_name} ({seed})" - - if isinstance(generated, list): - generated = generated[-1] - if texture is None: - texture = bpy.data.images.new(name=name_with_trimmed_prompt, width=generated.images[0].shape[1], height=generated.images[0].shape[0]) - texture.name = name_with_trimmed_prompt - material.name = name_with_trimmed_prompt - texture.pixels[:] = generated.images[0].ravel() - texture.update() - texture.pack() - image_texture_node.image = texture - if context.scene.dream_textures_project_bake: - for bm, src_uv_layer in target_objects: - dest = bpy.data.images.new(name=f"{texture.name} (Baked)", width=texture.size[0], height=texture.size[1]) - - dest_uv_layer = bm.loops.layers.uv.active - src_uvs = np.empty((len(bm.verts), 2), dtype=np.float32) - dest_uvs = np.empty((len(bm.verts), 2), dtype=np.float32) - for face in bm.faces: - for loop in face.loops: - src_uvs[loop.vert.index] = loop[src_uv_layer].uv - dest_uvs[loop.vert.index] = loop[dest_uv_layer].uv - bake(context, bm, generated.images[0].ravel(), dest, src_uvs, dest_uvs) - dest.update() - dest.pack() - image_texture_node.image = dest - - def on_exception(_, exception): - context.scene.dream_textures_info = "" - context.scene.dream_textures_progress = 0 - if hasattr(gen, '_active_generation_future'): - del gen._active_generation_future - eval('bpy.ops.' + NotifyResult.bl_idname)('INVOKE_DEFAULT', exception=repr(exception)) - raise exception + return CancelGenerator.should_continue + + def callback(results: List[api.GenerationResult] | Exception): + CancelGenerator.should_continue = None + if isinstance(results, Exception): + context.scene.dream_textures_info = "" + context.scene.dream_textures_progress = 0 + if not isinstance(results, InterruptedError): # this is a user-initiated cancellation + eval('bpy.ops.' 
+ NotifyResult.bl_idname)('INVOKE_DEFAULT', exception=repr(results)) + raise results + else: + nonlocal texture + context.scene.dream_textures_info = "" + context.scene.dream_textures_progress = 0 + result = results[-1] + prompt_subject = context.scene.dream_textures_project_prompt.prompt_structure_token_subject + seed_str_length = len(str(result.seed)) + trim_aware_name = (prompt_subject[:54 - seed_str_length] + '..') if len(prompt_subject) > 54 else prompt_subject + name_with_trimmed_prompt = f"{trim_aware_name} ({result.seed})" + + if texture is None: + texture = bpy.data.images.new(name=name_with_trimmed_prompt, width=result.image.shape[1], height=result.image.shape[0]) + texture.name = name_with_trimmed_prompt + material.name = name_with_trimmed_prompt + texture.pixels[:] = result.image.ravel() + texture.update() + texture.pack() + image_texture_node.image = texture + if context.scene.dream_textures_project_bake: + for bm, src_uv_layer in target_objects: + dest = bpy.data.images.new(name=f"{texture.name} (Baked)", width=texture.size[0], height=texture.size[1]) + + dest_uv_layer = bm.loops.layers.uv.active + src_uvs = np.empty((len(bm.verts), 2), dtype=np.float32) + dest_uvs = np.empty((len(bm.verts), 2), dtype=np.float32) + for face in bm.faces: + for loop in face.loops: + src_uvs[loop.vert.index] = loop[src_uv_layer].uv + dest_uvs[loop.vert.index] = loop[dest_uv_layer].uv + bake(context, bm, result.image.ravel(), dest, src_uvs, dest_uvs) + dest.update() + dest.pack() + image_texture_node.image = dest + backend: api.Backend = context.scene.dream_textures_project_prompt.get_backend() + context.scene.dream_textures_info = "Starting..." + CancelGenerator.should_continue = True # reset global cancellation state + image_data = bpy.data.images.load(init_img_path) if init_img_path is not None else None + image = np.asarray(image_data.pixels).reshape((*depth.shape, image_data.channels)) if image_data is not None else None if context.scene.dream_textures_project_use_control_net: - generated_args = context.scene.dream_textures_project_prompt.generate_args() - del generated_args['control'] - future = gen.control_net( - control=[np.flipud(depth)], # the depth control needs to be flipped. - image=init_img_path, - inpaint=False, - **generated_args - ) + generated_args: api.GenerationArguments = context.scene.dream_textures_project_prompt.generate_args(context, init_image=image, control_images=[np.flipud(depth)]) + backend.generate(generated_args, step_callback=step_callback, callback=callback) else: - future = gen.depth_to_image( - depth=depth, - image=init_img_path, - **context.scene.dream_textures_project_prompt.generate_args() - ) - gen._active_generation_future = future - future.call_done_on_exception = False - future.add_response_callback(on_response) - future.add_done_callback(on_done) - future.add_exception_callback(on_exception) + generated_args: api.GenerationArguments = context.scene.dream_textures_project_prompt.generate_args(context) + generated_args.task = api.DepthToImage(depth, image, context.scene.dream_textures_project_prompt.strength) + backend.generate(generated_args, step_callback=step_callback, callback=callback) for area in context.screen.areas: if area.type == 'VIEW_3D': diff --git a/operators/upscale.py b/operators/upscale.py index 0a383a05..be15f859 100644 --- a/operators/upscale.py +++ b/operators/upscale.py @@ -1,8 +1,10 @@ import bpy import numpy as np +from typing import List, Literal +from .. 
import api from ..prompt_engineering import custom_structure from ..generator_process import Generator -from ..generator_process.actions.upscale import ImageUpscaleResult +from .dream_texture import CancelGenerator upscale_options = [ ("2", "2x", "", 2), @@ -74,7 +76,7 @@ def step_progress_update(self, context): .reshape((input_image.size[1], input_image.size[0], input_image.channels)) ) - generated_args = context.scene.dream_textures_upscale_prompt.generate_args() + generated_args = context.scene.dream_textures_upscale_prompt.generate_args(context) context.scene.dream_textures_upscale_seamless_result.update_args(generated_args) # Setup the progress indicator @@ -84,49 +86,52 @@ def step_progress_update(self, context): if region.type == "UI": region.tag_redraw() return None - bpy.types.Scene.dream_textures_progress = bpy.props.IntProperty(name="", default=0, min=0, max=generated_args['steps'], update=step_progress_update) + bpy.types.Scene.dream_textures_progress = bpy.props.IntProperty(name="", default=0, min=0, max=generated_args.steps, update=step_progress_update) scene.dream_textures_info = "Starting..." last_data_block = None - def on_tile_complete(_, tile: ImageUpscaleResult): + def step_callback(progress: List[api.GenerationResult]) -> bool: nonlocal last_data_block if last_data_block is None: - bpy.types.Scene.dream_textures_progress = bpy.props.IntProperty(name="", default=tile.tile, min=0, max=tile.total, update=step_progress_update) - if tile.final or tile.image is None: - return + bpy.types.Scene.dream_textures_progress = bpy.props.IntProperty(name="", default=progress[-1].progress, min=0, max=progress[-1].total, update=step_progress_update) - scene.dream_textures_progress = tile.tile - last_data_block = bpy_image(f"Tile {tile.tile}/{tile.total}", tile.image.shape[1], tile.image.shape[0], tile.image.ravel(), last_data_block) + scene.dream_textures_progress = progress[-1].progress + if progress[-1].image is not None: + last_data_block = bpy_image(f"Tile {progress[-1].progress}/{progress[-1].total}", progress[-1].image.shape[1], progress[-1].image.shape[0], progress[-1].image.ravel(), last_data_block) for area in screen.areas: if area.type == 'IMAGE_EDITOR' and not area.spaces.active.use_image_pin: area.spaces.active.image = last_data_block + return CancelGenerator.should_continue - def image_done(future): - nonlocal last_data_block - if last_data_block is not None: - bpy.data.images.remove(last_data_block) - last_data_block = None - tile: ImageUpscaleResult = future.result(last_only=True) - if tile.image is None: - return - image = bpy_image(f"{input_image.name} (Upscaled)", tile.image.shape[1], tile.image.shape[0], tile.image.ravel(), last_data_block) - for area in screen.areas: - if area.type == 'IMAGE_EDITOR' and not area.spaces.active.use_image_pin: - area.spaces.active.image = image - if active_node is not None: - active_node.image = image - scene.dream_textures_info = "" - scene.dream_textures_progress = 0 - gen = Generator.shared() - context.scene.dream_textures_upscale_prompt.prompt_structure = custom_structure.id - f = gen.upscale( - image=image_pixels, - tile_size=context.scene.dream_textures_upscale_tile_size, - blend=context.scene.dream_textures_upscale_blend, - **generated_args + def callback(results: List[api.GenerationResult] | Exception): + if isinstance(results, Exception): + scene.dream_textures_info = "" + scene.dream_textures_progress = 0 + CancelGenerator.should_continue = None + else: + nonlocal last_data_block + if last_data_block is not None: + 
bpy.data.images.remove(last_data_block) + last_data_block = None + if results[-1].image is None: + return + image = bpy_image(f"{input_image.name} (Upscaled)", results[-1].image.shape[1], results[-1].image.shape[0], results[-1].image.ravel(), last_data_block) + for area in screen.areas: + if area.type == 'IMAGE_EDITOR' and not area.spaces.active.use_image_pin: + area.spaces.active.image = image + if active_node is not None: + active_node.image = image + scene.dream_textures_info = "" + scene.dream_textures_progress = 0 + CancelGenerator.should_continue = None + + prompt = context.scene.dream_textures_upscale_prompt + prompt.prompt_structure = custom_structure.id + backend: api.Backend = prompt.get_backend() + generated_args.task = api.models.task.Upscale(image=image_pixels, tile_size=context.scene.dream_textures_upscale_tile_size, blend=context.scene.dream_textures_upscale_blend) + CancelGenerator.should_continue = True + backend.generate( + generated_args, step_callback=step_callback, callback=callback ) - f.add_response_callback(on_tile_complete) - f.add_done_callback(image_done) - gen._active_generation_future = f return {"FINISHED"} \ No newline at end of file diff --git a/operators/view_history.py b/operators/view_history.py index 45440c03..ea5e4025 100644 --- a/operators/view_history.py +++ b/operators/view_history.py @@ -12,7 +12,7 @@ def draw_item(self, context, layout, data, item, icon, active_data, active_propn layout.label(text=f"{item.seed}", translate=False) layout.label(text=f"{item.width}x{item.height}", translate=False) layout.label(text=f"{item.steps} steps", translate=False) - layout.label(text=next(x for x in scheduler_options if x[0] == item.scheduler)[1], translate=False) + layout.label(text=item.scheduler, translate=False) elif self.layout_type == 'GRID': layout.alignment = 'CENTER' layout.label(text="", icon_value=icon) diff --git a/preferences.py b/preferences.py index 8529a490..4c5a8585 100644 --- a/preferences.py +++ b/preferences.py @@ -11,9 +11,8 @@ from .operators.open_latest_version import OpenLatestVersion from .ui.presets import RestoreDefaultPresets, default_presets_missing from .generator_process import Generator -from .generator_process.actions.prompt_to_image import Pipeline -from .generator_process.actions.huggingface_hub import DownloadStatus, ModelType -from .generator_process.actions.convert_original_stable_diffusion_to_diffusers import ModelConfig +from .generator_process.actions.huggingface_hub import DownloadStatus, Model as HubModel +from .generator_process.models import Checkpoint, ModelConfig, ModelType is_downloading = False @@ -30,12 +29,14 @@ def execute(self, context): return {"FINISHED"} _model_config_options = [(m.name, m.value, '') for m in ModelConfig] +import_extensions = ['.ckpt', '.safetensors', '.pth'] +import_extensions_glob = ";".join(import_extensions).replace(".", "*.") class ImportWeights(bpy.types.Operator, ImportHelper): bl_idname = "dream_textures.import_weights" bl_label = "Import Checkpoint File" filename_ext = ".ckpt" filter_glob: bpy.props.StringProperty( - default="*.ckpt", + default=import_extensions_glob, options={'HIDDEN'}, maxlen=255, ) @@ -43,21 +44,28 @@ class ImportWeights(bpy.types.Operator, ImportHelper): name="Model Config", items=_model_config_options ) + prefer_fp16_variant: bpy.props.BoolProperty( + name="Save Half Precision Weights", + default=True + ) def execute(self, context): - _, extension = os.path.splitext(self.filepath) - if extension != '.ckpt': - self.report({"ERROR"}, "Select a valid stable 
diffusion '.ckpt' file.") - return {"FINISHED"} - try: - Generator.shared().convert_original_stable_diffusion_to_diffusers(self.filepath, ModelConfig[self.model_config]).result() - except Exception as e: - self.report({"ERROR"}, """Model conversion failed. Make sure you select the correct model configuration in the sidebar. -Press 'N' or click the gear icon in the top right of the file selection popup to reveal the sidebar.""") - self.report({"ERROR"}, str(e)) - - set_model_list('installed_models', Generator.shared().hf_list_installed_models().result()) - + global is_downloading + is_downloading = True + f = Generator.shared().convert_original_stable_diffusion_to_diffusers(self.filepath, ModelConfig[self.model_config], self.prefer_fp16_variant) + def on_progress(_, response: DownloadStatus): + bpy.context.preferences.addons[__package__].preferences.download_file = response.file + bpy.context.preferences.addons[__package__].preferences.download_progress = int((response.index / response.total) * 100) + def on_done(future): + global is_downloading + is_downloading = False + fetch_installed_models() + def on_exception(_, exception): + self.report({"ERROR"}, str(exception)) + raise exception + f.add_response_callback(on_progress) + f.add_done_callback(on_done) + f.add_exception_callback(on_exception) return {"FINISHED"} class Model(bpy.types.PropertyGroup): @@ -87,9 +95,9 @@ def draw_item(self, context, layout, data, item, icon, active_data, active_propn split.label(text=item.model_type.replace('_', ' ').title()) install_model = layout.operator(InstallModel.bl_idname, text="", icon="FILE_FOLDER" if is_installed else "IMPORT") install_model.model = item.model - install_model.prefer_fp16_revision = data.prefer_fp16_revision + install_model.prefer_fp16_variant = data.prefer_fp16_variant + install_model.resume_download = data.resume_download -@staticmethod def set_model_list(model_list: str, models: list): getattr(bpy.context.preferences.addons[__package__].preferences, model_list).clear() for model in models: @@ -103,6 +111,58 @@ def set_model_list(model_list: str, models: list): except: pass +class checkpoint_lookup: + _checkpoints = {} + + @classmethod + def get(cls, item): + return cls._checkpoints.get(item, item) + +class model_lookup: + _models = {} + + @classmethod + def get(cls, item): + return cls._models.get(item, item) + +def fetch_installed_models(blocking=True): + def on_done(future): + model_list = future.result() + + model_lookup._models = { os.path.basename(model.id).replace('models--', '').replace('--', '/'): model for model in model_list } + + pref = bpy.context.preferences.addons[__package__].preferences + checkpoint_links = ((link.path, ModelConfig[link.model_config]) for link in pref.linked_checkpoints) + checkpoints = {} + for path, config in checkpoint_links: + if not os.path.exists(path): + continue + if os.path.isfile(path): + checkpoints[os.path.basename(path)] = (path, config) + continue + for name in os.listdir(path): + if os.path.splitext(name)[1] not in import_extensions: + continue + if name in checkpoints: + # file linked config takes precedence over folder linked config + continue + checkpoints[name] = (os.path.join(path, name), config) + checkpoint_lookup._checkpoints.clear() + for path, config in checkpoints.values(): + model = HubModel(path, "", [], -1, -1, ModelType.from_config(config)) + model_list.append(model) + checkpoint_lookup._checkpoints[os.path.basename(path)] = Checkpoint(path, config) + model_lookup._models[os.path.basename(path)] = model + + 
set_model_list('installed_models', model_list) + + future = Generator.shared().hf_list_installed_models() + if blocking: + on_done(future) + else: + future.add_done_callback(on_done) + + class ModelSearch(bpy.types.Operator): bl_idname = "dream_textures.model_search" bl_label = "Search" @@ -120,22 +180,31 @@ class InstallModel(bpy.types.Operator): bl_options = {"REGISTER", "INTERNAL"} model: StringProperty(name="Model ID") - prefer_fp16_revision: bpy.props.BoolProperty(name="", default=True) + prefer_fp16_variant: bpy.props.BoolProperty(name="", default=True) + resume_download: bpy.props.BoolProperty(name="", default=True) def execute(self, context): if os.path.exists(self.model): - webbrowser.open(f"file://{self.model}") + if os.path.isfile(self.model): + webbrowser.open(f"file://{os.path.dirname(self.model)}") + else: + webbrowser.open(f"file://{self.model}") else: global is_downloading is_downloading = True - f = Generator.shared().hf_snapshot_download(self.model, bpy.context.preferences.addons[__package__].preferences.hf_token, "fp16" if self.prefer_fp16_revision else None) + f = Generator.shared().hf_snapshot_download( + self.model, + bpy.context.preferences.addons[__package__].preferences.hf_token, + "fp16" if self.prefer_fp16_variant else None, + self.resume_download + ) def on_progress(_, response: DownloadStatus): bpy.context.preferences.addons[__package__].preferences.download_file = response.file bpy.context.preferences.addons[__package__].preferences.download_progress = int((response.index / response.total) * 100) def on_done(future): global is_downloading is_downloading = False - set_model_list('installed_models', Generator.shared().hf_list_installed_models().result()) + fetch_installed_models() def on_exception(_, exception): self.report({"ERROR"}, str(exception)) raise exception @@ -166,6 +235,89 @@ def _template_model_download_progress(context, layout): progress_col.enabled = False return is_downloading +class CheckpointGroup(bpy.types.PropertyGroup): + bl_label = "Model" + bl_idname = "dream_textures.checkpoint" + + path: bpy.props.StringProperty(name="Checkpoint") + model_config: bpy.props.EnumProperty( + name="Model Config", + items=_model_config_options + ) + +class LinkCheckpoint(bpy.types.Operator, ImportHelper): + bl_idname = "dream_textures.link_checkpoint" + bl_label = "Link Checkpoint File or Folder" + filename_ext = ".ckpt" + files: CollectionProperty( + type=bpy.types.OperatorFileListElement, + options={'HIDDEN', 'SKIP_SAVE'} + ) + filter_glob: bpy.props.StringProperty( + default=import_extensions_glob, + options={'HIDDEN'}, + maxlen=255, + ) + model_config: bpy.props.EnumProperty( + name="Model Config", + items=_model_config_options + ) + + def invoke(self, context, _event): + if os.path.isfile(self.filepath): + # Reset to a directory, otherwise the filename remains populated and can cause issues to select a directory if gone unnoticed. 
+ self.filepath = os.path.dirname(self.filepath) + os.path.sep + return super().invoke(context, _event) + + def execute(self, context): + pref = context.preferences.addons[__package__].preferences + for file in self.files: + path = self.filepath + if file.name != "": + path = os.path.join(os.path.dirname(path), file.name) + + if not os.path.exists(path): + self.report({"ERROR"}, f"{path} does not exist") + continue + if os.path.isfile(path) and os.path.splitext(path)[1] not in import_extensions: + self.report({"ERROR"}, f"{os.path.basename(path)} is not a checkpoint") + continue + + link = next((link for link in pref.linked_checkpoints if link.path == path), None) + if link is None: + link = pref.linked_checkpoints.add() + link.path = path + link.model_config = self.model_config + + fetch_installed_models() + + return {"FINISHED"} + +class UnlinkCheckpoint(bpy.types.Operator): + bl_idname = "dream_textures.unlink_checkpoint" + bl_label = "Unlink Checkpoint File" + + path: bpy.props.StringProperty() + def execute(self, context): + pref = context.preferences.addons[__package__].preferences + index = next((i for i, link in enumerate(pref.linked_checkpoints) if link.path == self.path), -1) + if index != -1: + pref.linked_checkpoints.remove(index) + + fetch_installed_models() + + return {"FINISHED"} + +class PREFERENCES_UL_CheckpointList(bpy.types.UIList): + def draw_item(self, context, layout, data, item, icon, active_data, active_propname): + split = layout.split(factor=0.75) + split.label(text=item.path) + split.label(text=ModelConfig[item.model_config].value) + install_model = layout.operator(InstallModel.bl_idname, text="", icon="FILE_FOLDER") + install_model.model = item.path + unlink = layout.operator(UnlinkCheckpoint.bl_idname, text="", icon="TRASH") + unlink.path = item.path + class StableDiffusionPreferences(bpy.types.AddonPreferences): bl_idname = __package__ @@ -175,20 +327,23 @@ class StableDiffusionPreferences(bpy.types.AddonPreferences): model_results: CollectionProperty(type=Model) active_model_result: bpy.props.IntProperty(name="Active Model", default=0) hf_token: StringProperty(name="HuggingFace Token") - prefer_fp16_revision: bpy.props.BoolProperty(name="Prefer Half Precision Weights", description="Download fp16 weights if available for smaller file size. If you run with 'Half Precision' disabled, you should not use this setting", default=True) + prefer_fp16_variant: bpy.props.BoolProperty(name="Prefer Half Precision Weights", description="Download fp16 weights if available for smaller file size. 
If you run with 'Half Precision' disabled, you should not use this setting", default=True) + resume_download: bpy.props.BoolProperty(name="Resume Incomplete Download", description="Continue an in-progress download in case Blender was closed or the connection was interrupted, otherwise incomplete files will be entirely redownloaded", default=True) installed_models: CollectionProperty(type=Model) active_installed_model: bpy.props.IntProperty(name="Active Model", default=0) + linked_checkpoints: CollectionProperty(type=CheckpointGroup) + active_linked_checkpoint: bpy.props.IntProperty(name="Active Checkpoint", default=0) + download_file: bpy.props.StringProperty(name="") download_progress: bpy.props.IntProperty(name="", min=0, max=100, subtype="PERCENTAGE", update=_update_ui) + model_cache = [] + @staticmethod def register(): - if Pipeline.local_available(): - def on_done(future): - set_model_list('installed_models', future.result()) - Generator.shared().hf_list_installed_models().add_done_callback(on_done) + fetch_installed_models(False) def draw(self, context): layout = self.layout @@ -200,62 +355,57 @@ def draw(self, context): has_dependencies = len(os.listdir(absolute_path(".python_dependencies"))) > 2 if has_dependencies: - has_local = Pipeline.local_available() - - if has_local: - if not _template_model_download_progress(context, layout): - conflicting_packages = ["wandb", "k_diffusion"] - conflicting_package_specs = {} + if not _template_model_download_progress(context, layout): + conflicting_packages = ["wandb", "k_diffusion"] + conflicting_package_specs = {} + for package in conflicting_packages: + spec = importlib.util.find_spec(package) + if spec is not None: + conflicting_package_specs[package] = spec + if len(conflicting_package_specs) > 0: + conflicts_box = layout.box() + conflicts_box.label(text="WARNING", icon="ERROR") + conflicts_box.label(text=f"The following packages conflict with Dream Textures: {', '.join(conflicting_packages)}") + conflicts_box.label(text=f"You may need to run Blender as an administrator to remove these packages") + conflicts_box.operator(UninstallDependencies.bl_idname, text="Uninstall Conflicting Packages", icon="CANCEL").conflicts = ' '.join(conflicting_packages) + conflicts_box.label(text=f"If the button above fails, you can remove the following folders manually:") for package in conflicting_packages: - spec = importlib.util.find_spec(package) - if spec is not None: - conflicting_package_specs[package] = spec - if len(conflicting_package_specs) > 0: - conflicts_box = layout.box() - conflicts_box.label(text="WARNING", icon="ERROR") - conflicts_box.label(text=f"The following packages conflict with Dream Textures: {', '.join(conflicting_packages)}") - conflicts_box.label(text=f"You may need to run Blender as an administrator to remove these packages") - conflicts_box.operator(UninstallDependencies.bl_idname, text="Uninstall Conflicting Packages", icon="CANCEL").conflicts = ' '.join(conflicting_packages) - conflicts_box.label(text=f"If the button above fails, you can remove the following folders manually:") - for package in conflicting_packages: - if package not in conflicting_package_specs: - continue - location = conflicting_package_specs[package].submodule_search_locations[0] - conflicts_box.operator(OpenURL.bl_idname, text=f"Open '{location}'").url = f"file://{location}" - - if not weights_installed: - default_weights_box = layout.box() - default_weights_box.label(text="You need to download at least one model.") - install_model = 
default_weights_box.operator(InstallModel.bl_idname, text="Download Stable Diffusion v2.1 (Recommended)", icon="IMPORT") - install_model.model = "stabilityai/stable-diffusion-2-1" - install_model.prefer_fp16_revision = self.prefer_fp16_revision - - search_box = layout.box() - search_box.label(text="Find Models", icon="SETTINGS") - search_box.label(text="Search Hugging Face Hub for more compatible models.") - - search_box.prop(self, "model_query", text="", icon="VIEWZOOM") - - if len(self.model_results) > 0: - search_box.template_list(PREFERENCES_UL_ModelList.__name__, "dream_textures_model_results", self, "model_results", self, "active_model_result") - - search_box.label(text="Some models require authentication. Provide a token to download gated models.") - - auth_row = search_box.row() - auth_row.prop(self, "hf_token", text="Token") - auth_row.operator(OpenURL.bl_idname, text="Get Your Token", icon="KEYINGSET").url = "https://huggingface.co/settings/tokens" - - search_box.prop(self, "prefer_fp16_revision") - - layout.template_list(PREFERENCES_UL_ModelList.__name__, "dream_textures_installed_models", self, "installed_models", self, "active_installed_model") - layout.operator(ImportWeights.bl_idname, icon='IMPORT') - - dream_studio_box = layout.box() - dream_studio_box.label(text=f"DreamStudio{' (Optional)' if has_local else ''}", icon="HIDE_OFF") - dream_studio_box.label(text=f"Link to your DreamStudio account to run in the cloud{' instead of locally.' if has_local else '.'}") - key_row = dream_studio_box.row() - key_row.prop(self, "dream_studio_key", text="Key") - key_row.operator(OpenURL.bl_idname, text="Find Your Key", icon="KEYINGSET").url = "https://beta.dreamstudio.ai/membership?tab=apiKeys" + if package not in conflicting_package_specs: + continue + location = conflicting_package_specs[package].submodule_search_locations[0] + conflicts_box.operator(OpenURL.bl_idname, text=f"Open '{location}'").url = f"file://{location}" + + if not weights_installed: + default_weights_box = layout.box() + default_weights_box.label(text="You need to download at least one model.") + install_model = default_weights_box.operator(InstallModel.bl_idname, text="Download Stable Diffusion v2.1 (Recommended)", icon="IMPORT") + install_model.model = "stabilityai/stable-diffusion-2-1" + install_model.prefer_fp16_variant = self.prefer_fp16_variant + install_model.resume_download = self.resume_download + + search_box = layout.box() + search_box.label(text="Find Models", icon="SETTINGS") + search_box.label(text="Search Hugging Face Hub for more compatible models.") + + search_box.prop(self, "model_query", text="", icon="VIEWZOOM") + + if len(self.model_results) > 0: + search_box.template_list(PREFERENCES_UL_ModelList.__name__, "dream_textures_model_results", self, "model_results", self, "active_model_result") + + search_box.label(text="Some models require authentication. 
Provide a token to download gated models.") + + auth_row = search_box.row() + auth_row.prop(self, "hf_token", text="Token") + auth_row.operator(OpenURL.bl_idname, text="Get Your Token", icon="KEYINGSET").url = "https://huggingface.co/settings/tokens" + + search_box.prop(self, "prefer_fp16_variant") + search_box.prop(self, "resume_download") + + layout.template_list(PREFERENCES_UL_ModelList.__name__, "dream_textures_installed_models", self, "installed_models", self, "active_installed_model") + import_weights = layout.operator(ImportWeights.bl_idname, icon='IMPORT') + import_weights.prefer_fp16_variant = self.prefer_fp16_variant + layout.template_list(PREFERENCES_UL_CheckpointList.__name__, "dream_textures_linked_checkpoints", self, "linked_checkpoints", self, "active_linked_checkpoint") + layout.operator(LinkCheckpoint.bl_idname, icon='FOLDER_REDIRECT') if weights_installed or len(self.dream_studio_key) > 0: complete_box = layout.box() diff --git a/property_groups/control_net.py b/property_groups/control_net.py index db47b2dd..b8e7eb2d 100644 --- a/property_groups/control_net.py +++ b/property_groups/control_net.py @@ -1,13 +1,10 @@ import bpy from bpy.props import FloatProperty, EnumProperty, PointerProperty -from ..generator_process.actions.huggingface_hub import ModelType -from ..preferences import StableDiffusionPreferences - def control_net_options(self, context): return [ - (model.model_base, model.model_base.replace('models--', '').replace('--', '/'), '') for model in context.preferences.addons[StableDiffusionPreferences.bl_idname].preferences.installed_models - if model.model_type == ModelType.CONTROL_NET.name + None if model is None else (model.id, model.name, model.description) + for model in context.scene.dream_textures_prompt.get_backend().list_controlnet_models(context) ] class ControlNet(bpy.types.PropertyGroup): diff --git a/property_groups/dream_prompt.py b/property_groups/dream_prompt.py index 60166d1f..c184c21b 100644 --- a/property_groups/dream_prompt.py +++ b/property_groups/dream_prompt.py @@ -5,18 +5,22 @@ from typing import _AnnotatedAlias from ..generator_process.actions.detect_seamless import SeamlessAxes -from ..generator_process.actions.prompt_to_image import Optimizations, Scheduler, StepPreviewMode, Pipeline -from ..generator_process.actions.huggingface_hub import ModelType +from ..generator_process.actions.prompt_to_image import Optimizations, Scheduler, StepPreviewMode from ..prompt_engineering import * from ..preferences import StableDiffusionPreferences -from .dream_prompt_validation import validate from .control_net import ControlNet import numpy as np from functools import reduce -scheduler_options = [(scheduler.value, scheduler.value, '') for scheduler in Scheduler] +from .. 
import api + +def scheduler_options(self, context): + return [ + (scheduler, scheduler, '') + for scheduler in self.get_backend().list_schedulers(context) + ] step_preview_mode_options = [(mode.value, mode.value, '') for mode in StepPreviewMode] @@ -39,7 +43,7 @@ ] def init_image_actions_filtered(self, context): - available = Pipeline[self.pipeline].init_img_actions() + available = ['modify', 'inpaint', 'outpaint'] return list(filter(lambda x: x[0] in available, init_image_actions)) inpaint_mask_sources = [ @@ -48,7 +52,7 @@ def init_image_actions_filtered(self, context): ] def inpaint_mask_sources_filtered(self, context): - available = Pipeline[self.pipeline].inpaint_mask_sources() + available = ['alpha', 'prompt'] return list(filter(lambda x: x[0] in available, inpaint_mask_sources)) seamless_axes = [ @@ -69,48 +73,20 @@ def modify_action_source_type(self, context): ] def model_options(self, context): - match Pipeline[self.pipeline]: - case Pipeline.STABLE_DIFFUSION: - def model_case(model, i): - return ( - model.model_base, - model.model_base.replace('models--', '').replace('--', '/'), - ModelType[model.model_type].name, - i - ) - models = {} - for i, model in enumerate(context.preferences.addons[StableDiffusionPreferences.bl_idname].preferences.installed_models): - if model.model_type in {ModelType.CONTROL_NET.name, ModelType.UNKNOWN.name}: - continue - if model.model_type not in models: - models[model.model_type] = [model_case(model, i)] - else: - models[model.model_type].append(model_case(model, i)) - return reduce( - lambda a, b: a + [None] + sorted(b, key=lambda m: m[0]), - [ - models[group] - for group in sorted(models.keys()) - ], - [] - ) - case Pipeline.STABILITY_SDK: - return [ - ("stable-diffusion-v1", "Stable Diffusion v1.4", ModelType.PROMPT_TO_IMAGE.name), - ("stable-diffusion-v1-5", "Stable Diffusion v1.5", ModelType.PROMPT_TO_IMAGE.name), - ("stable-diffusion-512-v2-0", "Stable Diffusion v2.0", ModelType.PROMPT_TO_IMAGE.name), - ("stable-diffusion-768-v2-0", "Stable Diffusion v2.0-768", ModelType.PROMPT_TO_IMAGE.name), - ("stable-diffusion-512-v2-1", "Stable Diffusion v2.1", ModelType.PROMPT_TO_IMAGE.name), - ("stable-diffusion-768-v2-1", "Stable Diffusion v2.1-768", ModelType.PROMPT_TO_IMAGE.name), - None, - ("stable-inpainting-v1-0", "Stable Inpainting v1.0", ModelType.INPAINTING.name), - ("stable-inpainting-512-v2-0", "Stable Inpainting v2.0", ModelType.INPAINTING.name), - ] - -def pipeline_options(self, context): return [ - (Pipeline.STABLE_DIFFUSION.name, 'Stable Diffusion', 'Stable Diffusion on your own hardware', 1), - (Pipeline.STABILITY_SDK.name, 'DreamStudio', 'Cloud compute via DreamStudio', 2), + None if model is None else (model.id, model.name, model.description) + for model in self.get_backend().list_models(context) + ] + +def _model_update(self, context): + options = [m for m in model_options(self, context) if m is not None] + if self.model == '' and len(options) > 0: + self.model = options[0] + +def backend_options(self, context): + return [ + (backend._id(), backend.name if hasattr(backend, "name") else backend.__name__, backend.description if hasattr(backend, "description") else "") + for backend in api.Backend._list_backends() ] def seed_clamp(self, ctx): @@ -123,8 +99,8 @@ def seed_clamp(self, ctx): pass # will get hashed once generated attributes = { - "pipeline": EnumProperty(name="Pipeline", items=pipeline_options, default=1 if Pipeline.local_available() else 2, description="Specify which model and target should be used."), - "model": 
EnumProperty(name="Model", items=model_options, description="Specify which model to use for inference"), + "backend": EnumProperty(name="Backend", items=backend_options, default=1, description="Specify which generation backend to use"), + "model": EnumProperty(name="Model", items=model_options, description="Specify which model to use for inference", update=_model_update), "control_nets": CollectionProperty(type=ControlNet), "active_control_net": IntProperty(name="Active ControlNet"), @@ -174,58 +150,6 @@ def seed_clamp(self, ctx): "hash": StringProperty(name="Image Hash"), } -default_optimizations = Optimizations() -def optimization(optim, property=None, **kwargs): - if "name" not in kwargs: - kwargs["name"] = optim.replace('_', ' ').title() - if "default" not in kwargs: - kwargs["default"] = getattr(default_optimizations, optim) - if property is None: - match kwargs["default"]: - case bool(): - property = BoolProperty - case int(): - property = IntProperty - case float(): - property = FloatProperty - case _: - raise TypeError(f"{optim} cannot infer optimization property from {type(kwargs['default'])}") - attributes[f"optimizations_{optim}"] = property(**kwargs) - -optimization("attention_slicing", description="Computes attention in several steps. Saves some memory in exchange for a small speed decrease") -optimization("attention_slice_size_src", property=EnumProperty, items=( - ("auto", "Automatic", "Computes attention in two steps", 1), - ("manual", "Manual", "Computes attention in `attention_head_dim // size` steps. A smaller `size` saves more memory.\n" - "`attention_head_dim` must be a multiple of `size`, otherwise the image won't generate properly.\n" - "`attention_head_dim` can be found within the model snapshot's unet/config.json file", 2), -), default=1, name="Attention Slice Size") -optimization("attention_slice_size", default=1, min=1) -optimization("cudnn_benchmark", name="cuDNN Benchmark", description="Allows cuDNN to benchmark multiple convolution algorithms and select the fastest") -optimization("tf32", name="TF32", description="Utilizes tensor cores on Ampere (RTX 30xx) or newer GPUs for matrix multiplications.\nHas no effect if half precision is enabled") -optimization("half_precision", description="Reduces memory usage and increases speed in exchange for a slight loss in image quality.\nHas no effect if CPU only is enabled or using a GTX 16xx GPU") -optimization("cpu_offload", property=EnumProperty, items=( - ("off", "Off", "", 0), - ("model", "Model", "Some memory savings with minimal speed penalty", 1), - ("submodule", "Submodule", "Better memory savings with large speed penalty", 2) -), default=0, name="CPU Offload", description="Dynamically moves models in and out of device memory for reduced memory usage with reduced speed") -optimization("channels_last_memory_format", description="An alternative way of ordering NCHW tensors that may be faster or slower depending on the device") -optimization("sdp_attention", name="SDP Attention", - description="Scaled dot product attention requires less memory and often comes with a good speed increase.\n" - "Prompt recall may not produce the exact same image, but usually only minor noise differences.\n" - "Overrides attention slicing") -optimization("batch_size", default=1, min=1, description="Improves speed when using iterations or upscaling in exchange for higher memory usage.\nHighly recommended to use with VAE slicing enabled") -optimization("vae_slicing", name="VAE Slicing", description="Reduces memory usage of batched VAE 
decoding. Has no effect if batch size is 1.\nMay have a small performance improvement with large batches") -optimization("vae_tiling", property=EnumProperty, items=( - ("off", "Off", "", 0), - ("half", "Half", "Uses tiles of half the selected model's default size. Likely to cause noticeably inaccurate colors", 1), - ("full", "Full", "Uses tiles of the selected model's default size, intended for use where image size is manually set higher. May cause slightly inaccurate colors", 2), - ("manual", "Manual", "", 3) -), default=0, name="VAE Tiling", description="Decodes generated images in tiled regions to reduce memory usage in exchange for longer decode time and less accurate colors.\nCan allow for generating larger images that would otherwise run out of memory on the final step") -optimization("vae_tile_size", min=1, name="VAE Tile Size", description="Width and height measurement of tiles. Smaller sizes are more likely to cause inaccurate colors and other undesired artifacts") -optimization("vae_tile_blend", min=0, name="VAE Tile Blend", description="Minimum amount of how much each edge of a tile will intersect its adjacent tile") -optimization("cfg_end", name="CFG End", min=0, max=1, description="The percentage of steps to complete before disabling classifier-free guidance") -optimization("cpu_only", name="CPU Only", description="Disables GPU acceleration and is extremely slow") - def map_structure_token_items(value): return (value[0], value[1], '') for structure in prompt_structures: @@ -280,48 +204,96 @@ def get_seed(self): h = ~h return (h & 0xFFFFFFFF) ^ (h >> 32) # 64 bit hash down to 32 bits -def get_optimizations(self: DreamPrompt): - optimizations = Optimizations() - for prop in dir(self): - split_name = prop.replace('optimizations_', '') - if prop.startswith('optimizations_') and hasattr(optimizations, split_name): - setattr(optimizations, split_name, getattr(self, prop)) - if self.optimizations_attention_slice_size_src == 'auto': - optimizations.attention_slice_size = 'auto' - return optimizations - -def generate_args(self): - args = { key: getattr(self, key) for key in DreamPrompt.__annotations__ } - if not args['use_negative_prompt']: - args['negative_prompt'] = None - args['prompt'] = self.generate_prompt() - args['seed'] = self.get_seed() - args['optimizations'] = self.get_optimizations() - args['scheduler'] = Scheduler(args['scheduler']) - args['step_preview_mode'] = StepPreviewMode(args['step_preview_mode']) - args['pipeline'] = Pipeline[args['pipeline']] - args['outpaint_origin'] = (args['outpaint_origin'][0], args['outpaint_origin'][1]) - args['key'] = bpy.context.preferences.addons[StableDiffusionPreferences.bl_idname].preferences.dream_studio_key - args['seamless_axes'] = SeamlessAxes(args['seamless_axes']) - args['width'] = args['width'] if args['use_size'] else None - args['height'] = args['height'] if args['use_size'] else None - - args['control_net'] = [net.control_net for net in args['control_nets']] - args['controlnet_conditioning_scale'] = [net.conditioning_scale for net in args['control_nets']] - args['control'] = [ - np.flipud( - np.array(net.control_image.pixels) - .reshape((net.control_image.size[1], net.control_image.size[0], net.control_image.channels)) - ) - for net in args['control_nets'] - if net.control_image is not None - ] - del args['control_nets'] - return args +def generate_args(self, context, iteration=0, init_image=None, control_images=None) -> api.GenerationArguments: + is_file_batch = self.prompt_structure == file_batch_structure.id + 
file_batch_lines = [] + file_batch_lines_negative = [] + if is_file_batch: + file_batch_lines = [line.body for line in context.scene.dream_textures_prompt_file.lines if len(line.body.strip()) > 0] + file_batch_lines_negative = [""] * len(file_batch_lines) + + backend: api.Backend = self.get_backend() + batch_size = backend.get_batch_size(context) + iteration_limit = len(file_batch_lines) if is_file_batch else self.iterations + batch_size = min(batch_size, iteration_limit-iteration) + + task: api.Task = api.PromptToImage() + if self.use_init_img: + match self.init_img_action: + case 'modify': + match self.modify_action_source_type: + case 'color': + task = api.ImageToImage( + image=init_image, + strength=self.strength, + fit=self.fit + ) + case 'depth_generated': + task = api.DepthToImage( + depth=None, + image=init_image, + strength=self.strength + ) + case 'depth_map': + task = api.DepthToImage( + depth=np.array(context.scene.init_depth.pixels) + .astype(np.float32) + .reshape((context.scene.init_depth.size[1], context.scene.init_depth.size[0], context.scene.init_depth.channels)), + image=init_image, + strength=self.strength + ) + case 'depth': + task = api.DepthToImage( + image=None, + depth=np.flipud(init_image.astype(np.float32) / 255.), + strength=self.strength + ) + case 'inpaint': + task = api.Inpaint( + image=init_image, + strength=self.strength, + fit=self.fit, + mask_source=api.Inpaint.MaskSource.ALPHA if self.inpaint_mask_src == 'alpha' else api.Inpaint.MaskSource.PROMPT, + mask_prompt=self.text_mask, + confidence=self.text_mask_confidence + ) + case 'outpaint': + task = api.Outpaint( + image=init_image, + origin=(self.outpaint_origin[0], self.outpaint_origin[1]) + ) + + return api.GenerationArguments( + task=task, + model=next(model for model in self.get_backend().list_models(context) if model is not None and model.id == self.model), + prompt=api.Prompt( + file_batch_lines[iteration:iteration+batch_size] if is_file_batch else [self.generate_prompt()] * batch_size, + file_batch_lines_negative[iteration:iteration+batch_size] if is_file_batch else ([self.negative_prompt] * batch_size if self.use_negative_prompt else None) + ), + size=(self.width, self.height) if self.use_size else None, + seed=self.get_seed(), + steps=self.steps, + guidance_scale=self.cfg_scale, + scheduler=self.scheduler, + seamless_axes=SeamlessAxes(self.seamless_axes), + step_preview_mode=StepPreviewMode(self.step_preview_mode), + iterations=self.iterations, + control_nets=[ + api.models.control_net.ControlNet( + net.control_net, + control_images[i] if control_images is not None else None, + net.conditioning_scale + ) + for i, net in enumerate(self.control_nets) + if net.control_image is not None + ] + ) + +def get_backend(self) -> api.Backend: + return getattr(self, api.Backend._lookup(self.backend)._attribute()) DreamPrompt.generate_prompt = generate_prompt DreamPrompt.get_prompt_subject = get_prompt_subject DreamPrompt.get_seed = get_seed -DreamPrompt.get_optimizations = get_optimizations DreamPrompt.generate_args = generate_args -DreamPrompt.validate = validate \ No newline at end of file +DreamPrompt.get_backend = get_backend \ No newline at end of file diff --git a/property_groups/dream_prompt_validation.py b/property_groups/dream_prompt_validation.py deleted file mode 100644 index 15a5b633..00000000 --- a/property_groups/dream_prompt_validation.py +++ /dev/null @@ -1,91 +0,0 @@ -from ..preferences import StableDiffusionPreferences, _template_model_download_progress, InstallModel -from 
..generator_process.models import Pipeline, FixItError -from ..generator_process.actions.huggingface_hub import ModelType -from ..preferences import OpenURL - -def validate(self, context, task: ModelType | None = None) -> bool: - if task is None: - if self.use_init_img: - match self.init_img_action: - case 'modify': - match self.modify_action_source_type: - case 'color': - task = ModelType.PROMPT_TO_IMAGE - case 'depth_generated' | 'depth_map' | 'depth': - task = ModelType.DEPTH - case 'inpaint' | 'outpaint': - task = ModelType.INPAINTING - if task is None: - task = ModelType.PROMPT_TO_IMAGE - - # Check if the pipeline supports the task. - pipeline = Pipeline[self.pipeline] - match task: - case ModelType.DEPTH: - if not pipeline.depth(): - raise FixItError( - f"""The selected pipeline does not support {task.name.replace('_', ' ').lower()} tasks. -Select a different pipeline below.""", - lambda _, layout: layout.prop(self, "pipeline") - ) - - # Pipeline-specific checks - match pipeline: - case Pipeline.STABLE_DIFFUSION: - if not Pipeline.local_available(): - raise FixItError( - "Local generation is not available for the variant of the add-on you have installed. Choose a different Pipeline such as 'DreamStudio'", - lambda _, layout: layout.prop(self, "pipeline") - ) - - installed_models = context.preferences.addons[StableDiffusionPreferences.bl_idname].preferences.installed_models - model = next((m for m in installed_models if m.model_base == self.model), None) - if model is None: - raise FixItError("No model selected.", lambda _, layout: layout.prop(self, "model")) - else: - if model.model_type != task.name: - def fix_model(context, layout): - layout.prop(self, "model") - if not any(m.model_type == task.name for m in installed_models): - if not _template_model_download_progress(context, layout): - layout.label(text="You do not have any compatible models downloaded:") - install_model = layout.operator(InstallModel.bl_idname, text=f"Download {task.recommended_model()} (Recommended)", icon="IMPORT") - install_model.model = task.recommended_model() - install_model.prefer_fp16_revision = context.preferences.addons[StableDiffusionPreferences.bl_idname].preferences.prefer_fp16_revision - raise FixItError( - f"""Incorrect model type selected for {task.name.replace('_', ' ').lower()} tasks. -The selected model is for {model.model_type.replace('_', ' ').lower()}. -Select a different model below.""", - fix_model - ) - case Pipeline.STABILITY_SDK: - if len(context.preferences.addons[StableDiffusionPreferences.bl_idname].preferences.dream_studio_key) <= 0: - raise FixItError( - f"""No DreamStudio key entered. 
-Enter your API key below{', or change the pipeline' if Pipeline.local_available() else ''}.""", - lambda ctx, layout: layout.prop(ctx.preferences.addons[StableDiffusionPreferences.bl_idname].preferences, "dream_studio_key") - ) - - init_image = None - if self.use_init_img: - match self.init_img_src: - case 'file': - init_image = context.scene.init_img - case 'open_editor': - for area in context.screen.areas: - if area.type == 'IMAGE_EDITOR': - if area.spaces.active.image is not None: - init_image = area.spaces.active.image - if init_image is not None and init_image.type == 'RENDER_RESULT': - def fix_init_img(ctx, layout): - layout.prop(self, "init_img_src", expand=True) - if self.init_img_src == 'file': - layout.template_ID(context.scene, "init_img", open="image.open") - layout.label(text="Or, enable the render pass to generate after each render.") - layout.operator(OpenURL.bl_idname, text="Learn More", icon="QUESTION").url = "https://github.com/carson-katri/dream-textures/blob/main/docs/RENDER_PASS.md" - raise FixItError("""'Render Result' cannot be used as a source image. -Save the image then open the file to use it as a source image.""", - fix_init_img - ) - - return True \ No newline at end of file diff --git a/property_groups/seamless_result.py b/property_groups/seamless_result.py index f93fc258..f1ee952f 100644 --- a/property_groups/seamless_result.py +++ b/property_groups/seamless_result.py @@ -4,7 +4,7 @@ from ..generator_process.actions.detect_seamless import SeamlessAxes from ..generator_process import Generator from ..preferences import StableDiffusionPreferences - +from ..api.models import GenerationArguments def update(self, context): if hasattr(context.area, "regions"): @@ -52,9 +52,16 @@ def result(future): self.result = future.result().text Generator.shared().detect_seamless(pixels).add_done_callback(result) - def update_args(self, args: dict[str, any], as_id=False): - if args['seamless_axes'] == SeamlessAxes.AUTO and self.result != 'Processing': - if as_id: - args['seamless_axes'] = SeamlessAxes(self.result).id - else: - args['seamless_axes'] = SeamlessAxes(self.result) + def update_args(self, args, as_id=False): + if isinstance(args, GenerationArguments): + if args.seamless_axes == SeamlessAxes.AUTO and self.result != 'Processing': + if as_id: + args.seamless_axes = SeamlessAxes(self.result).id + else: + args.seamless_axes = SeamlessAxes(self.result) + else: + if args['seamless_axes'] == SeamlessAxes.AUTO and self.result != 'Processing': + if as_id: + args['seamless_axes'] = SeamlessAxes(self.result).id + else: + args['seamless_axes'] = SeamlessAxes(self.result) diff --git a/render_pass.py b/render_pass.py index b016d9dd..a089cb9a 100644 --- a/render_pass.py +++ b/render_pass.py @@ -2,9 +2,10 @@ import cycles import numpy as np import os -from .generator_process.actions.prompt_to_image import ImageGenerationResult -from .generator_process import Generator +from typing import List import threading +from .generator_process import Generator +from . 
import api pass_inputs = [ ('color', 'Color', 'Provide the scene color as input'), @@ -112,43 +113,47 @@ def _render_dream_textures_pass(self, layer, size, scene, render_pass, render_re self.update_stats("Dream Textures", "Generating...") - generated_args = scene.dream_textures_render_properties_prompt.generate_args() - generated_args['width'] = size[0] - generated_args['height'] = size[1] + prompt = scene.dream_textures_render_properties_prompt match scene.dream_textures_render_properties_pass_inputs: case 'color': - f = gen.image_to_image( - image=np.flipud(combined_pixels.reshape((size[1], size[0], 4)) * 255).astype(np.uint8), - **generated_args + task = api.ImageToImage( + np.flipud(combined_pixels.reshape((size[1], size[0], 4)) * 255).astype(np.uint8), + prompt.strength, + True ) case 'depth': - f = gen.depth_to_image( - depth=depth, - image=None, - **generated_args + task = api.DepthToImage( + depth, + None, + prompt.strength ) case 'color_depth': - f = gen.depth_to_image( - depth=depth, - image=np.flipud(combined_pixels.reshape((size[1], size[0], 4)) * 255).astype(np.uint8), - **generated_args + task = api.DepthToImage( + depth, + np.flipud(combined_pixels.reshape((size[1], size[0], 4)) * 255).astype(np.uint8), + prompt.strength ) event = threading.Event() - def on_step(_, step: ImageGenerationResult): - if step.final: - return - self.update_progress(step.step / generated_args['steps']) - if len(step.images) > 0: - combined_pixels = step.images[0] - render_pass.rect.foreach_set(combined_pixels.reshape((size[0] * size[1], 4))) - self.update_result(render_result) # This does not seem to have an effect. - def on_done(future): + def step_callback(progress: List[api.GenerationResult]) -> bool: + self.update_progress(progress[-1].progress / progress[-1].total) + render_pass.rect.foreach_set(progress[-1].image.reshape((size[0] * size[1], 4))) + self.update_result(render_result) # This does not seem to have an effect. + return True + def callback(results: List[api.GenerationResult] | Exception): nonlocal combined_pixels - result = future.result(last_only=True) - combined_pixels = result.images[0] + combined_pixels = results[-1].image event.set() - f.add_response_callback(on_step) - f.add_done_callback(on_done) + + backend: api.Backend = prompt.get_backend() + generated_args: api.GenerationArguments = prompt.generate_args(bpy.context) + generated_args.task = task + generated_args.size = size + backend.generate( + generated_args, + step_callback=step_callback, + callback=callback + ) + event.wait() # Perform an inverse transform so when Blender applies its transform everything looks correct. 
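The render_pass.py hunk above is the clearest example of the calling convention used throughout these changes: build the arguments from the prompt, attach a task, and hand `step_callback`/`callback` to `backend.generate`. A minimal sketch of that pattern follows, with the render-engine bookkeeping trimmed away; the helper name `generate_blocking` is illustrative and not part of the add-on.

```python
# Condensed sketch of the calling pattern shown in the render_pass.py changes above.
import threading
from typing import List

from . import api


def generate_blocking(prompt, context, task):
    backend: api.Backend = prompt.get_backend()
    args: api.GenerationArguments = prompt.generate_args(context)
    args.task = task  # e.g. an api.ImageToImage(...) or api.DepthToImage(...) task

    done = threading.Event()
    final_image = None

    def step_callback(progress: List[api.GenerationResult]) -> bool:
        # progress/total drive the progress indicator; returning True keeps generating
        print(f"step {progress[-1].progress}/{progress[-1].total}")
        return True

    def callback(results: List[api.GenerationResult] | Exception):
        nonlocal final_image
        if not isinstance(results, Exception):
            final_image = results[-1].image
        done.set()

    backend.generate(args, step_callback=step_callback, callback=callback)
    done.wait()
    return final_image
```
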
diff --git a/requirements/linux-rocm.txt b/requirements/linux-rocm.txt index 4b838a4f..fdb81b33 100644 --- a/requirements/linux-rocm.txt +++ b/requirements/linux-rocm.txt @@ -1,4 +1,5 @@ -git+https://github.com/huggingface/diffusers@main#egg=diffusers +diffusers==0.20.2 +invisible-watermark transformers accelerate huggingface_hub @@ -9,6 +10,7 @@ torch>=2.0 # Original SD checkpoint conversion pytorch-lightning tensorboard +omegaconf scipy # LMSDiscreteScheduler diff --git a/requirements/mac-mps-cpu.txt b/requirements/mac-mps-cpu.txt index 010031eb..9ff3e201 100644 --- a/requirements/mac-mps-cpu.txt +++ b/requirements/mac-mps-cpu.txt @@ -1,6 +1,7 @@ -git+https://github.com/huggingface/diffusers@main#egg=diffusers +diffusers==0.20.2 +invisible-watermark transformers -accelerate==0.14.0 +accelerate huggingface_hub torch>=2.0 @@ -8,6 +9,7 @@ torch>=2.0 # Original SD checkpoint conversion pytorch-lightning tensorboard +omegaconf scipy # LMSDiscreteScheduler diff --git a/requirements/win-dml.txt b/requirements/win-dml.txt index b2830fdd..c5b4c36d 100644 --- a/requirements/win-dml.txt +++ b/requirements/win-dml.txt @@ -1,4 +1,5 @@ -git+https://github.com/huggingface/diffusers@main#egg=diffusers +diffusers==0.20.2 +invisible-watermark transformers accelerate huggingface_hub @@ -9,6 +10,7 @@ torch>=2.0 # Original SD checkpoint conversion pytorch-lightning tensorboard +omegaconf scipy # LMSDiscreteScheduler diff --git a/requirements/win-linux-cuda.txt b/requirements/win-linux-cuda.txt index 04dfd47c..4022fb6c 100644 --- a/requirements/win-linux-cuda.txt +++ b/requirements/win-linux-cuda.txt @@ -1,4 +1,5 @@ -git+https://github.com/huggingface/diffusers@main#egg=diffusers +diffusers==0.20.2 +invisible-watermark transformers accelerate huggingface_hub @@ -9,6 +10,7 @@ torch>=2.0 # Original SD checkpoint conversion pytorch-lightning tensorboard +omegaconf scipy # LMSDiscreteScheduler diff --git a/sd_configs/cldm_v15.yaml b/sd_configs/cldm_v15.yaml new file mode 100644 index 00000000..fde18255 --- /dev/null +++ b/sd_configs/cldm_v15.yaml @@ -0,0 +1,79 @@ +model: + target: cldm.cldm.ControlLDM + params: + linear_start: 0.00085 + linear_end: 0.0120 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: "jpg" + cond_stage_key: "txt" + control_key: "hint" + image_size: 64 + channels: 4 + cond_stage_trainable: false + conditioning_key: crossattn + monitor: val/loss_simple_ema + scale_factor: 0.18215 + use_ema: False + only_mid_control: False + + control_stage_config: + target: cldm.cldm.ControlNet + params: + image_size: 32 # unused + in_channels: 4 + hint_channels: 3 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + unet_config: + target: cldm.cldm.ControlledUnetModel + params: + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + first_stage_config: + target: ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: 
[] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + cond_stage_config: + target: ldm.modules.encoders.modules.FrozenCLIPEmbedder diff --git a/sd_configs/cldm_v21.yaml b/sd_configs/cldm_v21.yaml new file mode 100644 index 00000000..fc651936 --- /dev/null +++ b/sd_configs/cldm_v21.yaml @@ -0,0 +1,85 @@ +model: + target: cldm.cldm.ControlLDM + params: + linear_start: 0.00085 + linear_end: 0.0120 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: "jpg" + cond_stage_key: "txt" + control_key: "hint" + image_size: 64 + channels: 4 + cond_stage_trainable: false + conditioning_key: crossattn + monitor: val/loss_simple_ema + scale_factor: 0.18215 + use_ema: False + only_mid_control: False + + control_stage_config: + target: cldm.cldm.ControlNet + params: + use_checkpoint: True + image_size: 32 # unused + in_channels: 4 + hint_channels: 3 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_head_channels: 64 # need to fix for flash-attn + use_spatial_transformer: True + use_linear_in_transformer: True + transformer_depth: 1 + context_dim: 1024 + legacy: False + + unet_config: + target: cldm.cldm.ControlledUnetModel + params: + use_checkpoint: True + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_head_channels: 64 # need to fix for flash-attn + use_spatial_transformer: True + use_linear_in_transformer: True + transformer_depth: 1 + context_dim: 1024 + legacy: False + + first_stage_config: + target: ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + #attn_type: "vanilla-xformers" + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + cond_stage_config: + target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder + params: + freeze: True + layer: "penultimate" diff --git a/sd_configs/sd_xl_base.yaml b/sd_configs/sd_xl_base.yaml new file mode 100644 index 00000000..8aaf5b6e --- /dev/null +++ b/sd_configs/sd_xl_base.yaml @@ -0,0 +1,98 @@ +model: + target: sgm.models.diffusion.DiffusionEngine + params: + scale_factor: 0.13025 + disable_first_stage_autocast: True + + denoiser_config: + target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser + params: + num_idx: 1000 + + weighting_config: + target: sgm.modules.diffusionmodules.denoiser_weighting.EpsWeighting + scaling_config: + target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling + discretization_config: + target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization + + network_config: + target: sgm.modules.diffusionmodules.openaimodel.UNetModel + params: + adm_in_channels: 2816 + num_classes: sequential + use_checkpoint: True + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [4, 2] + num_res_blocks: 2 + channel_mult: [1, 2, 4] + num_head_channels: 64 + use_spatial_transformer: True + use_linear_in_transformer: True + transformer_depth: [1, 2, 10] # note: the first is unused (due to attn_res starting at 2) 32, 16, 8 --> 64, 32, 16 + context_dim: 2048 + spatial_transformer_attn_type: softmax-xformers + legacy: False + + conditioner_config: + target: sgm.modules.GeneralConditioner + params: + emb_models: + # crossattn cond + - is_trainable: False + 
input_key: txt + target: sgm.modules.encoders.modules.FrozenCLIPEmbedder + params: + layer: hidden + layer_idx: 11 + # crossattn and vector cond + - is_trainable: False + input_key: txt + target: sgm.modules.encoders.modules.FrozenOpenCLIPEmbedder2 + params: + arch: ViT-bigG-14 + version: laion2b_s39b_b160k + freeze: True + layer: penultimate + always_return_pooled: True + legacy: False + # vector cond + - is_trainable: False + input_key: original_size_as_tuple + target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND + params: + outdim: 256 # multiplied by two + # vector cond + - is_trainable: False + input_key: crop_coords_top_left + target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND + params: + outdim: 256 # multiplied by two + # vector cond + - is_trainable: False + input_key: target_size_as_tuple + target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND + params: + outdim: 256 # multiplied by two + + first_stage_config: + target: sgm.models.autoencoder.AutoencoderKLInferenceWrapper + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + attn_type: vanilla-xformers + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity diff --git a/sd_configs/sd_xl_refiner.yaml b/sd_configs/sd_xl_refiner.yaml new file mode 100644 index 00000000..cab5fe28 --- /dev/null +++ b/sd_configs/sd_xl_refiner.yaml @@ -0,0 +1,91 @@ +model: + target: sgm.models.diffusion.DiffusionEngine + params: + scale_factor: 0.13025 + disable_first_stage_autocast: True + + denoiser_config: + target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser + params: + num_idx: 1000 + + weighting_config: + target: sgm.modules.diffusionmodules.denoiser_weighting.EpsWeighting + scaling_config: + target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling + discretization_config: + target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization + + network_config: + target: sgm.modules.diffusionmodules.openaimodel.UNetModel + params: + adm_in_channels: 2560 + num_classes: sequential + use_checkpoint: True + in_channels: 4 + out_channels: 4 + model_channels: 384 + attention_resolutions: [4, 2] + num_res_blocks: 2 + channel_mult: [1, 2, 4, 4] + num_head_channels: 64 + use_spatial_transformer: True + use_linear_in_transformer: True + transformer_depth: 4 + context_dim: [1280, 1280, 1280, 1280] # 1280 + spatial_transformer_attn_type: softmax-xformers + legacy: False + + conditioner_config: + target: sgm.modules.GeneralConditioner + params: + emb_models: + # crossattn and vector cond + - is_trainable: False + input_key: txt + target: sgm.modules.encoders.modules.FrozenOpenCLIPEmbedder2 + params: + arch: ViT-bigG-14 + version: laion2b_s39b_b160k + legacy: False + freeze: True + layer: penultimate + always_return_pooled: True + # vector cond + - is_trainable: False + input_key: original_size_as_tuple + target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND + params: + outdim: 256 # multiplied by two + # vector cond + - is_trainable: False + input_key: crop_coords_top_left + target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND + params: + outdim: 256 # multiplied by two + # vector cond + - is_trainable: False + input_key: aesthetic_score + target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND + params: + outdim: 256 # multiplied by one + + first_stage_config: + target: 
sgm.models.autoencoder.AutoencoderKLInferenceWrapper + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + attn_type: vanilla-xformers + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity diff --git a/sd_configs/v1-inference.yaml b/sd_configs/v1-inference.yaml new file mode 100644 index 00000000..d4effe56 --- /dev/null +++ b/sd_configs/v1-inference.yaml @@ -0,0 +1,70 @@ +model: + base_learning_rate: 1.0e-04 + target: ldm.models.diffusion.ddpm.LatentDiffusion + params: + linear_start: 0.00085 + linear_end: 0.0120 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: "jpg" + cond_stage_key: "txt" + image_size: 64 + channels: 4 + cond_stage_trainable: false # Note: different from the one we trained before + conditioning_key: crossattn + monitor: val/loss_simple_ema + scale_factor: 0.18215 + use_ema: False + + scheduler_config: # 10000 warmup steps + target: ldm.lr_scheduler.LambdaLinearScheduler + params: + warm_up_steps: [ 10000 ] + cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases + f_start: [ 1.e-6 ] + f_max: [ 1. ] + f_min: [ 1. ] + + unet_config: + target: ldm.modules.diffusionmodules.openaimodel.UNetModel + params: + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + first_stage_config: + target: ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + cond_stage_config: + target: ldm.modules.encoders.modules.FrozenCLIPEmbedder diff --git a/sd_configs/v2-inference-v.yaml b/sd_configs/v2-inference-v.yaml new file mode 100644 index 00000000..8ec8dfbf --- /dev/null +++ b/sd_configs/v2-inference-v.yaml @@ -0,0 +1,68 @@ +model: + base_learning_rate: 1.0e-4 + target: ldm.models.diffusion.ddpm.LatentDiffusion + params: + parameterization: "v" + linear_start: 0.00085 + linear_end: 0.0120 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: "jpg" + cond_stage_key: "txt" + image_size: 64 + channels: 4 + cond_stage_trainable: false + conditioning_key: crossattn + monitor: val/loss_simple_ema + scale_factor: 0.18215 + use_ema: False # we set this to false because this is an inference only config + + unet_config: + target: ldm.modules.diffusionmodules.openaimodel.UNetModel + params: + use_checkpoint: True + use_fp16: True + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_head_channels: 64 # need to fix for flash-attn + use_spatial_transformer: True + use_linear_in_transformer: True + transformer_depth: 1 + context_dim: 1024 + legacy: False + + first_stage_config: + target: ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + #attn_type: "vanilla-xformers" + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 
+ - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + cond_stage_config: + target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder + params: + freeze: True + layer: "penultimate" diff --git a/sd_configs/v2-inference.yaml b/sd_configs/v2-inference.yaml new file mode 100644 index 00000000..152c4f3c --- /dev/null +++ b/sd_configs/v2-inference.yaml @@ -0,0 +1,67 @@ +model: + base_learning_rate: 1.0e-4 + target: ldm.models.diffusion.ddpm.LatentDiffusion + params: + linear_start: 0.00085 + linear_end: 0.0120 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: "jpg" + cond_stage_key: "txt" + image_size: 64 + channels: 4 + cond_stage_trainable: false + conditioning_key: crossattn + monitor: val/loss_simple_ema + scale_factor: 0.18215 + use_ema: False # we set this to false because this is an inference only config + + unet_config: + target: ldm.modules.diffusionmodules.openaimodel.UNetModel + params: + use_checkpoint: True + use_fp16: True + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_head_channels: 64 # need to fix for flash-attn + use_spatial_transformer: True + use_linear_in_transformer: True + transformer_depth: 1 + context_dim: 1024 + legacy: False + + first_stage_config: + target: ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + #attn_type: "vanilla-xformers" + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + cond_stage_config: + target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder + params: + freeze: True + layer: "penultimate" diff --git a/sd_configs/v2-inpainting-inference.yaml b/sd_configs/v2-inpainting-inference.yaml new file mode 100644 index 00000000..32a9471d --- /dev/null +++ b/sd_configs/v2-inpainting-inference.yaml @@ -0,0 +1,158 @@ +model: + base_learning_rate: 5.0e-05 + target: ldm.models.diffusion.ddpm.LatentInpaintDiffusion + params: + linear_start: 0.00085 + linear_end: 0.0120 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: "jpg" + cond_stage_key: "txt" + image_size: 64 + channels: 4 + cond_stage_trainable: false + conditioning_key: hybrid + scale_factor: 0.18215 + monitor: val/loss_simple_ema + finetune_keys: null + use_ema: False + + unet_config: + target: ldm.modules.diffusionmodules.openaimodel.UNetModel + params: + use_checkpoint: True + image_size: 32 # unused + in_channels: 9 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_head_channels: 64 # need to fix for flash-attn + use_spatial_transformer: True + use_linear_in_transformer: True + transformer_depth: 1 + context_dim: 1024 + legacy: False + + first_stage_config: + target: ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + #attn_type: "vanilla-xformers" + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [ ] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + cond_stage_config: + target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder + params: + freeze: True + layer: 
"penultimate" + + +data: + target: ldm.data.laion.WebDataModuleFromConfig + params: + tar_base: null # for concat as in LAION-A + p_unsafe_threshold: 0.1 + filter_word_list: "data/filters.yaml" + max_pwatermark: 0.45 + batch_size: 8 + num_workers: 6 + multinode: True + min_size: 512 + train: + shards: + - "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-0/{00000..18699}.tar -" + - "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-1/{00000..18699}.tar -" + - "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-2/{00000..18699}.tar -" + - "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-3/{00000..18699}.tar -" + - "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-4/{00000..18699}.tar -" #{00000-94333}.tar" + shuffle: 10000 + image_key: jpg + image_transforms: + - target: torchvision.transforms.Resize + params: + size: 512 + interpolation: 3 + - target: torchvision.transforms.RandomCrop + params: + size: 512 + postprocess: + target: ldm.data.laion.AddMask + params: + mode: "512train-large" + p_drop: 0.25 + # NOTE use enough shards to avoid empty validation loops in workers + validation: + shards: + - "pipe:aws s3 cp s3://deep-floyd-s3/datasets/laion_cleaned-part5/{93001..94333}.tar - " + shuffle: 0 + image_key: jpg + image_transforms: + - target: torchvision.transforms.Resize + params: + size: 512 + interpolation: 3 + - target: torchvision.transforms.CenterCrop + params: + size: 512 + postprocess: + target: ldm.data.laion.AddMask + params: + mode: "512train-large" + p_drop: 0.25 + +lightning: + find_unused_parameters: True + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + metrics_over_trainsteps_checkpoint: + params: + every_n_train_steps: 10000 + + image_logger: + target: main.ImageLogger + params: + enable_autocast: False + disabled: False + batch_frequency: 1000 + max_images: 4 + increase_log_steps: False + log_first_step: False + log_images_kwargs: + use_ema_scope: False + inpaint: False + plot_progressive_rows: False + plot_diffusion_rows: False + N: 4 + unconditional_guidance_scale: 5.0 + unconditional_guidance_label: [""] + ddim_steps: 50 # todo check these out for depth2img, + ddim_eta: 0.0 # todo check these out for depth2img, + + trainer: + benchmark: True + val_check_interval: 5000000 + num_sanity_val_steps: 0 + accumulate_grad_batches: 1 diff --git a/sd_configs/v2-midas-inference.yaml b/sd_configs/v2-midas-inference.yaml new file mode 100644 index 00000000..f20c30f6 --- /dev/null +++ b/sd_configs/v2-midas-inference.yaml @@ -0,0 +1,74 @@ +model: + base_learning_rate: 5.0e-07 + target: ldm.models.diffusion.ddpm.LatentDepth2ImageDiffusion + params: + linear_start: 0.00085 + linear_end: 0.0120 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: "jpg" + cond_stage_key: "txt" + image_size: 64 + channels: 4 + cond_stage_trainable: false + conditioning_key: hybrid + scale_factor: 0.18215 + monitor: val/loss_simple_ema + finetune_keys: null + use_ema: False + + depth_stage_config: + target: ldm.modules.midas.api.MiDaSInference + params: + model_type: "dpt_hybrid" + + unet_config: + target: ldm.modules.diffusionmodules.openaimodel.UNetModel + params: + use_checkpoint: True + image_size: 32 # unused + in_channels: 5 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_head_channels: 64 # need to fix for flash-attn + use_spatial_transformer: True + use_linear_in_transformer: True + transformer_depth: 1 + context_dim: 1024 + legacy: False 
+ + first_stage_config: + target: ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + #attn_type: "vanilla-xformers" + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [ ] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + cond_stage_config: + target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder + params: + freeze: True + layer: "penultimate" + + diff --git a/ui/panels/dream_texture.py b/ui/panels/dream_texture.py index 8ebcf446..049031e2 100644 --- a/ui/panels/dream_texture.py +++ b/ui/panels/dream_texture.py @@ -13,10 +13,11 @@ from ...operators.open_latest_version import OpenLatestVersion, is_force_show_download, new_version_available from ...operators.view_history import ImportPromptFile from ..space_types import SPACE_TYPES -from ...property_groups.dream_prompt import DreamPrompt, pipeline_options +from ...property_groups.dream_prompt import DreamPrompt, backend_options from ...generator_process.actions.prompt_to_image import Optimizations from ...generator_process.actions.detect_seamless import SeamlessAxes -from ...generator_process.models import Pipeline, FixItError +from ...api.models import FixItError +from ... import api def dream_texture_panels(): for space_type in SPACE_TYPES: @@ -50,9 +51,8 @@ def draw(self, context): elif new_version_available(): layout.operator(OpenLatestVersion.bl_idname, icon="IMPORT") - layout.prop(context.scene.dream_textures_prompt, "pipeline") - if Pipeline[context.scene.dream_textures_prompt.pipeline].model(): - layout.prop(context.scene.dream_textures_prompt, 'model') + layout.prop(context.scene.dream_textures_prompt, "backend") + layout.prop(context.scene.dream_textures_prompt, 'model') DreamTexturePanel.__name__ = f"DREAM_PT_dream_panel_{space_type}" yield DreamTexturePanel @@ -120,12 +120,12 @@ def draw(self, context): segment_row.prop(prompt, enum_prop, icon_only=is_custom) if prompt.prompt_structure == file_batch_structure.id: layout.template_ID(context.scene, "dream_textures_prompt_file", open="text.open") - if Pipeline[prompt.pipeline].seamless(): - layout.prop(prompt, "seamless_axes") - if prompt.seamless_axes == SeamlessAxes.AUTO and get_seamless_result is not None: - auto_row = self.layout.row() - auto_row.enabled = False - auto_row.prop(get_seamless_result(context, prompt), "result") + + layout.prop(prompt, "seamless_axes") + if prompt.seamless_axes == SeamlessAxes.AUTO and get_seamless_result is not None: + auto_row = self.layout.row() + auto_row.enabled = False + auto_row.prop(get_seamless_result(context, prompt), "result") yield PromptPanel @@ -137,7 +137,7 @@ class NegativePromptPanel(sub_panel): @classmethod def poll(cls, context): - return get_prompt(context).prompt_structure != file_batch_structure.id and Pipeline[get_prompt(context).pipeline].negative_prompts() + return get_prompt(context).prompt_structure != file_batch_structure.id def draw_header(self, context): layout = self.layout @@ -221,9 +221,9 @@ def _outpaint_warning_box(warning): _outpaint_warning_box("Outpaint has no overlap, so the result will not blend") elif prompt.init_img_action == 'modify': layout.prop(prompt, "fit") - layout.prop(prompt, "strength") - if Pipeline[prompt.pipeline].color_correction(): - layout.prop(prompt, "use_init_img_color") + if prompt.init_img_action != 'outpaint': + layout.prop(prompt, "strength") + layout.prop(prompt, "use_init_img_color") if prompt.init_img_action 
== 'modify': layout.prop(prompt, "modify_action_source_type") if prompt.modify_action_source_type == 'depth_map': @@ -263,14 +263,18 @@ def draw(self, context): layout = self.layout layout.use_property_split = True - layout.prop(get_prompt(context), "random_seed") - if not get_prompt(context).random_seed: - layout.prop(get_prompt(context), "seed") + prompt = get_prompt(context) + layout.prop(prompt, "random_seed") + if not prompt.random_seed: + layout.prop(prompt, "seed") # advanced_box.prop(self, "iterations") # Disabled until supported by the addon. - layout.prop(get_prompt(context), "steps") - layout.prop(get_prompt(context), "cfg_scale") - layout.prop(get_prompt(context), "scheduler") - layout.prop(get_prompt(context), "step_preview_mode") + layout.prop(prompt, "steps") + layout.prop(prompt, "cfg_scale") + layout.prop(prompt, "scheduler") + layout.prop(prompt, "step_preview_mode") + + backend: api.Backend = prompt.get_backend() + backend.draw_advanced(layout, context) yield AdvancedPanel @@ -289,19 +293,8 @@ def draw(self, context): layout.use_property_split = True prompt = get_prompt(context) - inferred_device = Optimizations.infer_device() - if prompt.optimizations_cpu_only: - inferred_device = "cpu" - def optimization(prop): - if Optimizations.device_supports(prop, inferred_device): - layout.prop(prompt, f"optimizations_{prop}") - - optimization("cudnn_benchmark") - optimization("tf32") - optimization("half_precision") - optimization("channels_last_memory_format") - optimization("batch_size") - optimization("cfg_end") + backend: api.Backend = prompt.get_backend() + backend.draw_speed_optimizations(layout, context) yield SpeedOptimizationPanel class MemoryOptimizationPanel(sub_panel): @@ -316,26 +309,8 @@ def draw(self, context): layout.use_property_split = True prompt = get_prompt(context) - inferred_device = Optimizations.infer_device() - if prompt.optimizations_cpu_only: - inferred_device = "cpu" - def optimization(prop): - if Optimizations.device_supports(prop, inferred_device): - layout.prop(prompt, f"optimizations_{prop}") - - optimization("attention_slicing") - slice_size_row = layout.row() - slice_size_row.prop(prompt, "optimizations_attention_slice_size_src") - if prompt.optimizations_attention_slice_size_src == 'manual': - slice_size_row.prop(prompt, "optimizations_attention_slice_size", text="Size") - optimization("sdp_attention") - optimization("cpu_offload") - optimization("cpu_only") - optimization("vae_slicing") - optimization("vae_tiling") - if prompt.optimizations_vae_tiling == "manual": - optimization("vae_tile_size") - optimization("vae_tile_blend") + backend: api.Backend = prompt.get_backend() + backend.draw_memory_optimizations(layout, context) yield MemoryOptimizationPanel def actions_panel(sub_panel, space_type, get_prompt): @@ -356,17 +331,19 @@ def draw(self, context): iterations_row.enabled = prompt.prompt_structure != file_batch_structure.id iterations_row.prop(prompt, "iterations") - row = layout.row() + row = layout.row(align=True) row.scale_y = 1.5 if CancelGenerator.poll(context): row.operator(CancelGenerator.bl_idname, icon="SNAP_FACE", text="") if context.scene.dream_textures_progress <= 0: if context.scene.dream_textures_info != "": - row.label(text=context.scene.dream_textures_info, icon="INFO") + disabled_row = row.row(align=True) + disabled_row.operator(DreamTexture.bl_idname, text=context.scene.dream_textures_info, icon="INFO") + disabled_row.enabled = False else: row.operator(DreamTexture.bl_idname, icon="PLAY", text="Generate") else: - 
disabled_row = row.row() + disabled_row = row.row(align=True) disabled_row.use_property_split = True disabled_row.prop(context.scene, 'dream_textures_progress', slider=True) disabled_row.enabled = False @@ -380,13 +357,14 @@ def draw(self, context): # Validation try: - prompt.validate(context) + backend: api.Backend = prompt.get_backend() + backend.validate(prompt.generate_args(context)) except FixItError as e: error_box = layout.box() error_box.use_property_split = False for i, line in enumerate(e.args[0].split('\n')): error_box.label(text=line, icon="ERROR" if i == 0 else "NONE") - e.draw(context, error_box) + e._draw(prompt, context, error_box) except Exception as e: print(e) return ActionsPanel \ No newline at end of file diff --git a/ui/panels/render_properties.py b/ui/panels/render_properties.py index 731f9b69..45e9e4b4 100644 --- a/ui/panels/render_properties.py +++ b/ui/panels/render_properties.py @@ -1,8 +1,7 @@ import bpy from .dream_texture import create_panel, prompt_panel, advanced_panel -from ...property_groups.dream_prompt import pipeline_options -from ...generator_process.actions.prompt_to_image import Pipeline -from ...generator_process.actions.huggingface_hub import ModelType +from ...property_groups.dream_prompt import backend_options +from ...generator_process.models import ModelType from ...preferences import StableDiffusionPreferences class RenderPropertiesPanel(bpy.types.Panel): @@ -27,10 +26,9 @@ def draw(self, context): layout.use_property_decorate = False layout.active = context.scene.dream_textures_render_properties_enabled - if len(pipeline_options(self, context)) > 1: - layout.prop(context.scene.dream_textures_render_properties_prompt, "pipeline") - if Pipeline[context.scene.dream_textures_render_properties_prompt.pipeline].model(): - layout.prop(context.scene.dream_textures_render_properties_prompt, 'model') + if len(backend_options(self, context)) > 1: + layout.prop(context.scene.dream_textures_render_properties_prompt, "backend") + layout.prop(context.scene.dream_textures_render_properties_prompt, 'model') layout.prop(context.scene.dream_textures_render_properties_prompt, "strength") layout.prop(context.scene, "dream_textures_render_properties_pass_inputs") if context.scene.dream_textures_render_properties_pass_inputs != 'color': @@ -40,11 +38,6 @@ def draw(self, context): box.label(text="Enable the Z pass to use depth pass inputs") box.use_property_split = False box.prop(context.view_layer, "use_pass_z") - - if not Pipeline[context.scene.dream_textures_render_properties_prompt.pipeline].depth(): - box = layout.box() - box.label(text="Unsupported pipeline", icon="ERROR") - box.label(text="The selected pipeline does not support depth to image.") models = list(filter( lambda m: m.model_base == context.scene.dream_textures_render_properties_prompt.model, diff --git a/ui/panels/upscaling.py b/ui/panels/upscaling.py index 5308c6bb..a926a2e8 100644 --- a/ui/panels/upscaling.py +++ b/ui/panels/upscaling.py @@ -4,7 +4,6 @@ from ...operators.upscale import Upscale, get_source_image from ...operators.dream_texture import CancelGenerator, ReleaseGenerator from ...generator_process.actions.detect_seamless import SeamlessAxes -from ...generator_process.actions.prompt_to_image import Pipeline from .dream_texture import create_panel, advanced_panel from ..space_types import SPACE_TYPES @@ -21,8 +20,6 @@ class UpscalingPanel(Panel): @classmethod def poll(cls, context): - if not Pipeline[context.scene.dream_textures_prompt.pipeline].upscaling(): - return False if 
cls.bl_space_type == 'NODE_EDITOR': return context.area.ui_type == "ShaderNodeTree" or context.area.ui_type == "CompositorNodeTree" else: @@ -34,6 +31,9 @@ def draw(self, context): layout.use_property_decorate = False prompt = context.scene.dream_textures_upscale_prompt + + layout.prop(prompt, "backend") + layout.prop(prompt, "model") layout.prop(prompt, "prompt_structure_token_subject") layout.prop(context.scene, "dream_textures_upscale_tile_size") @@ -68,8 +68,6 @@ class ActionsPanel(Panel): @classmethod def poll(cls, context): - if not Pipeline[context.scene.dream_textures_prompt.pipeline].upscaling(): - return False if cls.bl_space_type == 'NODE_EDITOR': return context.area.ui_type == "ShaderNodeTree" or context.area.ui_type == "CompositorNodeTree" else: @@ -81,11 +79,15 @@ def draw(self, context): layout.use_property_decorate = False image = get_source_image(context) - row = layout.row() + row = layout.row(align=True) row.scale_y = 1.5 + if CancelGenerator.poll(context): + row.operator(CancelGenerator.bl_idname, icon="SNAP_FACE", text="") if context.scene.dream_textures_progress <= 0: if context.scene.dream_textures_info != "": - row.label(text=context.scene.dream_textures_info, icon="INFO") + disabled_row = row.row(align=True) + disabled_row.operator(Upscale.bl_idname, text=context.scene.dream_textures_info, icon="INFO") + disabled_row.enabled = False else: row.operator( Upscale.bl_idname, @@ -93,12 +95,10 @@ def draw(self, context): icon="FULLSCREEN_ENTER" ) else: - disabled_row = row.row() + disabled_row = row.row(align=True) disabled_row.use_property_split = True disabled_row.prop(context.scene, 'dream_textures_progress', slider=True) disabled_row.enabled = False - if CancelGenerator.poll(context): - row.operator(CancelGenerator.bl_idname, icon="CANCEL", text="") row.operator(ReleaseGenerator.bl_idname, icon="X", text="") yield UpscalingPanel advanced_panels = [*create_panel(space_type, 'UI', UpscalingPanel.bl_idname, advanced_panel, lambda context: context.scene.dream_textures_upscale_prompt)]
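The panel changes above stop querying `Pipeline` for feature flags and instead delegate to the selected `api.Backend`: the Advanced, Speed Optimizations, and Memory Optimizations panels call the backend's `draw_advanced`, `draw_speed_optimizations`, and `draw_memory_optimizations` hooks, and the actions panel validates `prompt.generate_args(context)` through `backend.validate(...)`, showing any `FixItError` message in an error box. As a rough illustration only (not part of this patch), a minimal third-party backend that plugs into those hooks might look like the sketch below. The class name, the example properties, the `dream_textures` import path, and the assumption that `FixItError` can be raised with just a message string are all assumptions for the sake of the example.

```python
import bpy
# Assumes the add-on package is importable as `dream_textures`; inside the
# add-on itself these would be relative imports (e.g. `from .. import api`).
from dream_textures import api


class ExampleBackend(api.Backend):
    """Hypothetical backend showing the hooks the panels above call into."""

    # Backend subclasses are Blender PropertyGroups, so backend-specific
    # options can be declared as ordinary bpy properties.
    server_url: bpy.props.StringProperty(name="Server URL", default="http://localhost:7860")
    half_precision: bpy.props.BoolProperty(name="Half Precision", default=True)

    def list_models(self, context):
        # Return the models to show in the 'model' selector; an empty list
        # here keeps the sketch independent of the api.Model constructor.
        return []

    def list_schedulers(self, context):
        return ["DDIM", "Euler A"]

    def draw_advanced(self, layout, context):
        # Rendered at the bottom of the 'Advanced' panel.
        layout.prop(self, "server_url")

    def draw_speed_optimizations(self, layout, context):
        layout.prop(self, "half_precision")

    def draw_memory_optimizations(self, layout, context):
        pass

    def validate(self, arguments):
        # The actions panel splits the error message on newlines and draws it
        # in an error box. The real FixItError may also expect a fix/draw
        # callback; this call is an assumption.
        if not self.server_url:
            raise api.FixItError("Enter a server URL to generate with this backend")

    def generate(self, arguments, step_callback, callback):
        # Produce results and report progress/completion via the callbacks.
        ...
```

Registering the class (for example with `bpy.utils.register_class(ExampleBackend)`) should make it appear in the panels' `backend` selector, since backends appear to be discovered by subclass; the exact registration path an external add-on would use is not defined by this patch.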