From 407fdff90b8a3f1220cc6af7fa36175f7732acc3 Mon Sep 17 00:00:00 2001 From: NullSenseStudio <47096043+NullSenseStudio@users.noreply.github.com> Date: Fri, 17 Feb 2023 15:08:15 -0500 Subject: [PATCH 01/13] show missing optimizations --- generator_process/actions/prompt_to_image.py | 14 ++++++++------ property_groups/dream_prompt.py | 3 +-- ui/panels/dream_texture.py | 6 ++++-- 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/generator_process/actions/prompt_to_image.py b/generator_process/actions/prompt_to_image.py index bce36d1b..1db5158e 100644 --- a/generator_process/actions/prompt_to_image.py +++ b/generator_process/actions/prompt_to_image.py @@ -165,16 +165,18 @@ def infer_device() -> str: else: return "cuda" - def can_use(self, property, device) -> bool: - if not getattr(self, property): - return False - if isinstance(self.__annotations__.get(property, None), _AnnotatedAlias): - annotation: _AnnotatedAlias = self.__annotations__[property] + @classmethod + def device_supports(cls, property, device) -> bool: + annotation = cls.__annotations__.get(property, None) + if isinstance(annotation, _AnnotatedAlias): opt_dev = annotation.__metadata__[0] if isinstance(opt_dev, str): return opt_dev == device return device in opt_dev - return True + return annotation is not None + + def can_use(self, property, device) -> bool: + return self.device_supports(property, device) and getattr(self, property) def can_use_half(self, device): if self.half_precision and device == "cuda": diff --git a/property_groups/dream_prompt.py b/property_groups/dream_prompt.py index 62261683..f589366c 100644 --- a/property_groups/dream_prompt.py +++ b/property_groups/dream_prompt.py @@ -175,9 +175,8 @@ def seed_clamp(self, ctx): if annotation != bool or (annotation is _AnnotatedAlias and annotation.__origin__ != bool): continue default = getattr(default_optimizations, optim, None) - if default is not None and not isinstance(getattr(default_optimizations, optim), bool): + if default is not None and not isinstance(default, bool): continue - setattr(default_optimizations, optim, True) attributes[f"optimizations_{optim}"] = BoolProperty(name=optim.replace('_', ' ').title(), default=default) attributes["optimizations_attention_slice_size_src"] = EnumProperty(name="Attention Slice Size", items=( ("auto", "Automatic", "", 1), diff --git a/ui/panels/dream_texture.py b/ui/panels/dream_texture.py index bfd67725..e07716b4 100644 --- a/ui/panels/dream_texture.py +++ b/ui/panels/dream_texture.py @@ -277,7 +277,7 @@ def draw(self, context): inferred_device = Optimizations.infer_device() def optimization(prop): if hasattr(prompt, f"optimizations_{prop}"): - if Optimizations().can_use(prop, inferred_device): + if Optimizations.device_supports(prop, inferred_device): layout.prop(prompt, f"optimizations_{prop}") optimization("cudnn_benchmark") @@ -300,9 +300,11 @@ def draw(self, context): layout.use_property_split = True prompt = get_prompt(context) + inferred_device = Optimizations.infer_device() def optimization(prop): if hasattr(prompt, f"optimizations_{prop}"): - layout.prop(prompt, f"optimizations_{prop}") + if Optimizations.device_supports(prop, inferred_device): + layout.prop(prompt, f"optimizations_{prop}") optimization("attention_slicing") slice_size_row = layout.row() From dd10d0eb85606eaceae816e518fa8f118b9d75de Mon Sep 17 00:00:00 2001 From: NullSenseStudio <47096043+NullSenseStudio@users.noreply.github.com> Date: Sat, 18 Feb 2023 00:42:02 -0500 Subject: [PATCH 02/13] optimization descriptions --- 
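A note on PATCH 01 above: splitting `can_use` into two checks lets the UI filter options statically. `device_supports` reads only the `Annotated` metadata on the dataclass fields, so the panels can hide unsupported optimizations for the inferred device without constructing an `Optimizations` instance, while `can_use` additionally requires that the user enabled the flag. A minimal standalone sketch of the pattern (class and field names are stand-ins, not the add-on's actual code):

from dataclasses import dataclass
from typing import Annotated, get_type_hints

@dataclass
class Opts:
    # the Annotated metadata names the device(s) a flag applies to
    tf32: Annotated[bool, "cuda"] = False
    half_precision: Annotated[bool, {"cuda", "privateuseone"}] = True
    vae_slicing: bool = True  # no metadata: supported on every device

    @classmethod
    def device_supports(cls, prop: str, device: str) -> bool:
        hints = get_type_hints(cls, include_extras=True)
        if prop not in hints:
            return False
        meta = getattr(hints[prop], "__metadata__", None)
        if meta is None:
            return True  # unannotated flags are always supported
        return meta[0] == device if isinstance(meta[0], str) else device in meta[0]

    def can_use(self, prop: str, device: str) -> bool:
        # static support check plus the user's toggle
        return self.device_supports(prop, device) and getattr(self, prop)

assert Opts.device_supports("tf32", "cuda") and not Opts.device_supports("tf32", "cpu")
assert Opts().can_use("half_precision", "cuda") and not Opts().can_use("tf32", "cuda")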
property_groups/dream_prompt.py | 51 +++++++++++++++++++++------------ 1 file changed, 33 insertions(+), 18 deletions(-) diff --git a/property_groups/dream_prompt.py b/property_groups/dream_prompt.py index f589366c..a4ae73c9 100644 --- a/property_groups/dream_prompt.py +++ b/property_groups/dream_prompt.py @@ -166,24 +166,39 @@ def seed_clamp(self, ctx): } default_optimizations = Optimizations() - -for optim in dir(Optimizations): - if optim.startswith('_'): - continue - if hasattr(Optimizations.__annotations__, optim): - annotation = Optimizations.__annotations__[optim] - if annotation != bool or (annotation is _AnnotatedAlias and annotation.__origin__ != bool): - continue - default = getattr(default_optimizations, optim, None) - if default is not None and not isinstance(default, bool): - continue - attributes[f"optimizations_{optim}"] = BoolProperty(name=optim.replace('_', ' ').title(), default=default) -attributes["optimizations_attention_slice_size_src"] = EnumProperty(name="Attention Slice Size", items=( - ("auto", "Automatic", "", 1), - ("manual", "Manual", "", 2), -), default=1) -attributes["optimizations_attention_slice_size"] = IntProperty(name="Attention Slice Size", default=1, min=1) -attributes["optimizations_batch_size"] = IntProperty(name="Batch Size", default=1, min=1) +def optimization(optim, property=None, **kwargs): + if "name" not in kwargs: + kwargs["name"] = optim.replace('_', ' ').title() + if "default" not in kwargs: + kwargs["default"] = getattr(default_optimizations, optim) + if property is None: + match kwargs["default"]: + case bool(): + property = BoolProperty + case int(): + property = IntProperty + case _: + raise TypeError(f"{optim} cannot infer optimization property from {type(kwargs['default'])}") + attributes[f"optimizations_{optim}"] = property(**kwargs) + +optimization("attention_slicing", description="Computes attention in several steps. Saves some memory in exchange for a small speed decrease") +optimization("attention_slice_size_src", property=EnumProperty, items=( + ("auto", "Automatic", "Computes attention in two steps", 1), + ("manual", "Manual", "Computes attention in `attention_head_dim // size` steps. A smaller `size` saves more memory.\n" + "`attention_head_dim` must be a multiple of `size`, otherwise the image won't generate properly.\n" + "`attention_head_dim` can be found within the model snapshot's unet/config.json file", 2), +), default=1, name="Attention Slice Size") +optimization("attention_slice_size", default=1, min=1) +optimization("cudnn_benchmark", name="cuDNN Benchmark", description="Allows cuDNN to benchmark multiple convolution algorithms and select the fastest") +optimization("tf32", name="TF32", description="Utilizes tensor cores on Ampere (RTX 30xx) or newer GPUs for matrix multiplications.\nHas no effect if half precision is enabled") +optimization("amp") +optimization("half_precision", description="Reduces memory usage and increases speed in exchange for a slight loss in image quality.\nHas no effect if CPU only is enabled or using a GTX 16xx GPU") +optimization("sequential_cpu_offload", name="Sequential CPU Offload", description="Dynamically moves individual model weights in and out of device memory for reduced memory usage and a large speed penalty") +optimization("channels_last_memory_format", description="An alternative way of ordering NCHW tensors that may be faster or slower depending on the device") +# optimization("xformers_attention") # FIXME: xFormers is not yet available. 
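One detail worth flagging in the optimization() helper above: the match statement dispatches on the type of the default value, and `case bool()` must stay ahead of `case int()` because bool is a subclass of int in Python. If the cases were swapped, every True/False default would be captured by the int branch and registered as an IntProperty. A tiny demonstration of just that dispatch (names hypothetical):

def infer_property(default):
    match default:
        case bool():  # must come first: isinstance(True, int) is also True
            return "BoolProperty"
        case int():
            return "IntProperty"
        case _:
            raise TypeError(f"cannot infer a property type from {type(default)}")

assert infer_property(True) == "BoolProperty"  # would be "IntProperty" with the cases swapped
assert infer_property(4) == "IntProperty"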
+optimization("batch_size", default=1, min=1, description="Improves speed when using iterations or upscaling in exchange for higher memory usage.\nHighly recommended to use with VAE slicing enabled") +optimization("vae_slicing", name="VAE Slicing", description="Reduces memory usage of batched VAE decoding. Has no affect if batch size is 1.\nMay have a small performance improvement with large batches") +optimization("cpu_only", name="CPU Only", description="Disables GPU acceleration and is extremely slow") def map_structure_token_items(value): return (value[0], value[1], '') From ab9162eadac3d7dc6fb204769b65dc41d19d53fe Mon Sep 17 00:00:00 2001 From: NullSenseStudio <47096043+NullSenseStudio@users.noreply.github.com> Date: Sat, 18 Feb 2023 20:02:52 -0500 Subject: [PATCH 03/13] remove automatic mixed precision --- generator_process/actions/depth_to_image.py | 3 +- generator_process/actions/image_to_image.py | 35 +++++----- generator_process/actions/inpaint.py | 67 ++++++++++---------- generator_process/actions/prompt_to_image.py | 37 ++++++----- property_groups/dream_prompt.py | 1 - ui/panels/dream_texture.py | 1 - 6 files changed, 69 insertions(+), 75 deletions(-) diff --git a/generator_process/actions/depth_to_image.py b/generator_process/actions/depth_to_image.py index fe127bba..347507d1 100644 --- a/generator_process/actions/depth_to_image.py +++ b/generator_process/actions/depth_to_image.py @@ -362,8 +362,7 @@ def __call__( _configure_model_padding(pipe.vae, seamless_axes) # Inference - with (torch.inference_mode() if device not in ('mps', "privateuseone") else nullcontext()), \ - (torch.autocast(device) if optimizations.can_use("amp", device) else nullcontext()): + with torch.inference_mode() if device not in ('mps', "privateuseone") else nullcontext(): yield from pipe( prompt=prompt, depth_image=depth_image, diff --git a/generator_process/actions/image_to_image.py b/generator_process/actions/image_to_image.py index 1248ff09..43abb7e3 100644 --- a/generator_process/actions/image_to_image.py +++ b/generator_process/actions/image_to_image.py @@ -180,24 +180,23 @@ def __call__( _configure_model_padding(pipe.vae, seamless_axes) # Inference - with (torch.inference_mode() if device not in ('mps', "privateuseone") else nullcontext()), \ - (torch.autocast(device) if optimizations.can_use("amp", device) else nullcontext()): - yield from pipe( - prompt=prompt, - image=[init_image] * batch_size, - strength=strength, - num_inference_steps=steps, - guidance_scale=cfg_scale, - negative_prompt=negative_prompt if use_negative_prompt else None, - num_images_per_prompt=1, - eta=0.0, - generator=generator, - output_type="pil", - return_dict=True, - callback=None, - callback_steps=1, - step_preview_mode=step_preview_mode - ) + with torch.inference_mode() if device not in ('mps', "privateuseone") else nullcontext(): + yield from pipe( + prompt=prompt, + image=[init_image] * batch_size, + strength=strength, + num_inference_steps=steps, + guidance_scale=cfg_scale, + negative_prompt=negative_prompt if use_negative_prompt else None, + num_images_per_prompt=1, + eta=0.0, + generator=generator, + output_type="pil", + return_dict=True, + callback=None, + callback_steps=1, + step_preview_mode=step_preview_mode + ) case Pipeline.STABILITY_SDK: import stability_sdk.client import stability_sdk.interfaces.gooseai.generation.generation_pb2 diff --git a/generator_process/actions/inpaint.py b/generator_process/actions/inpaint.py index 8991b2bc..42b2921b 100644 --- a/generator_process/actions/inpaint.py +++ 
b/generator_process/actions/inpaint.py @@ -213,40 +213,39 @@ def __call__( _configure_model_padding(pipe.vae, seamless_axes) # Inference - with (torch.inference_mode() if device not in ('mps', "privateuseone") else nullcontext()), \ - (torch.autocast(device) if optimizations.can_use("amp", device) else nullcontext()): - match inpaint_mask_src: - case 'alpha': - mask_image = ImageOps.invert(init_image.getchannel('A')) - case 'prompt': - from transformers import AutoProcessor, CLIPSegForImageSegmentation - - processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined") - clipseg = CLIPSegForImageSegmentation.from_pretrained("CIDAS/clipseg-rd64-refined") - inputs = processor(text=[text_mask], images=[init_image.convert('RGB')], return_tensors="pt", padding=True) - outputs = clipseg(**inputs) - mask_image = Image.fromarray(np.uint8((1 - torch.sigmoid(outputs.logits).lt(text_mask_confidence).int().detach().numpy()) * 255), 'L').resize(init_image.size) - - yield from pipe( - prompt=prompt, - image=[init_image.convert('RGB')] * batch_size, - mask_image=[mask_image] * batch_size, - strength=strength, - height=init_image.size[1] if fit else height, - width=init_image.size[0] if fit else width, - num_inference_steps=steps, - guidance_scale=cfg_scale, - negative_prompt=negative_prompt if use_negative_prompt else None, - num_images_per_prompt=1, - eta=0.0, - generator=generator, - latents=None, - output_type="pil", - return_dict=True, - callback=None, - callback_steps=1, - step_preview_mode=step_preview_mode - ) + with torch.inference_mode() if device not in ('mps', "privateuseone") else nullcontext(): + match inpaint_mask_src: + case 'alpha': + mask_image = ImageOps.invert(init_image.getchannel('A')) + case 'prompt': + from transformers import AutoProcessor, CLIPSegForImageSegmentation + + processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined") + clipseg = CLIPSegForImageSegmentation.from_pretrained("CIDAS/clipseg-rd64-refined") + inputs = processor(text=[text_mask], images=[init_image.convert('RGB')], return_tensors="pt", padding=True) + outputs = clipseg(**inputs) + mask_image = Image.fromarray(np.uint8((1 - torch.sigmoid(outputs.logits).lt(text_mask_confidence).int().detach().numpy()) * 255), 'L').resize(init_image.size) + + yield from pipe( + prompt=prompt, + image=[init_image.convert('RGB')] * batch_size, + mask_image=[mask_image] * batch_size, + strength=strength, + height=init_image.size[1] if fit else height, + width=init_image.size[0] if fit else width, + num_inference_steps=steps, + guidance_scale=cfg_scale, + negative_prompt=negative_prompt if use_negative_prompt else None, + num_images_per_prompt=1, + eta=0.0, + generator=generator, + latents=None, + output_type="pil", + return_dict=True, + callback=None, + callback_steps=1, + step_preview_mode=step_preview_mode + ) case Pipeline.STABILITY_SDK: import stability_sdk.client import stability_sdk.interfaces.gooseai.generation.generation_pb2 diff --git a/generator_process/actions/prompt_to_image.py b/generator_process/actions/prompt_to_image.py index 1db5158e..5bf05589 100644 --- a/generator_process/actions/prompt_to_image.py +++ b/generator_process/actions/prompt_to_image.py @@ -562,25 +562,24 @@ def __call__( _configure_model_padding(pipe.vae, seamless_axes) # Inference - with (torch.inference_mode() if device not in ('mps', "privateuseone") else nullcontext()), \ - (torch.autocast(device) if optimizations.can_use("amp", device) else nullcontext()): - yield from pipe( - prompt=prompt, - height=height, - width=width, 
- num_inference_steps=steps, - guidance_scale=cfg_scale, - negative_prompt=negative_prompt if use_negative_prompt else None, - num_images_per_prompt=1, - eta=0.0, - generator=generator, - latents=None, - output_type="pil", - return_dict=True, - callback=None, - callback_steps=1, - step_preview_mode=step_preview_mode - ) + with torch.inference_mode() if device not in ('mps', "privateuseone") else nullcontext(): + yield from pipe( + prompt=prompt, + height=height, + width=width, + num_inference_steps=steps, + guidance_scale=cfg_scale, + negative_prompt=negative_prompt if use_negative_prompt else None, + num_images_per_prompt=1, + eta=0.0, + generator=generator, + latents=None, + output_type="pil", + return_dict=True, + callback=None, + callback_steps=1, + step_preview_mode=step_preview_mode + ) case Pipeline.STABILITY_SDK: import stability_sdk.client import stability_sdk.interfaces.gooseai.generation.generation_pb2 diff --git a/property_groups/dream_prompt.py b/property_groups/dream_prompt.py index a4ae73c9..333416f3 100644 --- a/property_groups/dream_prompt.py +++ b/property_groups/dream_prompt.py @@ -191,7 +191,6 @@ def optimization(optim, property=None, **kwargs): optimization("attention_slice_size", default=1, min=1) optimization("cudnn_benchmark", name="cuDNN Benchmark", description="Allows cuDNN to benchmark multiple convolution algorithms and select the fastest") optimization("tf32", name="TF32", description="Utilizes tensor cores on Ampere (RTX 30xx) or newer GPUs for matrix multiplications.\nHas no effect if half precision is enabled") -optimization("amp") optimization("half_precision", description="Reduces memory usage and increases speed in exchange for a slight loss in image quality.\nHas no effect if CPU only is enabled or using a GTX 16xx GPU") optimization("sequential_cpu_offload", name="Sequential CPU Offload", description="Dynamically moves individual model weights in and out of device memory for reduced memory usage and a large speed penalty") optimization("channels_last_memory_format", description="An alternative way of ordering NCHW tensors that may be faster or slower depending on the device") diff --git a/ui/panels/dream_texture.py b/ui/panels/dream_texture.py index e07716b4..d1fc9e3c 100644 --- a/ui/panels/dream_texture.py +++ b/ui/panels/dream_texture.py @@ -282,7 +282,6 @@ def optimization(prop): optimization("cudnn_benchmark") optimization("tf32") - optimization("amp") optimization("half_precision") optimization("channels_last_memory_format") optimization("batch_size") From 36819fd2d696b98b3ac9adf48420ff355612887d Mon Sep 17 00:00:00 2001 From: NullSenseStudio <47096043+NullSenseStudio@users.noreply.github.com> Date: Sat, 18 Feb 2023 22:11:03 -0500 Subject: [PATCH 04/13] don't move pipe with cpu offloading --- generator_process/actions/prompt_to_image.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/generator_process/actions/prompt_to_image.py b/generator_process/actions/prompt_to_image.py index 5bf05589..2d0769d4 100644 --- a/generator_process/actions/prompt_to_image.py +++ b/generator_process/actions/prompt_to_image.py @@ -65,7 +65,8 @@ def load_pipe(self, action, generator_pipeline, model, optimizations, scheduler, revision=revision, torch_dtype=torch.float16 if optimizations.can_use_half(device) else torch.float32, ) - pipe = pipe.to(device) + if not optimizations.can_use("sequential_cpu_offload", device): + pipe = pipe.to(device) setattr(self, "_cached_pipe", CachedPipeline(pipe, invalidation_properties, snapshot_folder)) cached_pipe = 
self._cached_pipe if 'scheduler' in os.listdir(cached_pipe.snapshot_folder): From cf800b3a5b96419a7060d8feb65bd551e5316111 Mon Sep 17 00:00:00 2001 From: NullSenseStudio <47096043+NullSenseStudio@users.noreply.github.com> Date: Sun, 19 Feb 2023 14:15:07 -0500 Subject: [PATCH 05/13] xformers --- generator_process/actions/prompt_to_image.py | 7 +++---- property_groups/dream_prompt.py | 7 +++++-- requirements/win-linux-cuda.txt | 2 ++ ui/panels/dream_texture.py | 2 +- 4 files changed, 11 insertions(+), 7 deletions(-) diff --git a/generator_process/actions/prompt_to_image.py b/generator_process/actions/prompt_to_image.py index 2d0769d4..fb5096aa 100644 --- a/generator_process/actions/prompt_to_image.py +++ b/generator_process/actions/prompt_to_image.py @@ -151,7 +151,7 @@ class Optimizations: half_precision: Annotated[bool, {"cuda", "privateuseone"}] = True sequential_cpu_offload: Annotated[bool, {"cuda", "privateuseone"}] = False channels_last_memory_format: bool = False - # xformers_attention: bool = False # FIXME: xFormers is not yet available. + xformers_attention: Annotated[bool, "cuda"] = False batch_size: int = 1 vae_slicing: bool = True @@ -226,9 +226,8 @@ def apply(self, pipeline, device): pipeline.unet.to(memory_format=torch.contiguous_format) except: pass - # FIXME: xFormers wheels are not yet available (https://github.com/facebookresearch/xformers/issues/533) - # if self.can_use("xformers_attention", device): - # pipeline.enable_xformers_memory_efficient_attention() + if self.can_use("xformers_attention", device): + pipeline.enable_xformers_memory_efficient_attention() try: if self.can_use("vae_slicing", device): diff --git a/property_groups/dream_prompt.py b/property_groups/dream_prompt.py index 333416f3..ddcdc75e 100644 --- a/property_groups/dream_prompt.py +++ b/property_groups/dream_prompt.py @@ -194,9 +194,12 @@ def optimization(optim, property=None, **kwargs): optimization("half_precision", description="Reduces memory usage and increases speed in exchange for a slight loss in image quality.\nHas no effect if CPU only is enabled or using a GTX 16xx GPU") optimization("sequential_cpu_offload", name="Sequential CPU Offload", description="Dynamically moves individual model weights in and out of device memory for reduced memory usage and a large speed penalty") optimization("channels_last_memory_format", description="An alternative way of ordering NCHW tensors that may be faster or slower depending on the device") optimization("xformers_attention", name="xFormers Attention", description="Memory efficient attention that also often increases speed.\n" "Requires a Pascal (GTX 10xx) or newer GPU. Overrides attention slicing.\n" "Prompt recall may not produce the same image") optimization("batch_size", default=1, min=1, description="Improves speed when using iterations or upscaling in exchange for higher memory usage.\nHighly recommended to use with VAE slicing enabled") optimization("vae_slicing", name="VAE Slicing", description="Reduces memory usage of batched VAE decoding. 
Has no effect if batch size is 1.\nMay have a small performance improvement with large batches") optimization("cpu_only", name="CPU Only", description="Disables GPU acceleration and is extremely slow") def map_structure_token_items(value): diff --git a/requirements/win-linux-cuda.txt b/requirements/win-linux-cuda.txt index fad5f1e5..a5223563 100644 --- a/requirements/win-linux-cuda.txt +++ b/requirements/win-linux-cuda.txt @@ -15,3 +15,5 @@ scipy # LMSDiscreteScheduler stability-sdk # DreamStudio opencolorio==2.1.2 # color management + +xformers # memory efficient attention diff --git a/ui/panels/dream_texture.py b/ui/panels/dream_texture.py index d1fc9e3c..874415ce 100644 --- a/ui/panels/dream_texture.py +++ b/ui/panels/dream_texture.py @@ -310,9 +310,9 @@ def optimization(prop): slice_size_row.prop(prompt, "optimizations_attention_slice_size_src") if prompt.optimizations_attention_slice_size_src == 'manual': slice_size_row.prop(prompt, "optimizations_attention_slice_size", text="Size") + optimization("xformers_attention") optimization("sequential_cpu_offload") optimization("cpu_only") - # optimization("xformers_attention") # FIXME: xFormers is not yet available. optimization("vae_slicing") yield MemoryOptimizationPanel From 9404f4a58e85bde1f6e3a0b8185387aac3fc9e7b Mon Sep 17 00:00:00 2001 From: NullSenseStudio <47096043+NullSenseStudio@users.noreply.github.com> Date: Sun, 19 Feb 2023 22:48:50 -0500 Subject: [PATCH 06/13] model offloading --- generator_process/actions/depth_to_image.py | 4 +++ generator_process/actions/image_to_image.py | 4 +++ generator_process/actions/inpaint.py | 4 +++ generator_process/actions/prompt_to_image.py | 37 +++++++++++++++----- generator_process/actions/upscale.py | 8 ++++- property_groups/dream_prompt.py | 6 +++- ui/panels/dream_texture.py | 2 +- 7 files changed, 54 insertions(+), 11 deletions(-) diff --git a/generator_process/actions/depth_to_image.py b/generator_process/actions/depth_to_image.py index 347507d1..53bc8899 100644 --- a/generator_process/actions/depth_to_image.py +++ b/generator_process/actions/depth_to_image.py @@ -307,6 +307,10 @@ def __call__( # 12. Run safety checker # image, has_nsfw_concept = self.run_safety_checker(image, device, text_embeddings.dtype) + # Offload last model to CPU + if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: + self.final_offload_hook.offload() + # NOTE: Modified to yield the decoded image as a numpy array. yield ImageGenerationResult( [np.asarray(PIL.ImageOps.flip(image).convert('RGBA'), dtype=np.float32) / 255. diff --git a/generator_process/actions/image_to_image.py b/generator_process/actions/image_to_image.py index 43abb7e3..fed2923e 100644 --- a/generator_process/actions/image_to_image.py +++ b/generator_process/actions/image_to_image.py @@ -132,6 +132,10 @@ def __call__( # 10. Run safety checker # image, has_nsfw_concept = self.run_safety_checker(image, device, text_embeddings.dtype) + # Offload last model to CPU + if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: + self.final_offload_hook.offload() + # NOTE: Modified to yield the decoded image as a numpy array. yield ImageGenerationResult( [np.asarray(ImageOps.flip(image).convert('RGBA'), dtype=np.float32) / 255. diff --git a/generator_process/actions/inpaint.py b/generator_process/actions/inpaint.py index 42b2921b..eac812d1 100644 --- a/generator_process/actions/inpaint.py +++ b/generator_process/actions/inpaint.py @@ -173,6 +173,10 @@ def __call__( # 10. 
Run safety checker # image, has_nsfw_concept = self.run_safety_checker(image, device, text_embeddings.dtype) + # Offload last model to CPU + if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: + self.final_offload_hook.offload() + # NOTE: Modified to yield the decoded image as a numpy array. yield ImageGenerationResult( [np.asarray(ImageOps.flip(image).convert('RGBA'), dtype=np.float32) / 255. diff --git a/generator_process/actions/prompt_to_image.py b/generator_process/actions/prompt_to_image.py index fb5096aa..0aa50f63 100644 --- a/generator_process/actions/prompt_to_image.py +++ b/generator_process/actions/prompt_to_image.py @@ -45,7 +45,7 @@ def load_pipe(self, action, generator_pipeline, model, optimizations, scheduler, invalidation_properties = ( action, model, device, - optimizations.can_use("sequential_cpu_offload", device), + optimizations.can_use_cpu_offload(device), optimizations.can_use("half_precision", device), ) cached_pipe: CachedPipeline = self._cached_pipe if hasattr(self, "_cached_pipe") else None @@ -65,7 +65,7 @@ def load_pipe(self, action, generator_pipeline, model, optimizations, scheduler, revision=revision, torch_dtype=torch.float16 if optimizations.can_use_half(device) else torch.float32, ) - if not optimizations.can_use("sequential_cpu_offload", device): + if optimizations.can_use_cpu_offload(device) == "off": pipe = pipe.to(device) setattr(self, "_cached_pipe", CachedPipeline(pipe, invalidation_properties, snapshot_folder)) cached_pipe = self._cached_pipe @@ -149,7 +149,7 @@ class Optimizations: tf32: Annotated[bool, "cuda"] = False amp: Annotated[bool, "cuda"] = False half_precision: Annotated[bool, {"cuda", "privateuseone"}] = True - sequential_cpu_offload: Annotated[bool, {"cuda", "privateuseone"}] = False + cpu_offload: Annotated[str, {"cuda", "privateuseone"}] = "off" channels_last_memory_format: bool = False xformers_attention: Annotated[bool, "cuda"] = False batch_size: int = 1 @@ -185,6 +185,9 @@ def can_use_half(self, device): name = torch.cuda.get_device_name() return not ("GTX 1650" in name or "GTX 1660" in name) return self.can_use("half_precision", device) + + def can_use_cpu_offload(self, device): + return self.cpu_offload if self.device_supports("cpu_offload", device) else "off" def apply(self, pipeline, device): """ @@ -205,10 +208,24 @@ def apply(self, pipeline, device): except: pass try: - if self.can_use("sequential_cpu_offload", device) and device in ["cuda", "privateuseone"]: - # Doesn't allow for selecting execution device - # pipeline.enable_sequential_cpu_offload() - + if pipeline.device != pipeline._execution_device: + pass # pipeline is already offloaded, offloading again can cause `pipeline._execution_device` to be incorrect + elif self.can_use_cpu_offload(device) == "model": + # adapted from diffusers.StableDiffusionPipeline.enable_model_cpu_offload() to allow DirectML device and unimplemented pipelines + from accelerate import cpu_offload_with_hook + + hook = None + for cpu_offloaded_model in [pipeline.text_encoder, pipeline.unet, pipeline.vae]: + _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) + + # FIXME: due to the safety checker not running it prevents the VAE from being offloaded, uncomment when safety checker is enabled + # if pipeline.safety_checker is not None: + # _, hook = cpu_offload_with_hook(pipeline.safety_checker, device, prev_module_hook=hook) + + # We'll offload the last model manually. 
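The hook chain being assembled here is the core of the "model" offload level. Roughly, as a standalone sketch (the wrapper name is illustrative; this mirrors what diffusers' enable_model_cpu_offload() does internally, re-implemented above so a DirectML device can be passed and unimplemented pipelines still work):

import torch
from accelerate import cpu_offload_with_hook

def enable_model_offload(pipeline, device="cuda"):  # hypothetical helper
    hook = None
    # each submodel is placed on `device` only while it runs; chaining the
    # hooks makes activating one submodel evict the previous one back to RAM
    for submodel in (pipeline.text_encoder, pipeline.unet, pipeline.vae):
        _, hook = cpu_offload_with_hook(submodel, torch.device(device), prev_module_hook=hook)
    # kept so the last submodel (the VAE) can be evicted after decoding
    pipeline.final_offload_hook = hook
    return pipeline

The safety checker stays out of the chain for now, as the FIXME notes: it never runs, so hooking it would leave the VAE stranded in device memory.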
+ pipeline.final_offload_hook = hook + elif self.can_use_cpu_offload(device) == "submodule": + # adapted from diffusers.StableDiffusionPipeline.enable_sequential_cpu_offload() to allow DirectML device and unimplemented pipelines from accelerate import cpu_offload for cpu_offloaded_model in [pipeline.unet, pipeline.text_encoder, pipeline.vae]: @@ -216,7 +233,7 @@ def apply(self, pipeline, device): cpu_offload(cpu_offloaded_model, device) if pipeline.safety_checker is not None: - cpu_offload(pipeline.safety_checker.vision_model, device) + cpu_offload(pipeline.safety_checker.vision_model, device, offload_buffers=True) except: pass try: @@ -527,6 +544,10 @@ def __call__( # 9. Run safety checker # image, has_nsfw_concept = self.run_safety_checker(image, device, text_embeddings.dtype) + # Offload last model to CPU + if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: + self.final_offload_hook.offload() + # NOTE: Modified to yield the decoded image as a numpy array. yield ImageGenerationResult( [np.asarray(ImageOps.flip(image).convert('RGBA'), dtype=np.float32) / 255. diff --git a/generator_process/actions/upscale.py b/generator_process/actions/upscale.py index dad4ebb0..bee60ce3 100644 --- a/generator_process/actions/upscale.py +++ b/generator_process/actions/upscale.py @@ -173,7 +173,7 @@ def upscale( pipe.scheduler = scheduler.create(pipe, None) # vae would automatically be made float32 within the pipeline, but it fails to convert after offloading is enabled pipe.vae.to(dtype=torch.float32) - if not optimizations.can_use("sequential_cpu_offload", device): + if optimizations.can_use_cpu_offload(device) == "off": pipe = pipe.to(device) pipe = optimizations.apply(pipe, device) @@ -195,6 +195,12 @@ def upscale( generator=generator.manual_seed(seed), guidance_scale=cfg_scale, ).images + + # not implemented in diffusers.StableDiffusionUpscalePipeline + # Offload last model to CPU + if hasattr(pipe, "final_offload_hook") and pipe.final_offload_hook is not None: + pipe.final_offload_hook.offload() + for id, tile in zip(ids, high_res_tiles): tiler[id] = np.array(tile) step = None diff --git a/property_groups/dream_prompt.py b/property_groups/dream_prompt.py index ddcdc75e..53bbe8ac 100644 --- a/property_groups/dream_prompt.py +++ b/property_groups/dream_prompt.py @@ -192,7 +192,11 @@ def optimization(optim, property=None, **kwargs): optimization("cudnn_benchmark", name="cuDNN Benchmark", description="Allows cuDNN to benchmark multiple convolution algorithms and select the fastest") optimization("tf32", name="TF32", description="Utilizes tensor cores on Ampere (RTX 30xx) or newer GPUs for matrix multiplications.\nHas no effect if half precision is enabled") optimization("half_precision", description="Reduces memory usage and increases speed in exchange for a slight loss in image quality.\nHas no effect if CPU only is enabled or using a GTX 16xx GPU") -optimization("sequential_cpu_offload", name="Sequential CPU Offload", description="Dynamically moves individual model weights in and out of device memory for reduced memory usage and a large speed penalty") +optimization("cpu_offload", property=EnumProperty, items=( + ("off", "Off", "", 0), + ("model", "Model", "Some memory savings with minimal speed penalty", 1), + ("submodule", "Submodule", "Better memory savings with large speed penalty", 2) +), default=0, name="CPU Offload", description="Dynamically moves models in and out of device memory for reduced memory usage with reduced speed") 
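For comparison, the "submodule" level corresponds to accelerate's sequential offload, which the pipeline code above re-implements for the same device-selection reason. A standalone sketch (wrapper name illustrative):

from accelerate import cpu_offload

def enable_submodule_offload(pipeline, device="cuda"):  # hypothetical helper
    # weights stay in system RAM and stream onto `device` one module at a
    # time during each forward pass: minimal VRAM use, large per-step slowdown
    for submodel in (pipeline.unet, pipeline.text_encoder, pipeline.vae):
        if submodel is not None:
            cpu_offload(submodel, device)
    return pipeline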
optimization("channels_last_memory_format", description="An alternative way of ordering NCHW tensors that may be faster or slower depending on the device") optimization("xformers_attention", name="xFormers Attention", description="Memory efficient attention that also often inscreases speed.\n" diff --git a/ui/panels/dream_texture.py b/ui/panels/dream_texture.py index 874415ce..38f45978 100644 --- a/ui/panels/dream_texture.py +++ b/ui/panels/dream_texture.py @@ -311,7 +311,7 @@ def optimization(prop): if prompt.optimizations_attention_slice_size_src == 'manual': slice_size_row.prop(prompt, "optimizations_attention_slice_size", text="Size") optimization("xformers_attention") - optimization("sequential_cpu_offload") + optimization("cpu_offload") optimization("cpu_only") optimization("vae_slicing") yield MemoryOptimizationPanel From 2a299b6cdd9e6aad4c022dd8597005082d147ac0 Mon Sep 17 00:00:00 2001 From: NullSenseStudio <47096043+NullSenseStudio@users.noreply.github.com> Date: Mon, 20 Feb 2023 22:23:55 -0500 Subject: [PATCH 07/13] try except xformers --- generator_process/actions/prompt_to_image.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/generator_process/actions/prompt_to_image.py b/generator_process/actions/prompt_to_image.py index 0aa50f63..a4066c6d 100644 --- a/generator_process/actions/prompt_to_image.py +++ b/generator_process/actions/prompt_to_image.py @@ -243,8 +243,13 @@ def apply(self, pipeline, device): pipeline.unet.to(memory_format=torch.contiguous_format) except: pass - if self.can_use("xformers_attention", device): - pipeline.enable_xformers_memory_efficient_attention() + try: + if self.can_use("xformers_attention", device): + pipeline.enable_xformers_memory_efficient_attention() + elif not self.can_use("attention_slicing", device): + # disabling xformers will also disable attention slicing + pipeline.disable_xformers_memory_efficient_attention() + except: pass try: if self.can_use("vae_slicing", device): @@ -320,7 +325,7 @@ def step_preview(pipe, mode, width, height, latents, generator, iteration): ) return ImageGenerationResult( [], - [seeds], + seeds, iteration, False ) From 07b00944a09e9ed5b83fecc3a1d06a0ad37f88d1 Mon Sep 17 00:00:00 2001 From: NullSenseStudio <47096043+NullSenseStudio@users.noreply.github.com> Date: Fri, 3 Mar 2023 18:23:06 -0500 Subject: [PATCH 08/13] vae tiling --- .../actions/detect_seamless/__init__.py | 12 + generator_process/actions/prompt_to_image.py | 20 +- generator_process/actions/upscale.py | 128 +-------- generator_process/models/upscale_tiler.py | 255 ++++++++++++++++++ property_groups/dream_prompt.py | 8 + ui/panels/dream_texture.py | 14 +- 6 files changed, 306 insertions(+), 131 deletions(-) create mode 100644 generator_process/models/upscale_tiler.py diff --git a/generator_process/actions/detect_seamless/__init__.py b/generator_process/actions/detect_seamless/__init__.py index 874fe385..221e55d5 100644 --- a/generator_process/actions/detect_seamless/__init__.py +++ b/generator_process/actions/detect_seamless/__init__.py @@ -41,6 +41,18 @@ def __eq__(self, other): return True return False + def __and__(self, other): + return SeamlessAxes((self.x and other.x, self.y and other.y)) + + def __or__(self, other): + return SeamlessAxes((self.x or other.x, self.y or other.y)) + + def __xor__(self, other): + return SeamlessAxes((self.x != other.x, self.y != other.y)) + + def __invert__(self): + return SeamlessAxes((not self.x, not self.y)) + @classmethod def _missing_(cls, value): if isinstance(value, str): diff 
--git a/generator_process/actions/prompt_to_image.py b/generator_process/actions/prompt_to_image.py index a4066c6d..72445d87 100644 --- a/generator_process/actions/prompt_to_image.py +++ b/generator_process/actions/prompt_to_image.py @@ -1,5 +1,6 @@ from typing import Annotated, Union, _AnnotatedAlias, Generator, Callable, List, Optional, Any import enum +import functools import math import os import sys @@ -10,7 +11,7 @@ import numpy as np import random from .detect_seamless import SeamlessAxes -from ...absolute_path import absolute_path +from ..models.upscale_tiler import tiled_decode_latents from ..models import Pipeline @@ -154,6 +155,9 @@ class Optimizations: xformers_attention: Annotated[bool, "cuda"] = False batch_size: int = 1 vae_slicing: bool = True + vae_tiling: str = "off" + vae_tile_size: int = 512 + vae_tile_blend: int = 64 cpu_only: bool = False @@ -259,6 +263,15 @@ def apply(self, pipeline, device): else: pipeline.vae.disable_slicing() except: pass + + try: + if self.vae_tiling != "off": + if not isinstance(pipeline.decode_latents, functools.partial): + pipeline.decode_latents = functools.partial(tiled_decode_latents.__get__(pipeline), pre_patch=pipeline.decode_latents) + pipeline.decode_latents.keywords['optimizations'] = self + elif self.vae_tiling == "off" and isinstance(pipeline.decode_latents, functools.partial): + pipeline.decode_latents = pipeline.decode_latents.keywords["pre_patch"] + except: pass from .. import directml_patches if device == "privateuseone": @@ -672,6 +685,11 @@ def _configure_model_padding(model, seamless_axes): Modifies the 2D convolution layers to use a circular padding mode based on the `seamless` and `seamless_axes` options. """ seamless_axes = SeamlessAxes(seamless_axes) + if seamless_axes == SeamlessAxes.AUTO: + seamless_axes = seamless_axes.OFF + if getattr(model, "seamless_axes", SeamlessAxes.OFF) == seamless_axes: + return + model.seamless_axes = seamless_axes for m in model.modules(): if isinstance(m, (nn.Conv2d, nn.ConvTranspose2d)): if seamless_axes.x or seamless_axes.y: diff --git a/generator_process/actions/upscale.py b/generator_process/actions/upscale.py index bee60ce3..e44f1d36 100644 --- a/generator_process/actions/upscale.py +++ b/generator_process/actions/upscale.py @@ -1,10 +1,10 @@ -import math import numpy as np -from .prompt_to_image import Optimizations, Scheduler, StepPreviewMode +from .prompt_to_image import Optimizations, Scheduler, StepPreviewMode, _configure_model_padding from .detect_seamless import SeamlessAxes import random from dataclasses import dataclass from numpy.typing import NDArray +from ..models.upscale_tiler import UpscaleTiler @dataclass class ImageUpscaleResult: @@ -14,128 +14,6 @@ class ImageUpscaleResult: final: bool -class UpscaleTiler: - def __init__(self, image: NDArray, scale: int, tile_size: int, blend: int, seamless_axes: SeamlessAxes): - self.image = image - self.scale = scale - self.tile_size = tile_size - self.blend = blend - seamless_axes = SeamlessAxes(seamless_axes) - self.x_tiles = self.axis_tiles(image.shape[1], tile_size, blend, seamless_axes.x) - self.y_tiles = self.axis_tiles(image.shape[0], tile_size, blend, seamless_axes.y) - # combined image with last channel containing pixel weights - self.canvas = np.zeros((image.shape[0] * scale, image.shape[1] * scale, image.shape[2] + 1), dtype=np.float32) - - scaled_tile_size = tile_size * scale - weight_gradient = [min(i + 1, scaled_tile_size - i) for i in range(scaled_tile_size)] - tile_weight = np.zeros((scaled_tile_size, scaled_tile_size), 
dtype=np.float32) - tile_weight[:] = weight_gradient - # determines how much each pixel in a blended area influences the final color, basically a pyramid - self.tile_weight = np.minimum(tile_weight, np.reshape(weight_gradient, (scaled_tile_size, 1))) - - @staticmethod - def axis_tiles(axis_size: int, tile_size: int, blend: int, seamless: bool) -> list[int]: - """ - Returns a list of values where each tile starts on an axis. - Blend is only guaranteed as a minimum and may vary by a pixel between tiles. - """ - if seamless: - count = math.ceil(axis_size / (tile_size - blend)) - blend_balance = math.ceil(tile_size - axis_size / count) - final = axis_size - tile_size + blend_balance - else: - count = math.ceil((axis_size - tile_size) / (tile_size - blend)) + 1 - final = axis_size - tile_size - if count == 1: - return [0] - return [i * final // (count - 1) for i in range(count)] - - def combined(self) -> NDArray: - return self.canvas[:, :, :-1] - - def index_to_xy(self, index: int): - key_y = index % len(self.y_tiles) - key_x = (index - key_y) // len(self.y_tiles) - return key_x, key_y - - def __getitem__(self, key: int | tuple[int, int]) -> NDArray: - if isinstance(key, int): - key = self.index_to_xy(key) - image = self.image - tile_size = self.tile_size - x0 = self.x_tiles[key[0]] - x1 = x0 + tile_size - x2 = image.shape[1] - x0 - y0 = self.y_tiles[key[1]] - y1 = y0 + tile_size - y2 = image.shape[0] - y0 - if x2 >= tile_size and y2 >= tile_size: - return image[y0:y1, x0:x1] - # seamless axis wrapping - tile = np.empty((tile_size, tile_size, image.shape[2]), dtype=self.image.dtype) - if x2 < tile_size: - if y2 < tile_size: - # wrap bottom/right to top/left - tile[:y2, :x2] = image[y0:, x0:] - tile[y2:, :x2] = image[:tile_size - y2, x0:] - tile[:y2, x2:] = image[y0:, :tile_size - x2] - tile[y2:, x2:] = image[:tile_size - y2, :tile_size - x2] - else: - # wrap right to left - tile[:, :x2] = image[y0:y1, x0:] - tile[:, x2:] = image[y0:y1, :tile_size - x2] - else: - # wrap bottom to top - tile[:y2] = image[y0:, x0:x1] - tile[y2:] = image[:tile_size - y2, x0:x1] - return tile - - def __setitem__(self, key: int | tuple[int, int], tile: NDArray): - if isinstance(key, int): - key = self.index_to_xy(key) - canvas = self.canvas - scale = self.scale - tile_size = self.tile_size * scale - tile_weight = self.tile_weight - x0 = self.x_tiles[key[0]] * scale - x1 = x0 + tile_size - x2 = canvas.shape[1] - x0 - y0 = self.y_tiles[key[1]] * scale - y1 = y0 + tile_size - y2 = canvas.shape[0] - y0 - - def update(canvas_slice, tile_slice, weight_slice): - weight_slice = weight_slice.reshape(weight_slice.shape[0], weight_slice.shape[1], 1) - # undo weighted average, then add new tile with its weights applied and average again - canvas_slice[:, :, :-1] *= canvas_slice[:, :, -1:] - canvas_slice[:, :, :-1] += tile_slice * weight_slice - canvas_slice[:, :, -1:] += weight_slice - canvas_slice[:, :, :-1] /= canvas_slice[:, :, -1:] - - if x2 >= tile_size and y2 >= tile_size: - update(canvas[y0:y1, x0:x1], tile, tile_weight) - elif x2 < tile_size: - if y2 < tile_size: - update(canvas[y0:, x0:], tile[:y2, :x2], tile_weight[:y2, :x2]) - update(canvas[:tile_size - y2, x0:], tile[y2:, :x2], tile_weight[y2:, :x2]) - update(canvas[y0:, :tile_size - x2], tile[:y2, x2:], tile_weight[:y2, x2:]) - update(canvas[:tile_size - y2, :tile_size - x2], tile[y2:, x2:], tile_weight[y2:, x2:]) - else: - update(canvas[y0:y1, x0:], tile[:, :x2], tile_weight[:, :x2]) - update(canvas[y0:y1, :tile_size - x2], tile[:, x2:], tile_weight[:, x2:]) - 
else: - update(canvas[y0:, x0:x1], tile[:y2], tile_weight[:y2]) - update(canvas[:tile_size - y2, x0:x1], tile[y2:], tile_weight[y2:]) - - def __iter__(self): - for x in range(len(self.x_tiles)): - for y in range(len(self.y_tiles)): - yield (x, y), self[x, y] - - def __len__(self): - return len(self.x_tiles) * len(self.y_tiles) - - def upscale( self, image: NDArray, @@ -184,6 +62,8 @@ def upscale( if image.shape[2] == 4: image = image[:, :, :3] tiler = UpscaleTiler(image, 4, tile_size, blend, seamless_axes) + _configure_model_padding(pipe.unet, seamless_axes & ~tiler.seamless_axes) + _configure_model_padding(pipe.vae, seamless_axes & ~tiler.seamless_axes) for i in range(0, len(tiler), optimizations.batch_size): batch_size = min(len(tiler)-i, optimizations.batch_size) ids = list(range(i, i+batch_size)) diff --git a/generator_process/models/upscale_tiler.py b/generator_process/models/upscale_tiler.py new file mode 100644 index 00000000..ef34d4be --- /dev/null +++ b/generator_process/models/upscale_tiler.py @@ -0,0 +1,255 @@ +import math +from typing import Optional + +import numpy as np +from ..actions.detect_seamless import SeamlessAxes +from numpy.typing import NDArray + + +class UpscaleTiler: + def __init__( + self, + image: NDArray, + scale: int, + tile_size: int | tuple[int, int], + blend: int | tuple[int, int], + seamless_axes: SeamlessAxes, + defer_seamless: bool = True, + out_channels: Optional[int] = None + ): + height, width = image.shape[:2] + if scale < 1: + raise ValueError("scale must be 1 or higher") + if isinstance(tile_size, int): + tile_size = (tile_size, tile_size) + if tile_size[0] <= 0 or tile_size[1] <= 0: + raise ValueError("tile size must be 1 or higher") + if isinstance(blend, int): + blend = (blend, blend) + if blend[0] < 0 or blend[1] < 0: + raise ValueError("blend must be 0 or higher") + seamless_axes = SeamlessAxes(seamless_axes) + if defer_seamless: + # Seamless handling may be deferred to upscaler model or VAE rather than using larger or multiple tiles + seamless_axes = SeamlessAxes((seamless_axes.x and width > tile_size[0], seamless_axes.y and height > tile_size[1])) + max_width = width*2 if seamless_axes.x else width + max_height = height*2 if seamless_axes.y else height + tile_size = (min(tile_size[0], max_width), min(tile_size[1], max_height)) + blend = (min(blend[0], math.ceil(tile_size[0]/2)), min(blend[1], math.ceil(tile_size[1]/2))) + self.image = image + self.scale = scale + self.tile_size = tile_size + self.blend = blend + self.seamless_axes = seamless_axes + self.x_tiles = self.axis_tiles(width, tile_size[0], blend[0], seamless_axes.x) + self.y_tiles = self.axis_tiles(height, tile_size[1], blend[1], seamless_axes.y) + if out_channels is None: + out_channels = image.shape[2] + # combined image with last channel containing pixel weights + self.canvas = np.zeros((image.shape[0] * scale, image.shape[1] * scale, out_channels + 1), dtype=np.float32) + + scaled_tile_size = (tile_size[0] * scale, tile_size[1] * scale) + weight_gradient_y = [min(i + 1, scaled_tile_size[1] - i) for i in range(scaled_tile_size[1])] + weight_gradient_x = [min(i + 1, scaled_tile_size[0] - i) for i in range(scaled_tile_size[0])] + tile_weight = np.zeros(scaled_tile_size, dtype=np.float32) + tile_weight[:] = weight_gradient_y + # determines how much each pixel in a blended area influences the final color, basically a pyramid + self.tile_weight = np.minimum(tile_weight, np.reshape(weight_gradient_x, (scaled_tile_size[0], 1))) + + @staticmethod + def axis_tiles(axis_size: int, 
tile_size: int, blend: int, seamless: bool) -> list[int]: + """ + Returns a list of values where each tile starts on an axis. + Blend is only guaranteed as a minimum and may vary by a pixel between tiles. + """ + if seamless: + count = math.ceil(axis_size / (tile_size - blend)) + blend_balance = math.ceil(tile_size - axis_size / count) + final = min(axis_size - tile_size + blend_balance, axis_size * 2 - tile_size) + else: + count = math.ceil((axis_size - tile_size) / (tile_size - blend)) + 1 + final = axis_size - tile_size + if count == 1: + return [0] + return [i * final // (count - 1) for i in range(count)] + + def combined(self) -> NDArray: + return self.canvas[:, :, :-1] + + def index_to_xy(self, index: int): + key_y = index % len(self.y_tiles) + key_x = (index - key_y) // len(self.y_tiles) + return key_x, key_y + + def __getitem__(self, key: int | tuple[int, int]) -> NDArray: + if isinstance(key, int): + key = self.index_to_xy(key) + image = self.image + tile_size = self.tile_size + x0 = self.x_tiles[key[0]] + x1 = x0 + tile_size[0] + x2 = image.shape[1] - x0 + y0 = self.y_tiles[key[1]] + y1 = y0 + tile_size[1] + y2 = image.shape[0] - y0 + if x2 >= tile_size[0] and y2 >= tile_size[1]: + return image[y0:y1, x0:x1] + # seamless axis wrapping + if isinstance(image, np.ndarray): + tile = np.empty((tile_size[0], tile_size[1], image.shape[2]), dtype=image.dtype) + else: + import torch + tile = torch.empty((tile_size[0], tile_size[1], image.shape[2]), dtype=image.dtype, device=image.device) + if x2 < tile_size[0]: + if y2 < tile_size[1]: + # wrap bottom/right to top/left + tile[:y2, :x2] = image[y0:, x0:] + tile[y2:, :x2] = image[:tile_size[1] - y2, x0:] + tile[:y2, x2:] = image[y0:, :tile_size[0] - x2] + tile[y2:, x2:] = image[:tile_size[1] - y2, :tile_size[0] - x2] + else: + # wrap right to left + tile[:, :x2] = image[y0:y1, x0:] + tile[:, x2:] = image[y0:y1, :tile_size[0] - x2] + else: + # wrap bottom to top + tile[:y2] = image[y0:, x0:x1] + tile[y2:] = image[:tile_size[1] - y2, x0:x1] + return tile + + def __setitem__(self, key: int | tuple[int, int], tile: NDArray): + if isinstance(key, int): + key = self.index_to_xy(key) + canvas = self.canvas + scale = self.scale + tile_size = (self.tile_size[0] * scale, self.tile_size[1] * scale) + tile_weight = self.tile_weight + x0 = self.x_tiles[key[0]] * scale + x1 = x0 + tile_size[0] + x2 = canvas.shape[1] - x0 + y0 = self.y_tiles[key[1]] * scale + y1 = y0 + tile_size[1] + y2 = canvas.shape[0] - y0 + + def update(canvas_slice, tile_slice, weight_slice): + weight_slice = weight_slice.reshape(weight_slice.shape[0], weight_slice.shape[1], 1) + # undo weighted average, then add new tile with its weights applied and average again + canvas_slice[:, :, :-1] *= canvas_slice[:, :, -1:] + canvas_slice[:, :, :-1] += tile_slice * weight_slice + canvas_slice[:, :, -1:] += weight_slice + canvas_slice[:, :, :-1] /= canvas_slice[:, :, -1:] + + if x2 >= tile_size[0] and y2 >= tile_size[1]: + update(canvas[y0:y1, x0:x1], tile, tile_weight) + elif x2 < tile_size[0]: + if y2 < tile_size[1]: + update(canvas[y0:, x0:], tile[:y2, :x2], tile_weight[:y2, :x2]) + update(canvas[:tile_size[1] - y2, x0:], tile[y2:, :x2], tile_weight[y2:, :x2]) + update(canvas[y0:, :tile_size[0] - x2], tile[:y2, x2:], tile_weight[:y2, x2:]) + update(canvas[:tile_size[1] - y2, :tile_size[0] - x2], tile[y2:, x2:], tile_weight[y2:, x2:]) + else: + update(canvas[y0:y1, x0:], tile[:, :x2], tile_weight[:, :x2]) + update(canvas[y0:y1, :tile_size[0] - x2], tile[:, x2:], tile_weight[:, x2:]) + else: 
+ update(canvas[y0:, x0:x1], tile[:y2], tile_weight[:y2]) + update(canvas[:tile_size[1] - y2, x0:x1], tile[y2:], tile_weight[y2:]) + + def __iter__(self): + for x in range(len(self.x_tiles)): + for y in range(len(self.y_tiles)): + yield (x, y), self[x, y] + + def __len__(self): + return len(self.x_tiles) * len(self.y_tiles) + + +def tiled_decode_latents(self, latents, *, pre_patch, optimizations): + # not all pipelines (namely upscale) have the vae_scale_factor attribute + vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + default_size = self.unet.config.sample_size * vae_scale_factor + match optimizations.vae_tiling: + case "full": + tile_size = default_size + blend = math.ceil(tile_size / 8) + case "half": + tile_size = math.ceil(default_size / 2) + blend = math.ceil(tile_size / 8) + case "manual": + tile_size = optimizations.vae_tile_size + blend = optimizations.vae_tile_blend + case _: + return pre_patch(latents) + + seamless_axes = getattr(self.vae, "seamless_axes", SeamlessAxes.OFF) + + images = [] + for image_latents in latents.split(1, dim=0): + tiler = UpscaleTiler( + image_latents.squeeze(0).permute(1, 2, 0), + vae_scale_factor, + math.ceil(tile_size / vae_scale_factor), + math.ceil(blend / vae_scale_factor), + seamless_axes, + out_channels=self.vae.config.out_channels + ) + + configure_model_padding(self.vae, seamless_axes & ~tiler.seamless_axes) + + for id, tile in tiler: + tiler[id] = pre_patch(tile.permute(2, 0, 1).unsqueeze(0)).squeeze(0) + images.append(np.expand_dims(tiler.combined(), 0)) + configure_model_padding(self.vae, seamless_axes) + return np.concatenate(images) + +def configure_model_padding(model, seamless_axes): + import torch.nn as nn + """ + Modifies the 2D convolution layers to use a circular padding mode based on the `seamless_axes` option. + """ + seamless_axes = SeamlessAxes(seamless_axes) + if seamless_axes == SeamlessAxes.AUTO: + seamless_axes = seamless_axes.OFF + if getattr(model, "seamless_axes", SeamlessAxes.OFF) == seamless_axes: + return + model.seamless_axes = seamless_axes + for m in model.modules(): + if isinstance(m, (nn.Conv2d, nn.ConvTranspose2d)): + if seamless_axes.x or seamless_axes.y: + m.asymmetric_padding_mode = ( + 'circular' if seamless_axes.x else 'constant', + 'circular' if seamless_axes.y else 'constant' + ) + m.asymmetric_padding = ( + (m._reversed_padding_repeated_twice[0], m._reversed_padding_repeated_twice[1], 0, 0), + (0, 0, m._reversed_padding_repeated_twice[2], m._reversed_padding_repeated_twice[3]) + ) + m._conv_forward = _conv_forward_asymmetric.__get__(m, nn.Conv2d) + else: + m._conv_forward = nn.Conv2d._conv_forward.__get__(m, nn.Conv2d) + if hasattr(m, 'asymmetric_padding_mode'): + del m.asymmetric_padding_mode + if hasattr(m, 'asymmetric_padding'): + del m.asymmetric_padding + +def _conv_forward_asymmetric(self, input, weight, bias): + import torch.nn as nn + """ + Patch for Conv2d._conv_forward that supports asymmetric padding + """ + if input.device.type == "privateuseone": + # DML pad() will wrongly fill the tensor in constant mode with the supplied value + # (default 0) when padding on both ends of a dimension, can't split to two calls. 
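For intuition, the per-axis padding this function performs reduces, on non-DirectML devices, to the two pad calls that follow below: circular padding on the seamless axis, zero padding on the other, then a convolution with no implicit padding. A toy sketch (shapes illustrative):

import torch
import torch.nn.functional as F

x = torch.arange(16.0).reshape(1, 1, 4, 4)
x = F.pad(x, (1, 1, 0, 0), mode="circular")  # wrap left/right edges (seamless in x)
x = F.pad(x, (0, 0, 1, 1), mode="constant")  # zero-pad top/bottom
# a Conv2d with padding=0 applied to `x` now behaves seamlessly along x only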
+        working = nn.functional.pad(input, self._reversed_padding_repeated_twice, mode='circular')
+        pad_w0, pad_w1, pad_h0, pad_h1 = self._reversed_padding_repeated_twice
+        if self.asymmetric_padding_mode[0] == 'constant':
+            working[:, :, :, :pad_w0] = 0
+            if pad_w1 > 0:
+                working[:, :, :, -pad_w1:] = 0
+        if self.asymmetric_padding_mode[1] == 'constant':
+            working[:, :, :pad_h0] = 0
+            if pad_h1 > 0:
+                working[:, :, -pad_h1:] = 0
+    else:
+        working = nn.functional.pad(input, self.asymmetric_padding[0], mode=self.asymmetric_padding_mode[0])
+        working = nn.functional.pad(working, self.asymmetric_padding[1], mode=self.asymmetric_padding_mode[1])
+    return nn.functional.conv2d(working, weight, bias, self.stride, nn.modules.utils._pair(0), self.dilation, self.groups)
diff --git a/property_groups/dream_prompt.py b/property_groups/dream_prompt.py
index 53bbe8ac..b81f6640 100644
--- a/property_groups/dream_prompt.py
+++ b/property_groups/dream_prompt.py
@@ -204,6 +204,14 @@ def optimization(optim, property=None, **kwargs):
         "Prompt recall may not produce the same image")
 optimization("batch_size", default=1, min=1, description="Improves speed when using iterations or upscaling in exchange for higher memory usage.\nHighly recommended to use with VAE slicing enabled")
 optimization("vae_slicing", name="VAE Slicing", description="Reduces memory usage of batched VAE decoding. Has no effect if batch size is 1.\nMay have a small performance improvement with large batches")
+optimization("vae_tiling", property=EnumProperty, items=(
+    ("off", "Off", "", 0),
+    ("half", "Half", "Uses tiles of half the selected model's default size. Likely to cause noticeably inaccurate colors", 1),
+    ("full", "Full", "Uses tiles of the selected model's default size, intended for use where image size is manually set higher. May cause slightly inaccurate colors", 2),
+    ("manual", "Manual", "", 3)
+), default=0, name="VAE Tiling", description="Decodes generated images in tiled regions to reduce memory usage in exchange for longer decode time and less accurate colors.\nCan allow for generating larger images that would otherwise run out of memory on the final step")
+optimization("vae_tile_size", min=1, name="VAE Tile Size", description="Width and height of each tile. Smaller sizes are more likely to cause inaccurate colors and other undesired artifacts")
+optimization("vae_tile_blend", min=0, name="VAE Tile Blend", description="Minimum overlap between each tile and its adjacent tiles. Larger overlaps blend more smoothly and reduce visible seams")
 optimization("cpu_only", name="CPU Only", description="Disables GPU acceleration and is extremely slow")
 
 def map_structure_token_items(value):
diff --git a/ui/panels/dream_texture.py b/ui/panels/dream_texture.py
index 38f45978..87165e70 100644
--- a/ui/panels/dream_texture.py
+++ b/ui/panels/dream_texture.py
@@ -276,9 +276,8 @@ def draw(self, context):
 
         inferred_device = Optimizations.infer_device()
         def optimization(prop):
-            if hasattr(prompt, f"optimizations_{prop}"):
-                if Optimizations.device_supports(prop, inferred_device):
-                    layout.prop(prompt, f"optimizations_{prop}")
+            if Optimizations.device_supports(prop, inferred_device):
+                layout.prop(prompt, f"optimizations_{prop}")
 
         optimization("cudnn_benchmark")
         optimization("tf32")
@@ -301,9 +300,8 @@ def draw(self, context):
 
         inferred_device = Optimizations.infer_device()
         def optimization(prop):
-            if hasattr(prompt, f"optimizations_{prop}"):
-                if Optimizations.device_supports(prop, inferred_device):
-                    layout.prop(prompt, f"optimizations_{prop}")
+            if Optimizations.device_supports(prop, inferred_device):
+                layout.prop(prompt, f"optimizations_{prop}")
 
         optimization("attention_slicing")
         slice_size_row = layout.row()
@@ -314,6 +312,10 @@ def optimization(prop):
         optimization("cpu_offload")
         optimization("cpu_only")
         optimization("vae_slicing")
+        optimization("vae_tiling")
+        if prompt.optimizations_vae_tiling == "manual":
+            optimization("vae_tile_size")
+            optimization("vae_tile_blend")
 
     yield MemoryOptimizationPanel
 def actions_panel(sub_panel, space_type, get_prompt):

From ad0a270ac44433b75c3bac92a51aadbaeaf47a1c Mon Sep 17 00:00:00 2001
From: NullSenseStudio <47096043+NullSenseStudio@users.noreply.github.com>
Date: Fri, 3 Mar 2023 18:28:17 -0500
Subject: [PATCH 09/13] filter optimizations for cpu only

---
 ui/panels/dream_texture.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/ui/panels/dream_texture.py b/ui/panels/dream_texture.py
index 87165e70..684fb2b8 100644
--- a/ui/panels/dream_texture.py
+++ b/ui/panels/dream_texture.py
@@ -275,6 +275,8 @@ def draw(self, context):
         prompt = get_prompt(context)
 
         inferred_device = Optimizations.infer_device()
+        if prompt.optimizations_cpu_only:
+            inferred_device = "cpu"
         def optimization(prop):
             if Optimizations.device_supports(prop, inferred_device):
                 layout.prop(prompt, f"optimizations_{prop}")
@@ -299,6 +301,8 @@ def draw(self, context):
         prompt = get_prompt(context)
 
         inferred_device = Optimizations.infer_device()
+        if prompt.optimizations_cpu_only:
+            inferred_device = "cpu"
         def optimization(prop):
             if Optimizations.device_supports(prop, inferred_device):
                 layout.prop(prompt, f"optimizations_{prop}")

From 35f1b27b9efd0c002b57048cae6fd55ac9cc14eb Mon Sep 17 00:00:00 2001
From: NullSenseStudio <47096043+NullSenseStudio@users.noreply.github.com>
Date: Fri, 10 Mar 2023 19:37:36 -0500
Subject: [PATCH 10/13] support triton, silence warning

---
 __init__.py                              | 12 +++++++++--
 generator_process/actor.py               |  6 ++++++
 requirements/linux-cuda.txt              | 20 +++++++++++++++++++
 .../{win-linux-cuda.txt => win-cuda.txt} |  0
 4 files changed, 36 insertions(+), 2 deletions(-)
 create mode 100644 requirements/linux-cuda.txt
 rename requirements/{win-linux-cuda.txt => win-cuda.txt} (100%)

diff --git a/__init__.py b/__init__.py
index 7f906cff..db319ac9 100644
--- a/__init__.py
+++ b/__init__.py
@@ -49,7 +49,8 @@ def clear_modules():
 from .ui.presets import register_default_presets
 
 requirements_path_items = (
-    ('requirements/win-linux-cuda.txt', 'Linux/Windows (CUDA)', 'Linux or Windows with NVIDIA GPU'),
+    ('requirements/win-cuda.txt', 'Windows (CUDA)', 'Windows with NVIDIA GPU'),
+    ('requirements/linux-cuda.txt', 'Linux (CUDA)', 'Linux with NVIDIA GPU'),
     ('requirements/mac-mps-cpu.txt', 'Apple Silicon', 'Apple M1/M2'),
     ('requirements/linux-rocm.txt', 'Linux (AMD)', 'Linux with AMD GPU'),
     ('requirements/win-dml.txt', 'Windows (DirectML)', 'Windows with DirectX 12 GPU'),
@@ -63,7 +64,14 @@ def register():
     if hasattr(bpy.types, dt_op.idname()): # objects under bpy.ops are created on the fly, have to check that it actually exists a little differently
         raise RuntimeError("Another instance of Dream Textures is already running.")
 
-    bpy.types.Scene.dream_textures_requirements_path = EnumProperty(name="Platform", items=requirements_path_items, description="Specifies which set of dependencies to install", default='requirements/mac-mps-cpu.txt' if sys.platform == 'darwin' else 'requirements/win-linux-cuda.txt')
+    match sys.platform:
+        case 'darwin':
+            default_req = 'requirements/mac-mps-cpu.txt'
+        case 'win32':
+            default_req = 'requirements/win-cuda.txt'
+        case _:
+            default_req = 'requirements/linux-cuda.txt'
+    bpy.types.Scene.dream_textures_requirements_path = EnumProperty(name="Platform", items=requirements_path_items, description="Specifies which set of dependencies to install", default=default_req)
 
     for cls in PREFERENCE_CLASSES:
         bpy.utils.register_class(cls)
diff --git a/generator_process/actor.py b/generator_process/actor.py
index 2c9d0923..1b62c551 100644
--- a/generator_process/actor.py
+++ b/generator_process/actor.py
@@ -12,6 +12,12 @@ def _load_dependencies():
     site.addsitedir(absolute_path(".python_dependencies"))
     deps = sys.path.pop(-1)
     sys.path.insert(0, deps)
+
+    if sys.platform == "win32":
+        import logging
+        logging.getLogger("xformers").addFilter(
+            lambda record: not record.msg.startswith("A matching Triton is not available"))
+
 
 if current_process().name == "__actor__":
     _load_dependencies()
diff --git a/requirements/linux-cuda.txt b/requirements/linux-cuda.txt
new file mode 100644
index 00000000..ba22c0b4
--- /dev/null
+++ b/requirements/linux-cuda.txt
@@ -0,0 +1,20 @@
+git+https://github.com/huggingface/diffusers@main#egg=diffusers
+transformers
+accelerate
+huggingface_hub
+
+--extra-index-url https://download.pytorch.org/whl/cu117
+torch>=1.13
+
+# Original SD checkpoint conversion
+pytorch-lightning
+tensorboard
+
+scipy # LMSDiscreteScheduler
+
+stability-sdk # DreamStudio
+
+opencolorio==2.1.2 # color management
+
+xformers # memory efficient attention
+triton # xformers extra
diff --git a/requirements/win-linux-cuda.txt b/requirements/win-cuda.txt
similarity index 100%
rename from requirements/win-linux-cuda.txt
rename to requirements/win-cuda.txt

From a3f04db214dcdb2a46f63e561fe447a62a573139 Mon Sep 17 00:00:00 2001
From: NullSenseStudio <47096043+NullSenseStudio@users.noreply.github.com>
Date: Thu, 6 Apr 2023 17:51:08 -0400
Subject: [PATCH 11/13] Revert "support triton, silence warning"

This reverts commit 35f1b27b9efd0c002b57048cae6fd55ac9cc14eb.
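The next patch in the series moves attention to PyTorch 2.0's built-in
scaled dot product implementation, which needs neither xFormers nor its
Triton extra, so the split requirements files and the Windows-only
warning filter are dropped with it. For reference, the reverted filter
relied on logging's callable-filter support; a minimal standalone sketch
of the technique (logger name and message prefix taken from the reverted
commit, getMessage() substituted for the raw msg attribute so formatted
arguments are also covered):

    import logging

    # A filter may be any callable taking a LogRecord; returning False
    # drops the record, so only the known Triton warning is suppressed.
    logging.getLogger("xformers").addFilter(
        lambda record: not record.getMessage().startswith(
            "A matching Triton is not available"))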
--- __init__.py | 12 ++--------- generator_process/actor.py | 6 ------ requirements/linux-cuda.txt | 20 ------------------- .../{win-cuda.txt => win-linux-cuda.txt} | 0 4 files changed, 2 insertions(+), 36 deletions(-) delete mode 100644 requirements/linux-cuda.txt rename requirements/{win-cuda.txt => win-linux-cuda.txt} (100%) diff --git a/__init__.py b/__init__.py index db319ac9..7f906cff 100644 --- a/__init__.py +++ b/__init__.py @@ -49,8 +49,7 @@ def clear_modules(): from .ui.presets import register_default_presets requirements_path_items = ( - ('requirements/win-cuda.txt', 'Windows (CUDA)', 'Windows with NVIDIA GPU'), - ('requirements/linux-cuda.txt', 'Linux (CUDA)', 'Linux with NVIDIA GPU'), + ('requirements/win-linux-cuda.txt', 'Linux/Windows (CUDA)', 'Linux or Windows with NVIDIA GPU'), ('requirements/mac-mps-cpu.txt', 'Apple Silicon', 'Apple M1/M2'), ('requirements/linux-rocm.txt', 'Linux (AMD)', 'Linux with AMD GPU'), ('requirements/win-dml.txt', 'Windows (DirectML)', 'Windows with DirectX 12 GPU'), @@ -64,14 +63,7 @@ def register(): if hasattr(bpy.types, dt_op.idname()): # objects under bpy.ops are created on the fly, have to check that it actually exists a little differently raise RuntimeError("Another instance of Dream Textures is already running.") - match sys.platform: - case 'darwin': - default_req = 'requirements/mac-mps-cpu.txt' - case 'win32': - default_req = 'requirements/win-cuda.txt' - case _: - default_req = 'requirements/linux-cuda.txt' - bpy.types.Scene.dream_textures_requirements_path = EnumProperty(name="Platform", items=requirements_path_items, description="Specifies which set of dependencies to install", default=default_req) + bpy.types.Scene.dream_textures_requirements_path = EnumProperty(name="Platform", items=requirements_path_items, description="Specifies which set of dependencies to install", default='requirements/mac-mps-cpu.txt' if sys.platform == 'darwin' else 'requirements/win-linux-cuda.txt') for cls in PREFERENCE_CLASSES: bpy.utils.register_class(cls) diff --git a/generator_process/actor.py b/generator_process/actor.py index 1b62c551..2c9d0923 100644 --- a/generator_process/actor.py +++ b/generator_process/actor.py @@ -12,12 +12,6 @@ def _load_dependencies(): site.addsitedir(absolute_path(".python_dependencies")) deps = sys.path.pop(-1) sys.path.insert(0, deps) - - if sys.platform == "win32": - import logging - logging.getLogger("xformers").addFilter( - lambda record: not record.msg.startswith("A matching Triton is not available")) - if current_process().name == "__actor__": _load_dependencies() diff --git a/requirements/linux-cuda.txt b/requirements/linux-cuda.txt deleted file mode 100644 index ba22c0b4..00000000 --- a/requirements/linux-cuda.txt +++ /dev/null @@ -1,20 +0,0 @@ -git+https://github.com/huggingface/diffusers@main#egg=diffusers -transformers -accelerate -huggingface_hub - ---extra-index-url https://download.pytorch.org/whl/cu117 -torch>=1.13 - -# Original SD checkpoint conversion -pytorch-lightning -tensorboard - -scipy # LMSDiscreteScheduler - -stability-sdk # DreamStudio - -opencolorio==2.1.2 # color management - -xformers # memory efficient attention -triton # xformers extra diff --git a/requirements/win-cuda.txt b/requirements/win-linux-cuda.txt similarity index 100% rename from requirements/win-cuda.txt rename to requirements/win-linux-cuda.txt From 29417ad834b3a979428307e66256170be55d5ffe Mon Sep 17 00:00:00 2001 From: NullSenseStudio <47096043+NullSenseStudio@users.noreply.github.com> Date: Thu, 6 Apr 2023 19:06:22 -0400 
Subject: [PATCH 12/13] sdp attention --- generator_process/actions/prompt_to_image.py | 17 ++++++----------- property_groups/dream_prompt.py | 8 ++++---- requirements/mac-mps-cpu.txt | 2 +- requirements/win-linux-cuda.txt | 6 ++---- ui/panels/dream_texture.py | 2 +- 5 files changed, 14 insertions(+), 21 deletions(-) diff --git a/generator_process/actions/prompt_to_image.py b/generator_process/actions/prompt_to_image.py index 72445d87..c1855e19 100644 --- a/generator_process/actions/prompt_to_image.py +++ b/generator_process/actions/prompt_to_image.py @@ -152,7 +152,7 @@ class Optimizations: half_precision: Annotated[bool, {"cuda", "privateuseone"}] = True cpu_offload: Annotated[str, {"cuda", "privateuseone"}] = "off" channels_last_memory_format: bool = False - xformers_attention: Annotated[bool, "cuda"] = False + sdp_attention: Annotated[bool, {"cpu", "cuda", "mps"}] = True batch_size: int = 1 vae_slicing: bool = True vae_tiling: str = "off" @@ -205,10 +205,13 @@ def apply(self, pipeline, device): torch.backends.cuda.matmul.allow_tf32 = self.can_use("tf32", device) try: - if self.can_use("attention_slicing", device): + if self.can_use("sdp_attention", device): + from diffusers.models.cross_attention import AttnProcessor2_0 + pipeline.unet.set_attn_processor(AttnProcessor2_0()) + elif self.can_use("attention_slicing", device): pipeline.enable_attention_slicing(self.attention_slice_size) else: - pipeline.disable_attention_slicing() + pipeline.disable_attention_slicing() # will also disable AttnProcessor2_0 except: pass try: @@ -247,14 +250,6 @@ def apply(self, pipeline, device): pipeline.unet.to(memory_format=torch.contiguous_format) except: pass - try: - if self.can_use("xformers_attention", device): - pipeline.enable_xformers_memory_efficient_attention() - elif not self.can_use("attention_slicing", device): - # disabling xformers will also disable attention slicing - pipeline.disable_xformers_memory_efficient_attention() - except: pass - try: if self.can_use("vae_slicing", device): # Not many pipelines implement the enable_vae_slicing()/disable_vae_slicing() diff --git a/property_groups/dream_prompt.py b/property_groups/dream_prompt.py index b81f6640..d78de981 100644 --- a/property_groups/dream_prompt.py +++ b/property_groups/dream_prompt.py @@ -198,10 +198,10 @@ def optimization(optim, property=None, **kwargs): ("submodule", "Submodule", "Better memory savings with large speed penalty", 2) ), default=0, name="CPU Offload", description="Dynamically moves models in and out of device memory for reduced memory usage with reduced speed") optimization("channels_last_memory_format", description="An alternative way of ordering NCHW tensors that may be faster or slower depending on the device") -optimization("xformers_attention", name="xFormers Attention", - description="Memory efficient attention that also often inscreases speed.\n" - "Requires a Pascal (GTX 10xx) or newer GPU. 
Overrides attention slicing.\n"
-        "Prompt recall may not produce the same image")
+optimization("sdp_attention", name="SDP Attention",
+    description="Scaled dot product attention requires less memory and often comes with a good speed increase.\n"
+        "Prompt recall may not produce the exact same image; differences are usually only minor noise.\n"
+        "Overrides attention slicing")
 optimization("batch_size", default=1, min=1, description="Improves speed when using iterations or upscaling in exchange for higher memory usage.\nHighly recommended to use with VAE slicing enabled")
 optimization("vae_slicing", name="VAE Slicing", description="Reduces memory usage of batched VAE decoding. Has no effect if batch size is 1.\nMay have a small performance improvement with large batches")
 optimization("vae_tiling", property=EnumProperty, items=(
diff --git a/requirements/mac-mps-cpu.txt b/requirements/mac-mps-cpu.txt
index b423ff8c..010031eb 100644
--- a/requirements/mac-mps-cpu.txt
+++ b/requirements/mac-mps-cpu.txt
@@ -3,7 +3,7 @@ transformers
 accelerate==0.14.0
 huggingface_hub
 
-torch>=1.13
+torch>=2.0
 
 # Original SD checkpoint conversion
 pytorch-lightning
diff --git a/requirements/win-linux-cuda.txt b/requirements/win-linux-cuda.txt
index a5223563..04dfd47c 100644
--- a/requirements/win-linux-cuda.txt
+++ b/requirements/win-linux-cuda.txt
@@ -3,8 +3,8 @@ transformers
 accelerate
 huggingface_hub
 
---extra-index-url https://download.pytorch.org/whl/cu117
-torch>=1.13
+--extra-index-url https://download.pytorch.org/whl/cu118
+torch>=2.0
 
 # Original SD checkpoint conversion
 pytorch-lightning
@@ -15,5 +15,3 @@ scipy # LMSDiscreteScheduler
 stability-sdk # DreamStudio
 
 opencolorio==2.1.2 # color management
-
-xformers # memory efficient attention
diff --git a/ui/panels/dream_texture.py b/ui/panels/dream_texture.py
index 684fb2b8..23b5fad2 100644
--- a/ui/panels/dream_texture.py
+++ b/ui/panels/dream_texture.py
@@ -312,7 +312,7 @@ def optimization(prop):
         slice_size_row.prop(prompt, "optimizations_attention_slice_size_src")
         if prompt.optimizations_attention_slice_size_src == 'manual':
             slice_size_row.prop(prompt, "optimizations_attention_slice_size", text="Size")
-        optimization("xformers_attention")
+        optimization("sdp_attention")
         optimization("cpu_offload")
         optimization("cpu_only")
         optimization("vae_slicing")

From 679296f0939bc3fcc528a206031415e5b4d97398 Mon Sep 17 00:00:00 2001
From: NullSenseStudio <47096043+NullSenseStudio@users.noreply.github.com>
Date: Thu, 6 Apr 2023 20:32:12 -0400
Subject: [PATCH 13/13] fix controlnet offload

---
 generator_process/actions/prompt_to_image.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/generator_process/actions/prompt_to_image.py b/generator_process/actions/prompt_to_image.py
index 06c7c8d7..14d124cd 100644
--- a/generator_process/actions/prompt_to_image.py
+++ b/generator_process/actions/prompt_to_image.py
@@ -224,7 +224,10 @@ def apply(self, pipeline, device):
                 from accelerate import cpu_offload_with_hook
 
                 hook = None
-                for cpu_offloaded_model in [pipeline.text_encoder, pipeline.unet, pipeline.vae]:
+                models = [pipeline.text_encoder, pipeline.unet, pipeline.vae]
+                if hasattr(pipeline, "controlnet"):
+                    models.append(pipeline.controlnet)
+                for cpu_offloaded_model in models:
                     _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
 
                 # FIXME: due to the safety checker not running it prevents the VAE from being offloaded, uncomment when safety checker is enabled
@@ -237,9 +240,11 @@ def apply(self, pipeline, device):
                 # adapted from diffusers.StableDiffusionPipeline.enable_sequential_cpu_offload() to allow DirectML device and unimplemented pipelines
                 from accelerate import cpu_offload
 
-                for cpu_offloaded_model in [pipeline.unet, pipeline.text_encoder, pipeline.vae]:
-                    if cpu_offloaded_model is not None:
-                        cpu_offload(cpu_offloaded_model, device)
+                models = [pipeline.text_encoder, pipeline.unet, pipeline.vae]
+                if hasattr(pipeline, "controlnet"):
+                    models.append(pipeline.controlnet)
+                for cpu_offloaded_model in models:
+                    cpu_offload(cpu_offloaded_model, device)
 
                 if pipeline.safety_checker is not None:
                     cpu_offload(pipeline.safety_checker.vision_model, device, offload_buffers=True)
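Both offload paths above share the same shape: collect the pipeline's
large submodules, append the optional ControlNet, then offload each one.
A condensed sketch of the hook-chained variant, assuming a
diffusers-style pipeline and an accelerate release that provides
cpu_offload_with_hook; the helper name offload_pipeline_models is
illustrative, not part of the add-on:

    from accelerate import cpu_offload_with_hook

    def offload_pipeline_models(pipeline, device):
        # ControlNet is optional: plain Stable Diffusion pipelines have no
        # `controlnet` attribute, so it is appended only when present.
        models = [pipeline.text_encoder, pipeline.unet, pipeline.vae]
        if hasattr(pipeline, "controlnet"):
            models.append(pipeline.controlnet)
        hook = None
        for model in models:
            # Chaining through prev_module_hook keeps each model on `device`
            # only until the next hooked model starts its forward pass.
            _, hook = cpu_offload_with_hook(model, device, prev_module_hook=hook)
        return hook  # the final hook can offload the last model when done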