Add trt support for BF16 #195

Merged Jan 31, 2025 · 550 commits

Changes from all commits

Commits
eb4e7df
fix interface of `get_sample_input`
andompesta Oct 2, 2024
bf1cca6
save configuration parameters
andompesta Oct 2, 2024
358c8a5
ae wrapper implemented
andompesta Oct 2, 2024
381267d
fix import
andompesta Oct 2, 2024
a8af1d8
add AEWrapper step
andompesta Oct 2, 2024
a47608c
from set_model_to_dtype to prepare_model
andompesta Oct 3, 2024
ea420c5
fix eval mode during inference
andompesta Oct 3, 2024
af2f48b
fix clip onnx export. Now it traces only the needed outputs
andompesta Oct 3, 2024
e6b66bb
fix t5 wrapper
andompesta Oct 3, 2024
cb188d8
reorder input name flux
andompesta Oct 3, 2024
54002de
fix flux input format for text_ids and guidance
andompesta Oct 4, 2024
1cdc0a8
fix Flux imports and scale of inputs to prevent nan
andompesta Oct 4, 2024
c1c3a8d
add torch inference while tracing
andompesta Oct 4, 2024
bb0cc66
fix casting problem in onnx trace
andompesta Oct 4, 2024
21ec7d9
solve optimization problem by removing cleanup steps
andompesta Oct 4, 2024
d6f5e2f
rename to notes
andompesta Oct 4, 2024
577ba49
prevent nan due to large inputs
andompesta Oct 4, 2024
dfd06fc
provide base implementation of `get_model`
andompesta Oct 4, 2024
54b2ceb
format
andompesta Oct 4, 2024
d7ccef4
add trt export step
andompesta Oct 6, 2024
505411b
add engine class for trt build
andompesta Oct 6, 2024
0154232
add `get_input_profile` and `get_minmax_dims` abstract methods
andompesta Oct 6, 2024
cc2d921
add `build_strongly_typed` attribute
andompesta Oct 6, 2024
a2fb731
implement `get_minmax_dims` and `get_input_profile`
andompesta Oct 6, 2024
0096f7a
remove `static_shape` from `get_sample_input`
andompesta Oct 6, 2024
dfb6ded
remove static shape and batch flags
andompesta Oct 7, 2024
50100b5
add typing
andompesta Oct 7, 2024
ea240be
remove static shape and batch flags
andompesta Oct 7, 2024
0c3720c
offload to cpu
andompesta Oct 7, 2024
30f0140
enable device offloading while tracing
andompesta Oct 7, 2024
f2b357a
check cuda is available while building engines
andompesta Oct 7, 2024
a30ec20
clip trt engine build
andompesta Oct 7, 2024
dbeeed9
add pinned transformer dependency
andompesta Oct 9, 2024
0682915
fix nan with onnx and trt when executed on CUDA
andompesta Oct 9, 2024
bef25e0
AE needs to be traced in TF32, not FP16
andompesta Oct 9, 2024
c028d8d
add `get_shape_dict` abstract method and device as a property
andompesta Oct 9, 2024
8208e4c
AE should be traced in TF32
andompesta Oct 9, 2024
816ff12
AE explicitly on TF32 and reactivate full pipeline
andompesta Oct 9, 2024
3a341f8
add input profile to flux to enable trt engine build
andompesta Oct 9, 2024
7aa6956
format and add input_profile to t5 for TRT build
andompesta Oct 9, 2024
e68a993
add `TransformersModelWrapper`
andompesta Oct 9, 2024
ea581b7
add TransformersModelWrapper support
andompesta Oct 9, 2024
7e883d5
add `get_shape_dict` interface
andompesta Oct 9, 2024
5080d86
add TransformersModelWrapper support
andompesta Oct 9, 2024
e2b65c4
add shape_dict interface
andompesta Oct 9, 2024
87413e2
t5 in TF32 for numerical reasons
andompesta Oct 11, 2024
8629e50
remove unused options
andompesta Oct 11, 2024
5e711c7
remove unused code
andompesta Oct 11, 2024
02235dc
add `get_shape_dict`
andompesta Oct 11, 2024
6c3c4db
remove custom optimization
andompesta Oct 11, 2024
4b8a973
add garbage collector
andompesta Oct 14, 2024
8e4b103
return error
andompesta Oct 14, 2024
8f45f81
create wrapper specific to Onnx export operation
andompesta Oct 14, 2024
3af1a33
use OnnxWrapper
andompesta Oct 14, 2024
fe024b8
create base wrapper for trt engines
andompesta Oct 14, 2024
68060bd
moved to engine package
andompesta Oct 14, 2024
0f8d8b3
moved to engine package
andompesta Oct 14, 2024
49dc6d1
forbid relative import of trt-builder
andompesta Oct 14, 2024
098391b
remove wrapper and create BaseExporter or BaseEngine
andompesta Oct 14, 2024
bf9c4cb
models not stored in builder class
andompesta Oct 14, 2024
0ee9104
_prepare_model_configs as pure function
andompesta Oct 14, 2024
c7136f8
_get_onnx_exporters as a private method to get onnx exporters
andompesta Oct 14, 2024
ee72695
remove unused dependencies
andompesta Oct 14, 2024
ecf6c4f
from onnxwrapper to onnxengine
andompesta Oct 14, 2024
2a14000
trt engine class
andompesta Oct 14, 2024
c791c53
add `calculate_max_device_memory` to TRTBuilder
andompesta Oct 14, 2024
ce343dc
`get_shape_dict` moved to trt-engine interface
andompesta Oct 14, 2024
66ca1ce
add common inference code
andompesta Oct 14, 2024
7400072
autoencoder inference wrapper
andompesta Oct 14, 2024
aa0d474
add requirements.txt
Oct 16, 2024
d676a18
support guidance for dev model
andompesta Oct 16, 2024
550f660
add support for trt based on env variables
andompesta Oct 16, 2024
fa5993b
format flux
andompesta Oct 16, 2024
bdbbb19
remove stream from constructor
andompesta Oct 16, 2024
f1d86f6
fix iteration over onnx-exporters
andompesta Oct 16, 2024
f065b09
flux is not strongly typed
andompesta Oct 16, 2024
c57410a
move back for numerical stability
andompesta Oct 16, 2024
69f4dca
add logging
andompesta Oct 16, 2024
cc12a14
fix dtype casting for bfloat16
andompesta Oct 17, 2024
961259e
fix default value
andompesta Oct 17, 2024
6e1ca02
add version before merge
andompesta Oct 18, 2024
7217a7b
hacky: get it building the engines
ducktrA Oct 15, 2024
c5481a1
requirements.txt
ducktrA Oct 17, 2024
54674c3
adding a separate _engine.py file for the flux, t5 and clip engines
ducktrA Oct 18, 2024
37003c7
boilerroom and plating. getting parameters handle into setting up the…
ducktrA Oct 18, 2024
fd33eb5
remove _version.py from git
andompesta Oct 18, 2024
99e72e9
create base mixin class to share parameters
andompesta Oct 18, 2024
6678a3b
clipmixin parameters
andompesta Oct 18, 2024
395541d
remove parameters as they are part of mixin class
andompesta Oct 18, 2024
315dd9d
clip engine and exporter use common mixin for managing parameters
andompesta Oct 18, 2024
7cdbb03
use mixin class to build engine from exporter
andompesta Oct 18, 2024
55497eb
ae-mixin for shared parameters
andompesta Oct 18, 2024
5917f38
flux exporter and engine unified by mixin class
andompesta Oct 21, 2024
7c156cd
formatting
andompesta Oct 21, 2024
92f13f8
add common `get_latent_dims` method
andompesta Oct 21, 2024
f5acd54
add `get_latent_dims` common method
andompesta Oct 21, 2024
8b182cc
T5 based on mixin class
andompesta Oct 21, 2024
11570dc
build strongly typed flux
andompesta Oct 21, 2024
a9acfa0
enable load with shared device memory
andompesta Oct 21, 2024
c6e94a6
remove boilerplate code to create engines
andompesta Oct 21, 2024
7b07602
add tokenizer to trt engine
andompesta Oct 22, 2024
2dc2460
use static shape to reduce memory consumption
andompesta Oct 22, 2024
40de55c
implement tokenizer in t5 engine
andompesta Oct 22, 2024
c8273c7
fix max_batch size to 8
andompesta Oct 22, 2024
b96fd96
add licence
andompesta Oct 22, 2024
6743bb7
add licence
andompesta Oct 22, 2024
852b444
enable trt runtime tracking
andompesta Oct 22, 2024
95f7822
add static-batch and static-shape options
andompesta Oct 22, 2024
8ac3f84
add cuda stream to load method
andompesta Oct 22, 2024
f93fc87
add inference code
andompesta Oct 22, 2024
528621a
add inference code
andompesta Oct 22, 2024
23e1236
enable static shape
andompesta Oct 22, 2024
dc326df
add `static_shape` option to reduce memory and `_build_engine` as sta…
andompesta Oct 22, 2024
7e3fe14
add `should_be_dtype` field to handle output type conversion
andompesta Oct 22, 2024
41f18e7
from trtbuilder to trt_manager
andompesta Oct 22, 2024
12dee48
from TRTBuilder to TRTManager
andompesta Oct 23, 2024
45997a9
AE engine interface
andompesta Oct 23, 2024
bb9f468
`trt_to_torch_dtype_dict` as property
andompesta Oct 23, 2024
2bde369
clip engine inference
andompesta Oct 23, 2024
359572e
implement flux trt engine inference process
andompesta Oct 23, 2024
e3f0fd9
add scale_factor and shift_factor
andompesta Oct 23, 2024
d91bbde
removed `should_be_dtype`
andompesta Oct 23, 2024
df245db
removed `should_be_dtype`
andompesta Oct 23, 2024
33bc095
remove `should_be_dtype` from t5
andompesta Oct 23, 2024
c330491
add scale and shift factor
andompesta Oct 23, 2024
90b4f11
`max_batch` to 8
andompesta Oct 23, 2024
17c1f7d
implement `TRTManager`
andompesta Oct 23, 2024
811f2ff
from ae to vae to match DD
andompesta Oct 25, 2024
f4ae3ca
remove autocast
andompesta Oct 25, 2024
0fe7c84
`pooled_embeddings` to match DD naming for clip
andompesta Oct 25, 2024
f71091a
rename `flux` to `transformer` engine
andompesta Oct 25, 2024
4055a3e
from flux to transformer mixin
andompesta Oct 25, 2024
2b2bb5b
from flux to transformer exporter
andompesta Oct 25, 2024
b088430
fix trtmanager naming
andompesta Oct 25, 2024
82d658d
fix input names and dimensions. Note that `img_ids` and `txt_ids` ar…
andompesta Oct 25, 2024
3708773
fix shape of inputs according to `text_maxlen` and batch_size
andompesta Oct 25, 2024
7737426
reduce max_batch
andompesta Oct 27, 2024
917d8ff
fix stage naming
andompesta Oct 27, 2024
6473ca1
add support for DD model
andompesta Oct 27, 2024
6d39ad5
add support for DD models
andompesta Oct 27, 2024
753129b
fix dtype configuration
andompesta Oct 28, 2024
149c27c
fix engine dtype
andompesta Oct 28, 2024
55568bf
transformers inference interface to match DD
andompesta Oct 28, 2024
4872169
vae inference script dtype mapping
andompesta Oct 28, 2024
41ee44c
remove dtype checks as multiple can be active
andompesta Oct 28, 2024
a31161d
by default tf32 always active
andompesta Oct 28, 2024
3b91c51
fix trt engine names
andompesta Nov 11, 2024
4ebca7d
add wrapper for fluxmodel to match DD onnx configuration
andompesta Nov 11, 2024
3e9f64f
add autocast back in to match DD setup
andompesta Nov 11, 2024
bb82e4b
fix dependencies for trt support
andompesta Nov 14, 2024
830358e
support trt
andompesta Nov 14, 2024
cdce3a3
add explicit kwargs
andompesta Nov 14, 2024
b789e05
vscode setup
andompesta Nov 14, 2024
8b07e6e
add setup instructions for trt
andompesta Nov 14, 2024
5ffd6d6
`trt` dependencies not part of `all`
andompesta Nov 14, 2024
766d878
from onnx_exporter to exporter
andompesta Nov 14, 2024
6d83690
hide onnx parameters
andompesta Nov 14, 2024
2458486
from onnx-exporter to exporter
andompesta Nov 14, 2024
80a52d7
exporter responsible for building trt engine and onnx export
andompesta Nov 14, 2024
adf2d46
hide onnx parameter
andompesta Nov 14, 2024
e82311f
remove build function from engine class
andompesta Nov 14, 2024
17f6562
remove unused import
andompesta Nov 14, 2024
2512bb2
remove space
andompesta Nov 14, 2024
86614a3
manage t5 and vae separately
andompesta Nov 14, 2024
f14de69
disable autocast
andompesta Nov 14, 2024
3410d34
strongly typed t5
andompesta Nov 14, 2024
2422538
fix input type and max image size
andompesta Nov 14, 2024
9bef65b
max image size
andompesta Nov 14, 2024
a3bd8fc
T5 not strongly typed
andompesta Nov 14, 2024
e615fa0
testing
andompesta Nov 14, 2024
611efed
fix torch synchronize problem
andompesta Nov 14, 2024
13b1016
don't build already present engines
andompesta Nov 14, 2024
01b508c
remove torch save
andompesta Nov 14, 2024
f57b5a5
removed onnx dependencies
andompesta Nov 14, 2024
9cffa24
add trt dependencies
andompesta Nov 14, 2024
63e29cc
remove trt dependencies from toml
andompesta Nov 14, 2024
c978cc3
rename requirements and fix readme
andompesta Nov 14, 2024
3087c60
remove unused files
andompesta Nov 14, 2024
5c2cba1
fix import format
andompesta Nov 14, 2024
08fbb60
remove comments
andompesta Nov 14, 2024
1b4a41a
add gitignore
andompesta Nov 15, 2024
a404144
reset dependencies
andompesta Nov 15, 2024
a8b8478
add hidden setup files
andompesta Nov 15, 2024
8fa1d22
solve ruff check
andompesta Nov 15, 2024
3f20508
fix imports with ruff
andompesta Nov 15, 2024
7662313
run ruff formatter
andompesta Nov 15, 2024
4691502
update gitignore
andompesta Nov 15, 2024
deb5633
simplify dependencies
andompesta Nov 18, 2024
1de2799
remove gitignore
andompesta Nov 18, 2024
64cbb8f
add cli formatting
andompesta Nov 18, 2024
fd1455e
fix import orders
andompesta Nov 18, 2024
095ee89
Merge pull request #1 from andompesta/add-trt-support-push
andompesta Nov 18, 2024
3d3741e
simplify dependencies
andompesta Nov 18, 2024
f31ffd4
solve vae quality issue
andompesta Nov 26, 2024
728c018
Merge branch 'main' of https://github.com/black-forest-labs/flux
andompesta Nov 26, 2024
1cd9476
Merge branch 'main' into add-trt-support
andompesta Nov 26, 2024
bee6c45
Merge branch 'main' into add-trt-support-cli-conflict
andompesta Nov 26, 2024
f80058f
fix ruff format
andompesta Nov 26, 2024
079778f
fix merge changes
andompesta Nov 26, 2024
a5986b5
format and sort src/flux/cli
andompesta Nov 26, 2024
c7fdb64
fix merge conflicts
andompesta Nov 26, 2024
74c4c7a
Merge pull request #2 from andompesta/add-trt-support-cli-conflict
andompesta Nov 26, 2024
631d039
Merge branch 'main' of https://github.com/black-forest-labs/flux into…
Jan 14, 2025
e29b5eb
add trt import
andompesta Jan 14, 2025
c962da1
add static shape support (not completed)
andompesta Jan 14, 2025
5801579
remove fp8 support
andompesta Jan 14, 2025
ff29e9a
add static shape
andompesta Jan 14, 2025
973353c
add static shape to t5
andompesta Jan 14, 2025
200bfe5
add static shape to transformer
andompesta Jan 14, 2025
eb0217b
remove model opt code
andompesta Jan 14, 2025
05e2378
enable offloading with trt engines
andompesta Jan 14, 2025
c01c47a
add `stream` as part of `init_runtime`
andompesta Jan 14, 2025
45cfc62
enable offloading
andompesta Jan 14, 2025
31d195d
`allocate_buffers` moved to call
andompesta Jan 14, 2025
9fd1008
formatting
andompesta Jan 14, 2025
f9d3fad
add capability to compute `img_dim`
andompesta Jan 14, 2025
57174d4
enable dynamic or static-shape
andompesta Jan 14, 2025
449980f
split base-engine and engine class
andompesta Jan 15, 2025
6818690
clip as engine
andompesta Jan 15, 2025
e203148
t5 as engine
andompesta Jan 15, 2025
632531f
transformer as engine
andompesta Jan 15, 2025
51f7790
VAEDecoder as engine and VAEEngine as BaseEngine
andompesta Jan 15, 2025
3aeefa4
from vae to vae_decoder, vae_encoder and vae
andompesta Jan 15, 2025
880cb67
use `set_stream` and fix activate call
andompesta Jan 15, 2025
eeb039d
fix import and remove stages in TRTManager
andompesta Jan 15, 2025
6316f51
from BaseEngine to BaseEngine and Engine
andompesta Jan 15, 2025
f071acc
fix imports
andompesta Jan 15, 2025
db3cb60
add trt support to cli_controlnet
andompesta Jan 15, 2025
3e180f8
add vae_encoder to support controlnet
andompesta Jan 15, 2025
d56ae30
refactor vae engine to use load() and activate() functions
andompesta Jan 15, 2025
c16984d
implement vae_encoder_exporter. Not tested
andompesta Jan 15, 2025
4c24f59
fix imports
andompesta Jan 15, 2025
0f6af2d
add static_batch and static_shape to cli.py as additional options?
andompesta Jan 24, 2025
acd1d13
update dependencies
andompesta Jan 24, 2025
c3e3f23
revert formatting
andompesta Jan 24, 2025
e2b41eb
Merge branch 'add-trt-support' of github.com:andompesta/flux into add…
andompesta Jan 24, 2025
55219cb
Merge branch 'add-trt-support' into add-trt-support-controlnet
andompesta Jan 24, 2025
6f58632
Merge pull request #3 from andompesta/add-trt-support-controlnet
andompesta Jan 24, 2025
81b807e
from Self to Any to be compatible with python 3.10
andompesta Jan 25, 2025
4ce7974
from `vae_decoder` to `vae` for compatibility with oss engines
andompesta Jan 25, 2025
5d0b780
missing torch import
andompesta Jan 25, 2025
6394f76
Merge pull request #4 from andompesta/add-trt-support-controlnet
andompesta Jan 25, 2025
1d11bb4
add `scale_factor` and `shift_factor` to VAE-encoder
andompesta Jan 28, 2025
48533df
add check if vae is traced
andompesta Jan 28, 2025
06efe08
offload while tracing
andompesta Jan 28, 2025
4dbc7d2
default `text_maxlen` set to dev size instead of schnell
andompesta Jan 28, 2025
0071782
remove line
andompesta Jan 28, 2025
a76d97b
add warning when text_maxlen is not read from t5
andompesta Jan 28, 2025
40fe2df
fix imports
andompesta Jan 28, 2025
fa62fa8
Merge pull request #5 from andompesta/add-trt-support-controlnet
andompesta Jan 28, 2025
66 changes: 65 additions & 1 deletion README.md
@@ -17,9 +17,22 @@ source .venv/bin/activate
pip install -e ".[all]"
```

## Local installation with TRT support

```bash
docker pull nvcr.io/nvidia/pytorch:24.10-py3
cd $HOME && git clone https://github.com/black-forest-labs/flux
cd $HOME/flux
docker run --rm -it --gpus all -v $PWD:/workspace/flux nvcr.io/nvidia/pytorch:24.10-py3 /bin/bash
# inside container
cd /workspace/flux
pip install -e ".[all]"
pip install -r trt_requirements.txt
```
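
Once inside the container, the TRT path can be exercised through the sampling CLI. A sketch of a possible invocation, based on the `--trt` flag and the `TRT_ENGINE_DIR`/`ONNX_DIR` environment variables introduced in the `src/flux/cli.py` changes later in this diff (flag names are from the PR; values shown are assumptions):

```bash
# hypothetical run, assuming the CLI changes in this PR
export TRT_ENGINE_DIR=./engines  # where built TRT engines are cached
export ONNX_DIR=./onnx           # where intermediate ONNX exports are written
python src/flux/cli.py --name flux-dev --width 1024 --height 1024 \
  --prompt "a photo of a forest with mist" --trt
```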

### Models

We are offering an extensive suite of models. For more information about the individual models, please refer to the link under **Usage**.

| Name | Usage | HuggingFace repo | License |
| --------------------------- | ---------------------------------------------------------- | -------------------------------------------------------------- | --------------------------------------------------------------------- |
@@ -42,6 +55,57 @@ We are offering an extensive suite of models. For more information about the ind

The weights of the autoencoder are also released under [apache-2.0](https://huggingface.co/datasets/choosealicense/licenses/blob/main/markdown/apache-2.0.md) and can be found in the HuggingFace repos above.

We also offer a Gradio-based demo for an interactive experience. To run the Gradio demo:

```bash
python demo_gr.py --name flux-schnell --device cuda
```

Options:

- `--name`: Choose the model to use (options: "flux-schnell", "flux-dev")
- `--device`: Specify the device to use (default: "cuda" if available, otherwise "cpu")
- `--offload`: Offload model to CPU when not in use
- `--share`: Create a public link to your demo

To run the demo with the dev model and create a public link:

```bash
python demo_gr.py --name flux-dev --share
```

## Diffusers integration

`FLUX.1 [schnell]` and `FLUX.1 [dev]` are integrated with the [🧨 diffusers](https://github.com/huggingface/diffusers) library. To use it with diffusers, install it:

```shell
pip install git+https://github.com/huggingface/diffusers.git
```

Then you can use `FluxPipeline` to run the model:

```python
import torch
from diffusers import FluxPipeline

model_id = "black-forest-labs/FLUX.1-schnell"  # you can also use `black-forest-labs/FLUX.1-dev`

pipe = FluxPipeline.from_pretrained(model_id, torch_dtype=torch.bfloat16)
pipe.enable_model_cpu_offload()  # save some VRAM by offloading the model to CPU. Remove this if you have enough GPU power

prompt = "A cat holding a sign that says hello world"
seed = 42
image = pipe(
prompt,
output_type="pil",
    num_inference_steps=4,  # use a larger number if you are using [dev]
generator=torch.Generator("cpu").manual_seed(seed)
).images[0]
image.save("flux-schnell.png")
```

To learn more, check out the [diffusers](https://huggingface.co/docs/diffusers/main/en/api/pipelines/flux) documentation.

## API usage

Our API offers access to our models. It is documented here:
3 changes: 0 additions & 3 deletions demo_gr.py
@@ -15,7 +15,6 @@

NSFW_THRESHOLD = 0.85


def get_models(name: str, device: torch.device, offload: bool, is_schnell: bool):
t5 = load_t5(device, max_length=256 if is_schnell else 512)
clip = load_clip(device)
@@ -24,7 +23,6 @@ def get_models(name: str, device: torch.device, offload: bool, is_schnell: bool)
nsfw_classifier = pipeline("image-classification", model="Falconsai/nsfw_image_detection", device=device)
return model, ae, t5, clip, nsfw_classifier


class FluxGenerator:
def __init__(self, model_name: str, device: str, offload: bool):
self.device = torch.device(device)
@@ -153,7 +151,6 @@ def generate_image(
exif_data[ExifTags.Base.Model] = self.model_name
if add_sampling_metadata:
exif_data[ExifTags.Base.ImageDescription] = prompt

img.save(filename, format="jpeg", exif=exif_data, quality=95, subsampling=0)

return img, str(opts.seed), filename, None
67 changes: 65 additions & 2 deletions src/flux/cli.py
@@ -5,10 +5,12 @@
from glob import iglob

import torch
from cuda import cudart
from fire import Fire
from transformers import pipeline

from flux.sampling import denoise, get_noise, get_schedule, prepare, unpack
from flux.trt.trt_manager import TRTManager
from flux.util import configs, load_ae, load_clip, load_flow_model, load_t5, save_image

NSFW_THRESHOLD = 0.85
@@ -25,7 +27,9 @@ class SamplingOptions:


def parse_prompt(options: SamplingOptions) -> SamplingOptions | None:
user_question = "Next prompt (write /h for help, /q to quit and leave empty to repeat):\n"
user_question = (
"Next prompt (write /h for help, /q to quit and leave empty to repeat):\n"
)
usage = (
"Usage: Either write your prompt directly, leave this field empty "
"to repeat the prompt or write a command starting with a slash:\n"
@@ -108,6 +112,8 @@ def main(
offload: bool = False,
output_dir: str = "output",
add_sampling_metadata: bool = True,
trt: bool = False,
**kwargs: dict | None,
):
"""
Sample the flux model. Either interactively (set `--loop`) or run for a
@@ -126,6 +132,8 @@
loop: start an interactive session and sample multiple times
guidance: guidance value used for guidance distillation
add_sampling_metadata: Add the prompt to the image Exif metadata
trt: use TensorRT backend for optimized inference
kwargs: additional arguments for TensorRT support
"""
nsfw_classifier = pipeline("image-classification", model="Falconsai/nsfw_image_detection", device=device)

@@ -158,6 +166,57 @@
model = load_flow_model(name, device="cpu" if offload else torch_device)
ae = load_ae(name, device="cpu" if offload else torch_device)

if trt:
# offload to CPU to save memory
ae = ae.cpu()
model = model.cpu()
clip = clip.cpu()
t5 = t5.cpu()

torch.cuda.empty_cache()

trt_ctx_manager = TRTManager(
bf16=True,
device=torch_device,
static_batch=kwargs.get("static_batch", True),
static_shape=kwargs.get("static_shape", True),
)
ae.decoder.params = ae.params
engines = trt_ctx_manager.load_engines(
models={
"clip": clip,
"transformer": model,
"t5": t5,
"vae": ae.decoder,
},
engine_dir=os.environ.get("TRT_ENGINE_DIR", "./engines"),
onnx_dir=os.environ.get("ONNX_DIR", "./onnx"),
opt_image_height=height,
opt_image_width=width,
)

torch.cuda.synchronize()

trt_ctx_manager.init_runtime()
# TODO: refactor. stream should be part of engine constructor maybe !!
for _, engine in engines.items():
engine.set_stream(stream=trt_ctx_manager.stream)

if not offload:
for _, engine in engines.items():
engine.load()

calculate_max_device_memory = trt_ctx_manager.calculate_max_device_memory(engines)
_, shared_device_memory = cudart.cudaMalloc(calculate_max_device_memory)

for _, engine in engines.items():
engine.activate(device=torch_device, device_memory=shared_device_memory)

ae = engines["vae"]
model = engines["transformer"]
clip = engines["clip"]
t5 = engines["t5"]

rng = torch.Generator(device="cpu")
opts = SamplingOptions(
prompt=prompt,
@@ -192,7 +251,9 @@ def main(
torch.cuda.empty_cache()
t5, clip = t5.to(torch_device), clip.to(torch_device)
inp = prepare(t5, clip, x, prompt=opts.prompt)
timesteps = get_schedule(opts.num_steps, inp["img"].shape[1], shift=(name != "flux-schnell"))
timesteps = get_schedule(
opts.num_steps, inp["img"].shape[1], shift=(name != "flux-schnell")
)

# offload TEs to CPU, load model to gpu
if offload:
@@ -229,6 +290,8 @@ def main(
else:
opts = None

if trt:
trt_ctx_manager.stop_runtime()

def app():
Fire(main)
50 changes: 50 additions & 0 deletions src/flux/cli_control.py
@@ -5,11 +5,13 @@
from glob import iglob

import torch
from cuda import cudart
from fire import Fire
from transformers import pipeline

from flux.modules.image_embedders import CannyImageEncoder, DepthImageEncoder
from flux.sampling import denoise, get_noise, get_schedule, prepare_control, unpack
from flux.trt.trt_manager import TRTManager
from flux.util import configs, load_ae, load_clip, load_flow_model, load_t5, save_image


@@ -174,6 +176,8 @@ def main(
add_sampling_metadata: bool = True,
img_cond_path: str = "assets/robot.webp",
lora_scale: float | None = 0.85,
trt: bool = False,
**kwargs: dict | None,
):
"""
Sample the flux model. Either interactively (set `--loop`) or run for a
@@ -192,6 +196,7 @@ def main(
guidance: guidance value used for guidance distillation
add_sampling_metadata: Add the prompt to the image Exif metadata
img_cond_path: path to conditioning image (jpeg/png/webp)
trt: use TensorRT backend for optimized inference
"""
nsfw_classifier = pipeline("image-classification", model="Falconsai/nsfw_image_detection", device=device)

@@ -234,6 +239,7 @@ def main(

# set lora scale
if "lora" in name and lora_scale is not None:
assert not trt, "TRT does not support LORA yet"
for _, module in model.named_modules():
if hasattr(module, "set_scale"):
module.set_scale(lora_scale)
@@ -245,6 +251,50 @@ def main(
else:
raise NotImplementedError()

if trt:
trt_ctx_manager = TRTManager(
bf16=True,
device=torch_device,
static_batch=kwargs.get("static_batch", True),
static_shape=kwargs.get("static_shape", True),
)
ae.decoder.params = ae.params
ae.encoder.params = ae.params
engines = trt_ctx_manager.load_engines(
models={
"clip": clip.cpu(),
"transformer": model.cpu(),
"t5": t5.cpu(),
"vae": ae.decoder.cpu(),
"vae_encoder": ae.encoder.cpu(),
},
engine_dir=os.environ.get("TRT_ENGINE_DIR", "./engines"),
onnx_dir=os.environ.get("ONNX_DIR", "./onnx"),
opt_image_height=height,
opt_image_width=width,
)
torch.cuda.synchronize()

trt_ctx_manager.init_runtime()
# TODO: refactor. stream should be part of engine constructor maybe !!
for _, engine in engines.items():
engine.set_stream(stream=trt_ctx_manager.stream)

if not offload:
for _, engine in engines.items():
engine.load()

calculate_max_device_memory = trt_ctx_manager.calculate_max_device_memory(engines)
_, shared_device_memory = cudart.cudaMalloc(calculate_max_device_memory)

for _, engine in engines.items():
engine.activate(device=torch_device, device_memory=shared_device_memory)

ae = engines["vae"]
model = engines["transformer"]
clip = engines["clip"]
t5 = engines["t5"]

rng = torch.Generator(device="cpu")
opts = SamplingOptions(
prompt=prompt,
2 changes: 1 addition & 1 deletion src/flux/math.py
@@ -14,7 +14,7 @@ def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor) -> Tensor:

def rope(pos: Tensor, dim: int, theta: int) -> Tensor:
assert dim % 2 == 0
scale = torch.arange(0, dim, 2, dtype=torch.float64, device=pos.device) / dim
scale = torch.arange(0, dim, 2, dtype=pos.dtype, device=pos.device) / dim
omega = 1.0 / (theta**scale)
out = torch.einsum("...n,d->...nd", pos, omega)
out = torch.stack([torch.cos(out), -torch.sin(out), torch.sin(out), torch.cos(out)], dim=-1)
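
For context on the `rope` change above: TensorRT has no FP64 support, so computing the frequency `scale` in `torch.float64` breaks ONNX/TRT export, and deriving the dtype from `pos` keeps the whole rotary-embedding computation in a traceable precision. A minimal sketch of the new behaviour (the `dim` and `theta` values here are illustrative, not from the PR):

```python
import torch

# with the updated rope, frequencies follow pos.dtype instead of float64
pos = torch.arange(128, dtype=torch.bfloat16)
dim, theta = 16, 10_000
scale = torch.arange(0, dim, 2, dtype=pos.dtype, device=pos.device) / dim
omega = 1.0 / (theta**scale)
out = torch.einsum("...n,d->...nd", pos, omega)
print(out.dtype)  # torch.bfloat16: no float64 constant ends up in the traced graph
```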
6 changes: 6 additions & 0 deletions src/flux/modules/autoencoder.py
@@ -235,6 +235,9 @@
self.conv_out = nn.Conv2d(block_in, out_ch, kernel_size=3, stride=1, padding=1)

def forward(self, z: Tensor) -> Tensor:
# get dtype for proper tracing
upscale_dtype = next(self.up.parameters()).dtype

# z to block_in
h = self.conv_in(z)

@@ -243,6 +246,8 @@ def forward(self, z: Tensor) -> Tensor:
h = self.mid.attn_1(h)
h = self.mid.block_2(h)

# cast to proper dtype
h = h.to(upscale_dtype)
# upsampling
for i_level in reversed(range(self.num_resolutions)):
for i_block in range(self.num_res_blocks + 1):
@@ -277,6 +282,7 @@ def forward(self, z: Tensor) -> Tensor:
class AutoEncoder(nn.Module):
def __init__(self, params: AutoEncoderParams):
super().__init__()
self.params = params
self.encoder = Encoder(
resolution=params.resolution,
in_channels=params.in_channels,
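
The `upscale_dtype` cast above is what lets the decoder trace cleanly when its upsampling blocks sit in a different precision than the mid blocks (the commit history keeps the AE in TF32 while other modules move to BF16). A toy repro of the pattern, with hypothetical module names:

```python
import torch
import torch.nn as nn

# stand-in for Decoder.forward's dtype handling; illustrative only
class TinyDecoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.mid = nn.Conv2d(4, 4, 3, padding=1)                    # stays in fp32
        self.up = nn.Conv2d(4, 4, 3, padding=1).to(torch.bfloat16)  # converted block

    def forward(self, z: torch.Tensor) -> torch.Tensor:
        upscale_dtype = next(self.up.parameters()).dtype  # bf16 here
        h = self.mid(z)                                   # fp32 path
        h = h.to(upscale_dtype)                           # cast before upsampling, as in the PR
        return self.up(h)

out = TinyDecoder()(torch.randn(1, 4, 8, 8))
print(out.dtype)  # torch.bfloat16; without the cast, the up-block raises a dtype mismatch
```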
Empty file added src/flux/trt/__init__.py
32 changes: 32 additions & 0 deletions src/flux/trt/engine/__init__.py
@@ -0,0 +1,32 @@
#
# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from flux.trt.engine.base_engine import BaseEngine, Engine
from flux.trt.engine.clip_engine import CLIPEngine
from flux.trt.engine.t5_engine import T5Engine
from flux.trt.engine.transformer_engine import TransformerEngine
from flux.trt.engine.vae_engine import VAEEngine, VAEDecoder, VAEEncoder

__all__ = [
"BaseEngine",
"Engine",
"CLIPEngine",
"TransformerEngine",
"T5Engine",
"VAEEngine",
"VAEDecoder",
"VAEEncoder",
]
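
Pulling the pieces together, here is a condensed sketch of how these engine classes are driven, mirroring the `cli.py` changes earlier in this diff (names and call order are taken from the PR; treat this as illustrative, not a stable API):

```python
import os

from cuda import cudart

from flux.trt.trt_manager import TRTManager


def build_and_activate(models: dict, device: str = "cuda", height: int = 1024, width: int = 1024):
    # export ONNX and build (or load cached) TRT engines for each model
    manager = TRTManager(bf16=True, device=device, static_batch=True, static_shape=True)
    engines = manager.load_engines(
        models=models,  # e.g. {"clip": ..., "t5": ..., "transformer": ..., "vae": ...}
        engine_dir=os.environ.get("TRT_ENGINE_DIR", "./engines"),
        onnx_dir=os.environ.get("ONNX_DIR", "./onnx"),
        opt_image_height=height,
        opt_image_width=width,
    )
    manager.init_runtime()
    for engine in engines.values():
        engine.set_stream(stream=manager.stream)
        engine.load()
    # all engines share one device allocation sized to the largest activation footprint
    _, shared_device_memory = cudart.cudaMalloc(manager.calculate_max_device_memory(engines))
    for engine in engines.values():
        engine.activate(device=device, device_memory=shared_device_memory)
    return engines
```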