feature: stable diffusion video (SVD)

brycedrennan · Nov 23, 2023 · e8fe8d7 · e8fe8d7
1 parent 80ff006
commit e8fe8d7
Show file tree

Hide file tree

Showing 55 changed files with 9,453 additions and 6 deletions.
diff --git a/Makefile b/Makefile
@@ -1,5 +1,5 @@
 SHELL := /bin/bash
-python_version = 3.10.10
+python_version = 3.10.13
 venv_prefix = imaginairy
 venv_name = $(venv_prefix)-$(python_version)
 pyenv_instructions=https://github.com/pyenv/pyenv#installation

diff --git a/assets/rocket-wide.png b/assets/rocket-wide.png
diff --git a/imaginairy/cli/main.py b/imaginairy/cli/main.py
@@ -11,6 +11,7 @@
 from imaginairy.cli.run_api import run_server_cmd
 from imaginairy.cli.train import prep_images_cmd, prune_ckpt_cmd, train_concept_cmd
 from imaginairy.cli.upscale import upscale_cmd
+from imaginairy.cli.videogen import videogen_cmd
 
 logger = logging.getLogger(__name__)
 
@@ -50,6 +51,7 @@ def aimg(ctx):
 aimg.add_command(train_concept_cmd, name="train-concept")
 aimg.add_command(upscale_cmd, name="upscale")
 aimg.add_command(run_server_cmd, name="server")
+aimg.add_command(videogen_cmd, name="videogen")
 
 
 @aimg.command()

diff --git a/imaginairy/cli/videogen.py b/imaginairy/cli/videogen.py
@@ -0,0 +1,92 @@
+import logging
+
+import click
+
+logger = logging.getLogger(__name__)
+
+
+@click.command()
+@click.option(
+    "--start-image",
+    default="other/images/sound-music.jpg",
+    help="Input path for image file.",
+)
+@click.option("--num-frames", default=None, type=int, help="Number of frames.")
+@click.option("--num-steps", default=None, type=int, help="Number of steps.")
+@click.option(
+    "--model",
+    default="svd",
+    help="Model to use. One of: svd, svd_xt, svd_image_decoder, svd_xt_image_decoder",
+)
+@click.option(
+    "--fps", default=6, type=int, help="FPS for the AI to target when generating video"
+)
+@click.option("--output-fps", default=None, type=int, help="FPS for the output video")
+@click.option(
+    "--motion-amount",
+    default=127,
+    type=int,
+    help="How much motion to generate. value between 0 and 255.",
+)
+@click.option(
+    "-r",
+    "--repeats",
+    default=1,
+    show_default=True,
+    type=int,
+    help="How many times to repeat the renders. ",
+)
+@click.option("--cond-aug", default=0.02, type=float, help="Conditional augmentation.")
+@click.option(
+    "--seed", default=None, type=int, help="Seed for random number generator."
+)
+# Hyphenated spellings match every other option of this command. The original
+# underscore spellings remain registered as aliases so existing invocations
+# keep working; the explicit third name pins the Python parameter name.
+@click.option(
+    "--decoding-t",
+    "--decoding_t",
+    "decoding_t",
+    default=1,
+    type=int,
+    help="Number of frames decoded at a time.",
+)
+@click.option("--device", default=None, help="Device to use.")
+@click.option(
+    "--output-folder",
+    "--output_folder",
+    "output_folder",
+    default=None,
+    help="Output folder.",
+)
+def videogen_cmd(
+    start_image,
+    num_frames,
+    num_steps,
+    model,
+    fps,
+    output_fps,
+    motion_amount,
+    repeats,
+    cond_aug,
+    seed,
+    decoding_t,
+    device,
+    output_folder,
+):
+    """
+    AI generate a video from an image
+
+    Example:
+
+        aimg videogen --start-image assets/rocket-wide.png
+
+    """
+    # The docstring above is what Click prints for `--help`, so parameter notes
+    # live here instead:
+    #   start_image   -> forwarded as `input_path`
+    #   model         -> forwarded as `model_name`
+    #   fps           -> forwarded as `fps_id`
+    #   motion_amount -> forwarded as `motion_bucket_id`
+    # All remaining options are forwarded under their own names.
+
+    # Imported at call time rather than at module import time (presumably to
+    # keep CLI startup light -- confirm).
+    from imaginairy.log_utils import configure_logging
+    from imaginairy.video_sample import generate_video
+
+    configure_logging()
+
+    # When no output fps is given, reuse the fps the model is conditioned on.
+    output_fps = output_fps or fps
+
+    # NOTE(review): every repeat receives the same `seed`; whether an explicit
+    # seed makes all repeats identical depends on generate_video -- confirm.
+    for _ in range(repeats):
+        logger.info("Generating video from image %s", start_image)
+        generate_video(
+            input_path=start_image,
+            num_frames=num_frames,
+            num_steps=num_steps,
+            model_name=model,
+            fps_id=fps,
+            output_fps=output_fps,
+            motion_bucket_id=motion_amount,
+            cond_aug=cond_aug,
+            seed=seed,
+            decoding_t=decoding_t,
+            device=device,
+            output_folder=output_folder,
+        )
diff --git a/imaginairy/config.py b/imaginairy/config.py
@@ -86,6 +86,43 @@ class ModelConfig:
     ),
 ]
 
+
+# Registry of the available video models, keyed by each entry's "short_name".
+# Every entry records a description, the default frame and step counts, the
+# config file path and the URL of the weights.
+video_models = {
+    spec["short_name"]: spec
+    for spec in (
+        {
+            "short_name": "svd",
+            "description": "Stable Video Diffusion",
+            "default_frames": 14,
+            "default_steps": 25,
+            "config_path": "configs/svd.yaml",
+            "weights_url": "https://huggingface.co/imaginairy/stable-video-diffusion/resolve/f9dce2757a0713da6262f35438050357c2be7ee6/svd.fp16.safetensors",
+        },
+        {
+            "short_name": "svd_image_decoder",
+            "description": "Stable Video Diffusion - Image Decoder",
+            "default_frames": 14,
+            "default_steps": 25,
+            "config_path": "configs/svd_image_decoder.yaml",
+            "weights_url": "https://huggingface.co/imaginairy/stable-video-diffusion/resolve/f9dce2757a0713da6262f35438050357c2be7ee6/svd_image_decoder.fp16.safetensors",
+        },
+        {
+            "short_name": "svd_xt",
+            "description": "Stable Video Diffusion - XT",
+            "default_frames": 25,
+            "default_steps": 30,
+            "config_path": "configs/svd_xt.yaml",
+            "weights_url": "https://huggingface.co/imaginairy/stable-video-diffusion/resolve/f9dce2757a0713da6262f35438050357c2be7ee6/svd_xt.fp16.safetensors",
+        },
+        {
+            "short_name": "svd_xt_image_decoder",
+            "description": "Stable Video Diffusion - XT - Image Decoder",
+            "default_frames": 25,
+            "default_steps": 30,
+            "config_path": "configs/svd_xt_image_decoder.yaml",
+            "weights_url": "https://huggingface.co/imaginairy/stable-video-diffusion/resolve/f9dce2757a0713da6262f35438050357c2be7ee6/svd_xt_image_decoder.fp16.safetensors",
+        },
+    )
+}
+
 MODEL_CONFIG_SHORTCUTS = {m.short_name: m for m in MODEL_CONFIGS}
 for m in MODEL_CONFIGS:
     if m.alias:

diff --git a/imaginairy/configs/svd.yaml b/imaginairy/configs/svd.yaml
@@ -0,0 +1,146 @@
+# Model definition for the "svd" video model (this file is svd.yaml).
+# Throughout the file, `target` is a dotted path to a class and `params` holds
+# that class's settings -- presumably consumed by an instantiate-from-config
+# style loader that is not visible here (confirm).
+model:
+  target: imaginairy.modules.sgm.diffusion.DiffusionEngine
+  params:
+    scale_factor: 0.18215
+    disable_first_stage_autocast: False
+
+
+    # Denoiser class, configured with the VScalingWithEDMcNoise scaling class.
+    denoiser_config:
+      target: imaginairy.modules.sgm.diffusionmodules.denoiser.Denoiser
+      params:
+        scaling_config:
+          target: imaginairy.modules.sgm.diffusionmodules.denoiser_scaling.VScalingWithEDMcNoise
+
+    # Backbone network: a VideoUNet and its hyper-parameters.
+    network_config:
+      target: imaginairy.modules.sgm.diffusionmodules.video_model.VideoUNet
+      params:
+        # NOTE(review): 768 = 3 x 256, matching the three 256-dim
+        # ConcatTimestepEmbedderND entries (fps_id, motion_bucket_id, cond_aug)
+        # in conditioner_config -- presumably their concatenation; confirm.
+        adm_in_channels: 768
+        num_classes: sequential
+        use_checkpoint: False
+        # NOTE(review): in_channels is twice out_channels -- presumably the
+        # noisy latents concatenated with the conditioning-frame latents;
+        # confirm against VideoUNet.
+        in_channels: 8
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [4, 2, 1]
+        num_res_blocks: 2
+        channel_mult: [1, 2, 4, 4]
+        num_head_channels: 64
+        use_linear_in_transformer: True
+        transformer_depth: 1
+        context_dim: 1024
+        # NOTE(review): the name suggests this attention type relies on
+        # xformers -- confirm behaviour where xformers is unavailable.
+        spatial_transformer_attn_type: softmax-xformers
+        extra_ff_mix_layer: True
+        use_spatial_context: True
+        merge_strategy: learned_with_images
+        video_kernel_size: [3, 1, 1]
+
+    # Conditioning: a GeneralConditioner with five embedders, each bound to one
+    # `input_key` and all marked `is_trainable: False`. The keys are
+    # cond_frames_without_noise, fps_id, motion_bucket_id, cond_frames, cond_aug.
+    conditioner_config:
+      target: imaginairy.modules.sgm.encoders.modules.GeneralConditioner
+      params:
+        emb_models:
+        # 1) OpenCLIP image embedding of the conditioning frame.
+        - is_trainable: False
+          input_key: cond_frames_without_noise
+          target: imaginairy.modules.sgm.encoders.modules.FrozenOpenCLIPImagePredictionEmbedder
+          params:
+            n_cond_frames: 1
+            n_copies: 1
+            open_clip_embedding_config:
+              target: imaginairy.modules.sgm.encoders.modules.FrozenOpenCLIPImageEmbedder
+              params:
+                freeze: True
+
+        # 2) fps_id, embedded to 256 dims.
+        - input_key: fps_id
+          is_trainable: False
+          target: imaginairy.modules.sgm.encoders.modules.ConcatTimestepEmbedderND
+          params:
+            outdim: 256
+
+        # 3) motion_bucket_id, embedded to 256 dims.
+        - input_key: motion_bucket_id
+          is_trainable: False
+          target: imaginairy.modules.sgm.encoders.modules.ConcatTimestepEmbedderND
+          params:
+            outdim: 256
+
+        # 4) Conditioning frame passed through an autoencoder encoder.
+        - input_key: cond_frames
+          is_trainable: False
+          target: imaginairy.modules.sgm.encoders.modules.VideoPredictionEmbedderWithEncoder
+          params:
+            disable_encoder_autocast: False
+            n_cond_frames: 1
+            n_copies: 1
+            is_ae: True
+            encoder_config:
+              target: imaginairy.modules.sgm.autoencoder.AutoencoderKLModeOnly
+              params:
+                embed_dim: 4
+                monitor: val/rec_loss
+                ddconfig:
+                  # NOTE(review): this encoder uses vanilla-xformers while the
+                  # first_stage_config encoder/decoder use vanilla -- confirm
+                  # the difference is intentional.
+                  attn_type: vanilla-xformers
+                  double_z: True
+                  z_channels: 4
+                  resolution: 256
+                  in_channels: 3
+                  out_ch: 3
+                  ch: 128
+                  ch_mult: [1, 2, 4, 4]
+                  num_res_blocks: 2
+                  attn_resolutions: []
+                  dropout: 0.0
+                lossconfig:
+                  target: torch.nn.Identity
+
+        # 5) cond_aug, embedded to 256 dims.
+        - input_key: cond_aug
+          is_trainable: False
+          target: imaginairy.modules.sgm.encoders.modules.ConcatTimestepEmbedderND
+          params:
+            outdim: 256
+
+    # First stage: an AutoencodingEngine pairing an Encoder with a VideoDecoder.
+    # Encoder and decoder carry the same parameter values; the decoder
+    # additionally sets video_kernel_size. Loss is torch.nn.Identity.
+    first_stage_config:
+      target: imaginairy.modules.sgm.autoencoder.AutoencodingEngine
+      params:
+        loss_config:
+          target: torch.nn.Identity
+        regularizer_config:
+          target: imaginairy.modules.sgm.autoencoding.regularizers.DiagonalGaussianRegularizer
+        encoder_config: 
+          target: imaginairy.modules.sgm.diffusionmodules.model.Encoder
+          params:
+            attn_type: vanilla
+            double_z: True
+            z_channels: 4
+            resolution: 256
+            in_channels: 3
+            out_ch: 3
+            ch: 128
+            ch_mult: [1, 2, 4, 4]
+            num_res_blocks: 2
+            attn_resolutions: []
+            dropout: 0.0
+        decoder_config:
+          target: imaginairy.modules.sgm.autoencoding.temporal_ae.VideoDecoder
+          params:
+            attn_type: vanilla
+            double_z: True
+            z_channels: 4
+            resolution: 256
+            in_channels: 3
+            out_ch: 3
+            ch: 128
+            ch_mult: [1, 2, 4, 4]
+            num_res_blocks: 2
+            attn_resolutions: []
+            dropout: 0.0
+            video_kernel_size: [3, 1, 1]
+
+    # Sampler: EulerEDMSampler with an EDMDiscretization (sigma_max 700.0) and a
+    # LinearPredictionGuider whose scale is bounded by min_scale and max_scale.
+    # NOTE(review): how the guider moves between min_scale and max_scale is
+    # defined in LinearPredictionGuider, not here -- confirm.
+    sampler_config:
+      target: imaginairy.modules.sgm.diffusionmodules.sampling.EulerEDMSampler
+      params:
+        discretization_config:
+          target: imaginairy.modules.sgm.diffusionmodules.discretizer.EDMDiscretization
+          params:
+            sigma_max: 700.0
+
+        guider_config:
+          target: imaginairy.modules.sgm.diffusionmodules.guiders.LinearPredictionGuider
+          params:
+            max_scale: 2.5
+            min_scale: 1.0