Skip to content

Commit

Permalink
feature: stable diffusion video (SVD)
Browse files Browse the repository at this point in the history
  • Loading branch information
brycedrennan committed Nov 23, 2023
1 parent 80ff006 commit e8fe8d7
Show file tree
Hide file tree
Showing 55 changed files with 9,453 additions and 6 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
SHELL := /bin/bash
python_version = 3.10.10
python_version = 3.10.13
venv_prefix = imaginairy
venv_name = $(venv_prefix)-$(python_version)
pyenv_instructions=https://github.com/pyenv/pyenv#installation
Expand Down
Binary file added assets/rocket-wide.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 2 additions & 0 deletions imaginairy/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from imaginairy.cli.run_api import run_server_cmd
from imaginairy.cli.train import prep_images_cmd, prune_ckpt_cmd, train_concept_cmd
from imaginairy.cli.upscale import upscale_cmd
from imaginairy.cli.videogen import videogen_cmd

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -50,6 +51,7 @@ def aimg(ctx):
aimg.add_command(train_concept_cmd, name="train-concept")
aimg.add_command(upscale_cmd, name="upscale")
aimg.add_command(run_server_cmd, name="server")
aimg.add_command(videogen_cmd, name="videogen")


@aimg.command()
Expand Down
92 changes: 92 additions & 0 deletions imaginairy/cli/videogen.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
import logging

import click

logger = logging.getLogger(__name__)


@click.command()
@click.option(
    "--start-image",
    default="other/images/sound-music.jpg",
    help="Input path for image file.",
)
@click.option("--num-frames", default=None, type=int, help="Number of frames.")
@click.option("--num-steps", default=None, type=int, help="Number of steps.")
@click.option(
    "--model",
    default="svd",
    help="Model to use. One of: svd, svd_xt, svd_image_decoder, svd_xt_image_decoder",
)
@click.option(
    "--fps", default=6, type=int, help="FPS for the AI to target when generating video"
)
@click.option("--output-fps", default=None, type=int, help="FPS for the output video")
@click.option(
    "--motion-amount",
    default=127,
    type=int,
    help="How much motion to generate. value between 0 and 255.",
)
@click.option(
    "-r",
    "--repeats",
    default=1,
    show_default=True,
    type=int,
    help="How many times to repeat the renders. ",
)
@click.option("--cond-aug", default=0.02, type=float, help="Conditional augmentation.")
@click.option(
    "--seed", default=None, type=int, help="Seed for random number generator."
)
# The hyphenated spelling matches every other option of this command; the
# original underscore spelling stays as an alias so existing invocations keep
# working. The trailing bare name pins the Python parameter name explicitly.
@click.option(
    "--decoding-t",
    "--decoding_t",
    "decoding_t",
    default=1,
    type=int,
    help="Number of frames decoded at a time.",
)
@click.option("--device", default=None, help="Device to use.")
# Same aliasing as --decoding-t above.
@click.option(
    "--output-folder",
    "--output_folder",
    "output_folder",
    default=None,
    help="Output folder.",
)
def videogen_cmd(
    start_image,
    num_frames,
    num_steps,
    model,
    fps,
    output_fps,
    motion_amount,
    repeats,
    cond_aug,
    seed,
    decoding_t,
    device,
    output_folder,
):
    """
    AI generate a video from an image

    Example:

        aimg videogen --start-image assets/rocket-wide.png
    """
    # NOTE: the docstring above is rendered by click as the command's --help
    # text, so developer-facing notes live in comments instead.
    #
    # Every option is forwarded unchanged to `generate_video`; only the names
    # differ for three of them:
    #   start_image   -> input_path
    #   fps           -> fps_id
    #   motion_amount -> motion_bucket_id
    # `num_frames`, `num_steps`, `seed`, `device` and `output_folder` default
    # to None and are passed through as None when not given on the command line.

    # Imported inside the command rather than at module import time
    # (presumably to keep CLI startup light -- confirm before moving them).
    from imaginairy.log_utils import configure_logging
    from imaginairy.video_sample import generate_video

    configure_logging()

    # Without an explicit --output-fps the output uses the generation FPS.
    output_fps = output_fps or fps

    # NOTE(review): with an explicit --seed every repeat passes the same seed;
    # whether that produces identical videos depends on generate_video.
    for _ in range(repeats):
        logger.info("Generating video from image %s", start_image)
        generate_video(
            input_path=start_image,
            num_frames=num_frames,
            num_steps=num_steps,
            model_name=model,
            fps_id=fps,
            output_fps=output_fps,
            motion_bucket_id=motion_amount,
            cond_aug=cond_aug,
            seed=seed,
            decoding_t=decoding_t,
            device=device,
            output_folder=output_folder,
        )
37 changes: 37 additions & 0 deletions imaginairy/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,43 @@ class ModelConfig:
),
]


# Registry of the video model variants, keyed by each entry's "short_name".
# Every entry carries the default frame count, default step count, the
# config path and the URL of the weights file.
video_models = {
    entry["short_name"]: entry
    for entry in (
        {
            "short_name": "svd",
            "description": "Stable Video Diffusion",
            "default_frames": 14,
            "default_steps": 25,
            "config_path": "configs/svd.yaml",
            "weights_url": "https://huggingface.co/imaginairy/stable-video-diffusion/resolve/f9dce2757a0713da6262f35438050357c2be7ee6/svd.fp16.safetensors",
        },
        {
            "short_name": "svd_image_decoder",
            "description": "Stable Video Diffusion - Image Decoder",
            "default_frames": 14,
            "default_steps": 25,
            "config_path": "configs/svd_image_decoder.yaml",
            "weights_url": "https://huggingface.co/imaginairy/stable-video-diffusion/resolve/f9dce2757a0713da6262f35438050357c2be7ee6/svd_image_decoder.fp16.safetensors",
        },
        {
            "short_name": "svd_xt",
            "description": "Stable Video Diffusion - XT",
            "default_frames": 25,
            "default_steps": 30,
            "config_path": "configs/svd_xt.yaml",
            "weights_url": "https://huggingface.co/imaginairy/stable-video-diffusion/resolve/f9dce2757a0713da6262f35438050357c2be7ee6/svd_xt.fp16.safetensors",
        },
        {
            "short_name": "svd_xt_image_decoder",
            "description": "Stable Video Diffusion - XT - Image Decoder",
            "default_frames": 25,
            "default_steps": 30,
            "config_path": "configs/svd_xt_image_decoder.yaml",
            "weights_url": "https://huggingface.co/imaginairy/stable-video-diffusion/resolve/f9dce2757a0713da6262f35438050357c2be7ee6/svd_xt_image_decoder.fp16.safetensors",
        },
    )
}

MODEL_CONFIG_SHORTCUTS = {m.short_name: m for m in MODEL_CONFIGS}
for m in MODEL_CONFIGS:
if m.alias:
Expand Down
146 changes: 146 additions & 0 deletions imaginairy/configs/svd.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
# Configuration for the "svd" video model (imaginairy/configs/svd.yaml).
# Each `target` is a dotted path to a class and the `params` that follow it
# are that class's settings.
model:
target: imaginairy.modules.sgm.diffusion.DiffusionEngine
params:
scale_factor: 0.18215
disable_first_stage_autocast: False


# Denoiser: Denoiser using VScalingWithEDMcNoise scaling.
denoiser_config:
target: imaginairy.modules.sgm.diffusionmodules.denoiser.Denoiser
params:
scaling_config:
target: imaginairy.modules.sgm.diffusionmodules.denoiser_scaling.VScalingWithEDMcNoise

# Network: VideoUNet (8 input channels, 4 output channels).
network_config:
target: imaginairy.modules.sgm.diffusionmodules.video_model.VideoUNet
params:
adm_in_channels: 768
num_classes: sequential
use_checkpoint: False
in_channels: 8
out_channels: 4
model_channels: 320
attention_resolutions: [4, 2, 1]
num_res_blocks: 2
channel_mult: [1, 2, 4, 4]
num_head_channels: 64
use_linear_in_transformer: True
transformer_depth: 1
context_dim: 1024
spatial_transformer_attn_type: softmax-xformers
extra_ff_mix_layer: True
use_spatial_context: True
merge_strategy: learned_with_images
video_kernel_size: [3, 1, 1]

# Conditioner: GeneralConditioner with five embedders, one per input_key
# (cond_frames_without_noise, fps_id, motion_bucket_id, cond_frames, cond_aug).
# None of them is trainable.
conditioner_config:
target: imaginairy.modules.sgm.encoders.modules.GeneralConditioner
params:
emb_models:
# Embedder 1: OpenCLIP image embedder, frozen.
- is_trainable: False
input_key: cond_frames_without_noise
target: imaginairy.modules.sgm.encoders.modules.FrozenOpenCLIPImagePredictionEmbedder
params:
n_cond_frames: 1
n_copies: 1
open_clip_embedding_config:
target: imaginairy.modules.sgm.encoders.modules.FrozenOpenCLIPImageEmbedder
params:
freeze: True

# Embedder 2: fps_id (same name as the fps_id keyword the videogen CLI passes
# to generate_video).
- input_key: fps_id
is_trainable: False
target: imaginairy.modules.sgm.encoders.modules.ConcatTimestepEmbedderND
params:
outdim: 256

# Embedder 3: motion_bucket_id (the CLI's --motion-amount is passed to
# generate_video under this name).
- input_key: motion_bucket_id
is_trainable: False
target: imaginairy.modules.sgm.encoders.modules.ConcatTimestepEmbedderND
params:
outdim: 256

# Embedder 4: cond_frames, encoded with an AutoencoderKLModeOnly encoder.
- input_key: cond_frames
is_trainable: False
target: imaginairy.modules.sgm.encoders.modules.VideoPredictionEmbedderWithEncoder
params:
disable_encoder_autocast: False
n_cond_frames: 1
n_copies: 1
is_ae: True
encoder_config:
target: imaginairy.modules.sgm.autoencoder.AutoencoderKLModeOnly
params:
embed_dim: 4
monitor: val/rec_loss
ddconfig:
attn_type: vanilla-xformers
double_z: True
z_channels: 4
resolution: 256
in_channels: 3
out_ch: 3
ch: 128
ch_mult: [1, 2, 4, 4]
num_res_blocks: 2
attn_resolutions: []
dropout: 0.0
lossconfig:
target: torch.nn.Identity

# Embedder 5: cond_aug (same name as the CLI's --cond-aug option).
- input_key: cond_aug
is_trainable: False
target: imaginairy.modules.sgm.encoders.modules.ConcatTimestepEmbedderND
params:
outdim: 256

# First stage: AutoencodingEngine with an Encoder and a VideoDecoder.
# Loss is torch.nn.Identity; regularizer is DiagonalGaussianRegularizer.
first_stage_config:
target: imaginairy.modules.sgm.autoencoder.AutoencodingEngine
params:
loss_config:
target: torch.nn.Identity
regularizer_config:
target: imaginairy.modules.sgm.autoencoding.regularizers.DiagonalGaussianRegularizer
encoder_config:
target: imaginairy.modules.sgm.diffusionmodules.model.Encoder
params:
attn_type: vanilla
double_z: True
z_channels: 4
resolution: 256
in_channels: 3
out_ch: 3
ch: 128
ch_mult: [1, 2, 4, 4]
num_res_blocks: 2
attn_resolutions: []
dropout: 0.0
# Decoder settings repeat the encoder's and add video_kernel_size.
decoder_config:
target: imaginairy.modules.sgm.autoencoding.temporal_ae.VideoDecoder
params:
attn_type: vanilla
double_z: True
z_channels: 4
resolution: 256
in_channels: 3
out_ch: 3
ch: 128
ch_mult: [1, 2, 4, 4]
num_res_blocks: 2
attn_resolutions: []
dropout: 0.0
video_kernel_size: [3, 1, 1]

# Sampler: EulerEDMSampler with EDMDiscretization (sigma_max 700.0) and a
# LinearPredictionGuider whose scale is bounded by min_scale 1.0 and max_scale 2.5.
sampler_config:
target: imaginairy.modules.sgm.diffusionmodules.sampling.EulerEDMSampler
params:
discretization_config:
target: imaginairy.modules.sgm.diffusionmodules.discretizer.EDMDiscretization
params:
sigma_max: 700.0

guider_config:
target: imaginairy.modules.sgm.diffusionmodules.guiders.LinearPredictionGuider
params:
max_scale: 2.5
min_scale: 1.0
Loading

0 comments on commit e8fe8d7

Please sign in to comment.