-
Notifications
You must be signed in to change notification settings - Fork 965
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #2183 from graemeniedermayer/sd35_integration
sd3.5 integration (naive)
- Loading branch information
Showing 32 changed files with 332,140 additions and 11 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,137 @@ | ||
import torch | ||
|
||
from huggingface_guess import model_list | ||
# from huggingface_guess.latent import SD3 | ||
from backend.diffusion_engine.base import ForgeDiffusionEngine, ForgeObjects | ||
from backend.patcher.clip import CLIP | ||
from backend.patcher.vae import VAE | ||
from backend.patcher.unet import UnetPatcher | ||
from backend.text_processing.classic_engine import ClassicTextProcessingEngine | ||
from backend.text_processing.t5_engine import T5TextProcessingEngine | ||
from backend.args import dynamic_args | ||
from backend import memory_management | ||
from backend.modules.k_prediction import PredictionDiscreteFlow | ||
|
||
class StableDiffusion3(ForgeDiffusionEngine):
    """Forge diffusion engine matched against ``model_list.SD35``.

    Builds the Forge ``CLIP``, ``VAE`` and ``UnetPatcher`` wrappers from the
    loaded Hugging Face components and exposes prompt conditioning and
    latent encode/decode helpers on top of them.
    """

    matched_guesses = [model_list.SD35]

    def __init__(self, estimated_config, huggingface_components):
        """Assemble the engine from already-loaded Hugging Face components.

        Args:
            estimated_config: Model configuration forwarded to the base class
                and to ``UnetPatcher.from_model``.
            huggingface_components: Mapping that must provide
                ``text_encoder``, ``text_encoder_2``, ``text_encoder_3``,
                ``tokenizer``, ``tokenizer_2``, ``tokenizer_3``, ``vae`` and
                ``transformer``.
        """
        super().__init__(estimated_config, huggingface_components)

        # Three text encoders and their tokenizers, keyed the same way so the
        # CLIP wrapper can pair them up.
        text_encoders = {
            'clip_l': huggingface_components['text_encoder'],
            'clip_g': huggingface_components['text_encoder_2'],
            't5xxl': huggingface_components['text_encoder_3'],
        }
        tokenizers = {
            'clip_l': huggingface_components['tokenizer'],
            'clip_g': huggingface_components['tokenizer_2'],
            't5xxl': huggingface_components['tokenizer_3'],
        }
        clip = CLIP(model_dict=text_encoders, tokenizer_dict=tokenizers)

        flow_predictor = PredictionDiscreteFlow(shift=3.0)

        vae = VAE(model=huggingface_components['vae'])

        unet = UnetPatcher.from_model(
            model=huggingface_components['transformer'],
            diffusers_scheduler=None,
            k_predictor=flow_predictor,
            config=estimated_config,
        )

        # Settings shared by both CLIP text-processing engines; only the
        # encoder, tokenizer, embedding key and embedding width differ.
        shared_clip_options = {
            'embedding_dir': dynamic_args['embedding_dir'],
            'emphasis_name': dynamic_args['emphasis_name'],
            'text_projection': True,
            'minimal_clip_skip': 1,
            'clip_skip': 1,
            'return_pooled': True,
            'final_layer_norm': False,
        }

        self.text_processing_engine_l = ClassicTextProcessingEngine(
            text_encoder=clip.cond_stage_model.clip_l,
            tokenizer=clip.tokenizer.clip_l,
            embedding_key='clip_l',
            embedding_expected_shape=768,
            **shared_clip_options,
        )

        self.text_processing_engine_g = ClassicTextProcessingEngine(
            text_encoder=clip.cond_stage_model.clip_g,
            tokenizer=clip.tokenizer.clip_g,
            embedding_key='clip_g',
            embedding_expected_shape=1280,
            **shared_clip_options,
        )

        self.text_processing_engine_t5 = T5TextProcessingEngine(
            text_encoder=clip.cond_stage_model.t5xxl,
            tokenizer=clip.tokenizer.t5xxl,
            emphasis_name=dynamic_args['emphasis_name'],
        )

        self.forge_objects = ForgeObjects(unet=unet, clip=clip, vae=vae, clipvision=None)
        self.forge_objects_original = self.forge_objects.shallow_copy()
        self.forge_objects_after_applying_lora = self.forge_objects.shallow_copy()

        # WebUI Legacy
        self.is_sd3 = True

    def set_clip_skip(self, clip_skip):
        """Apply ``clip_skip`` to both CLIP engines (the T5 engine is untouched)."""
        for engine in (self.text_processing_engine_l, self.text_processing_engine_g):
            engine.clip_skip = clip_skip

    @torch.inference_mode()
    def get_learned_conditioning(self, prompt: list[str]):
        """Encode ``prompt`` with all three text encoders.

        Args:
            prompt: List of prompt strings. If it carries a truthy
                ``is_negative_prompt`` attribute and every entry is the empty
                string, all encoder outputs are replaced with zeros.

        Returns:
            dict with ``crossattn`` (CLIP-L/CLIP-G features, zero-padded to a
            last dimension of 4096 and concatenated with the T5 features
            along dim ``-2``) and ``vector`` (pooled CLIP-L and CLIP-G
            outputs concatenated along the last dimension).
        """
        memory_management.load_model_gpu(self.forge_objects.clip.patcher)

        cond_g, pooled_g = self.text_processing_engine_g(prompt)
        cond_l, pooled_l = self.text_processing_engine_l(prompt)
        # if enabled?
        cond_t5 = self.text_processing_engine_t5(prompt)

        # An entirely empty negative prompt is conditioned on zeros rather
        # than on the encoders' output for the empty string.
        if getattr(prompt, 'is_negative_prompt', False) and all(x == '' for x in prompt):
            pooled_l, pooled_g, cond_l, cond_g, cond_t5 = (
                torch.zeros_like(t)
                for t in (pooled_l, pooled_g, cond_l, cond_g, cond_t5)
            )

        # Join the two CLIP feature streams, then right-pad the feature
        # dimension to 4096 (presumably the T5 hidden width, so the
        # concatenation along dim -2 below lines up; confirm against the
        # T5 engine).
        clip_features = torch.cat([cond_l, cond_g], dim=-1)
        pad_width = 4096 - clip_features.shape[-1]
        clip_features = torch.nn.functional.pad(clip_features, (0, pad_width))

        return {
            'crossattn': torch.cat([clip_features, cond_t5], dim=-2),
            'vector': torch.cat([pooled_l, pooled_g], dim=-1),
        }

    @torch.inference_mode()
    def get_prompt_lengths_on_ui(self, prompt):
        """Return ``(token_count, limit)`` for the UI counter.

        ``token_count`` is the length of the T5 tokenization of ``prompt``;
        ``limit`` is that count, but never less than 255.
        """
        tokens = self.text_processing_engine_t5.tokenize([prompt])[0]
        token_count = len(tokens)
        return token_count, max(255, token_count)

    @torch.inference_mode()
    def encode_first_stage(self, x):
        """Encode ``x`` into latents with the VAE.

        ``x`` has dim 1 moved to the end and is rescaled with ``* 0.5 + 0.5``
        before encoding; the latent then goes through the VAE model's
        ``process_in`` and is cast back with ``.to(x)``.
        """
        vae = self.forge_objects.vae
        latent = vae.encode(x.movedim(1, -1) * 0.5 + 0.5)
        latent = vae.first_stage_model.process_in(latent)
        return latent.to(x)

    @torch.inference_mode()
    def decode_first_stage(self, x):
        """Decode latents ``x`` with the VAE.

        Applies the VAE model's ``process_out``, decodes, moves the last
        dimension to position 1, rescales with ``* 2.0 - 1.0`` and casts the
        result back with ``.to(x)``.
        """
        vae = self.forge_objects.vae
        latent = vae.first_stage_model.process_out(x)
        decoded = vae.decode(latent).movedim(-1, 1) * 2.0 - 1.0

        return decoded.to(x)
40 changes: 40 additions & 0 deletions
40
backend/huggingface/stabilityai/stable-diffusion-3.5-large/model_index.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
{ | ||
"_class_name": "StableDiffusion3Pipeline", | ||
"_diffusers_version": "0.30.3.dev0", | ||
"scheduler": [ | ||
"diffusers", | ||
"FlowMatchEulerDiscreteScheduler" | ||
], | ||
"text_encoder": [ | ||
"transformers", | ||
"CLIPTextModelWithProjection" | ||
], | ||
"text_encoder_2": [ | ||
"transformers", | ||
"CLIPTextModelWithProjection" | ||
], | ||
"text_encoder_3": [ | ||
"transformers", | ||
"T5EncoderModel" | ||
], | ||
"tokenizer": [ | ||
"transformers", | ||
"CLIPTokenizer" | ||
], | ||
"tokenizer_2": [ | ||
"transformers", | ||
"CLIPTokenizer" | ||
], | ||
"tokenizer_3": [ | ||
"transformers", | ||
"T5TokenizerFast" | ||
], | ||
"transformer": [ | ||
"diffusers", | ||
"SD3Transformer2DModel" | ||
], | ||
"vae": [ | ||
"diffusers", | ||
"AutoencoderKL" | ||
] | ||
} |
6 changes: 6 additions & 0 deletions
6
backend/huggingface/stabilityai/stable-diffusion-3.5-large/scheduler/scheduler_config.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
{ | ||
"_class_name": "FlowMatchEulerDiscreteScheduler", | ||
"_diffusers_version": "0.29.0.dev0", | ||
"num_train_timesteps": 1000, | ||
"shift": 3.0 | ||
} |
24 changes: 24 additions & 0 deletions
24
backend/huggingface/stabilityai/stable-diffusion-3.5-large/text_encoder/config.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
{ | ||
"architectures": [ | ||
"CLIPTextModelWithProjection" | ||
], | ||
"attention_dropout": 0.0, | ||
"bos_token_id": 0, | ||
"dropout": 0.0, | ||
"eos_token_id": 2, | ||
"hidden_act": "quick_gelu", | ||
"hidden_size": 768, | ||
"initializer_factor": 1.0, | ||
"initializer_range": 0.02, | ||
"intermediate_size": 3072, | ||
"layer_norm_eps": 1e-05, | ||
"max_position_embeddings": 77, | ||
"model_type": "clip_text_model", | ||
"num_attention_heads": 12, | ||
"num_hidden_layers": 12, | ||
"pad_token_id": 1, | ||
"projection_dim": 768, | ||
"torch_dtype": "float16", | ||
"transformers_version": "4.41.2", | ||
"vocab_size": 49408 | ||
} |
24 changes: 24 additions & 0 deletions
24
backend/huggingface/stabilityai/stable-diffusion-3.5-large/text_encoder_2/config.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
{ | ||
"architectures": [ | ||
"CLIPTextModelWithProjection" | ||
], | ||
"attention_dropout": 0.0, | ||
"bos_token_id": 0, | ||
"dropout": 0.0, | ||
"eos_token_id": 2, | ||
"hidden_act": "gelu", | ||
"hidden_size": 1280, | ||
"initializer_factor": 1.0, | ||
"initializer_range": 0.02, | ||
"intermediate_size": 5120, | ||
"layer_norm_eps": 1e-05, | ||
"max_position_embeddings": 77, | ||
"model_type": "clip_text_model", | ||
"num_attention_heads": 20, | ||
"num_hidden_layers": 32, | ||
"pad_token_id": 1, | ||
"projection_dim": 1280, | ||
"torch_dtype": "float16", | ||
"transformers_version": "4.41.2", | ||
"vocab_size": 49408 | ||
} |
31 changes: 31 additions & 0 deletions
31
backend/huggingface/stabilityai/stable-diffusion-3.5-large/text_encoder_3/config.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
{ | ||
"architectures": [ | ||
"T5EncoderModel" | ||
], | ||
"classifier_dropout": 0.0, | ||
"d_ff": 10240, | ||
"d_kv": 64, | ||
"d_model": 4096, | ||
"decoder_start_token_id": 0, | ||
"dense_act_fn": "gelu_new", | ||
"dropout_rate": 0.1, | ||
"eos_token_id": 1, | ||
"feed_forward_proj": "gated-gelu", | ||
"initializer_factor": 1.0, | ||
"is_encoder_decoder": true, | ||
"is_gated_act": true, | ||
"layer_norm_epsilon": 1e-06, | ||
"model_type": "t5", | ||
"num_decoder_layers": 24, | ||
"num_heads": 64, | ||
"num_layers": 24, | ||
"output_past": true, | ||
"pad_token_id": 0, | ||
"relative_attention_max_distance": 128, | ||
"relative_attention_num_buckets": 32, | ||
"tie_word_embeddings": false, | ||
"torch_dtype": "float16", | ||
"transformers_version": "4.41.2", | ||
"use_cache": true, | ||
"vocab_size": 32128 | ||
} |
Oops, something went wrong.