import os

# Enable CPU fallback for operators not implemented on Apple's MPS backend.
# It is safest to set this before importing any torch-dependent modules below.
os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'

import cv2
import tempfile
import shutil

from PIL import Image
import gradio as gr

from inference import parse_args
from runners.paint_by_example_inference_runner import paint_by_example_inference_runner
from runners.instructpix2pix_inference_runner import instructpix2pix_inference_runner
from runners.controlnet_inpaint_inference_runner import controlnet_inpaint_inference_runner
from runners.completion_net_inference_runner import depth_completion_net_inference_runner
from runners.i2vgenxl_ctrl_adapter_inference_runner import i2vgenxl_ctrl_adapter_inference_runner
from runners.iterative_warping.run_flow_extraction import raft_flow_extraction_runner
from runners.midas_depth_estimation_runner import midas_depth_estimation_runner
from runners.u2net_saliency_detection_runner import u2net_saliency_detection_runner
from runners.iterative_warping_runner import iterative_warping_runner
from runners.stable_diffusion_inpaint_inference_runner import stable_diffusion_inpaint_inference_runner
from runners.anydoor_inference_runner import anydoor_inference_runner
# Initialize default arguments
args = parse_args()
# TODO: Model paths configuration
args.paint_by_example_checkpoint_path = "checkpoints/paint-by-example"
args.instructpix2pix_checkpoint_path = "checkpoints/instruct-pix2pix"
args.stable_diffusion_inpaint_checkpoint_path = "checkpoints/stable-diffusion-inpaint"
args.anydoor_checkpoint_path = "checkpoints/anydoor"
args.raft_checkpoint_path = "checkpoints/raft/raft-things.pth"
args.midas_checkpoint_path = "checkpoints/dpt_swin2_large_384.pt"
args.u2net_checkpoint_path = "checkpoints/u2net.pth"
args.stable_diffusion_checkpoint_path = "checkpoints/stable-diffusion-v1.5"
args.controlnet_checkpoint_path = "checkpoints/controlnet-depth"
args.ctrl_adapter_checkpoint_path = "checkpoints/ctrl-adapter-i2vgenxl-depth"
args.i2vgenxl_checkpoint_path = "checkpoints/i2vgenxl"
args.completion_net_checkpoint_path = "checkpoints/50000.ckpt"
args.edited_first_frame = None
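
# The handler below runs the full StableV2V pipeline:
#   1. edit the first video frame with the selected image editor,
#   2. align the source video's motion to the edited content
#      (optical flow + depth estimation, iterative warping, depth completion),
#   3. generate the output video with the I2VGen-XL + Ctrl-Adapter runner,
#      conditioned on the edited depth maps.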
def run_stablev2v(
    video_input,
    prompt,
    image_editor_type,
    external_guidance_text=None,
    external_guidance_image=None,
    negative_prompt=None,
    guidance_scale=9.0,
    n_frames=16,
    output_fps=16,
    height=512,
    width=512,
    edited_first_frame=None,
    kernel_size=11,
    dilation_iteration=3,
    mixed_precision='bf16',
    seed=42,
):
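    # `video_input` is the list of frame files coming from the Gradio `Files`
    # component; all intermediate and final results are written under `args.outdir`
    # ("results"), not into the temporary directory created below.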
    # Create temporary directories for processing
    with tempfile.TemporaryDirectory() as temp_dir:
        # Extract frames from video to temp directory
        frames_dir = os.path.join(temp_dir, "frames")
        os.makedirs(frames_dir, exist_ok=True)

        # Set up arguments
        args.prompt = prompt
        args.image_editor_type = image_editor_type
        args.height = height
        args.width = width
        args.n_sample_frames = n_frames
        args.output_fps = output_fps
        args.guidance_scale = guidance_scale
        args.outdir = "results"
        args.kernel_size = kernel_size
        args.dilation_iteration = dilation_iteration
        args.mixed_precision = mixed_precision
        args.seed = seed
        args.controlnet_conditioning_scale = 1.0
        args.control_guidance_start = 0.0
        args.control_guidance_end = 1.0
        args.sparse_frames = None
        args.skip_conv_in = False
        args.num_inference_steps = 50
        args.video_length = 8
        args.video_duration = 1000
        args.input_condition = None
        args.reference_image = None
        args.reference_masks = None
        args.image_guidance_scale = 1.0
        args.anydoor_config_path = 'models/anydoor/configs/inference.yaml'
        os.makedirs(args.outdir, exist_ok=True)
        os.makedirs(os.path.join(args.outdir, 'frames'), exist_ok=True)
        if negative_prompt:
            args.negative_prompt = negative_prompt
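
        # Save the uploaded frames as PNG files under `args.outdir`/frames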
        for video_frame_path in video_input:
            current_frame = Image.open(video_frame_path)
            filename = os.path.basename(video_frame_path).split('.')[0] + '.png'
            frame_path = os.path.join(args.outdir, 'frames', filename)
            current_frame.save(frame_path)
        args.source_video_frames = os.path.join(args.outdir, 'frames')

        if image_editor_type in ['paint-by-example', 'anydoor'] and edited_first_frame is None:
            external_guidance_image.save(os.path.join(args.outdir, 'reference_image.png'))
            args.external_guidance = os.path.join(args.outdir, 'reference_image.png')
        elif image_editor_type == 'instructpix2pix' and edited_first_frame is None:
            args.external_guidance = external_guidance_text
        if edited_first_frame:
            edited_first_frame.save(os.path.join(args.outdir, 'edited_first_frame.png'))
            args.edited_first_frame = os.path.join(args.outdir, 'edited_first_frame.png')
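
        # If the user supplied an already-edited first frame, the image-editing step
        # below is skipped and that frame is used directly. Otherwise U2-Net first
        # detects the salient object, presumably so the inpainting-based editors have
        # object masks available.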
        # Run inference pipeline
        u2net_saliency_detection_runner(args)
        args.input_masks = os.path.join(args.outdir, 'iterative_warping', 'object_masks')

        # 1. Run image editing on the first video frame
        if args.edited_first_frame is None:
            if args.image_editor_type == 'paint-by-example':
                assert args.external_guidance is not None, "External guidance must be provided for the `Paint-by-Example` editor."
                paint_by_example_inference_runner(args)
            elif args.image_editor_type == "instructpix2pix":
                assert args.external_guidance is not None, "External guidance must be provided for the `InstructPix2Pix` editor."
                instructpix2pix_inference_runner(args)
            elif args.image_editor_type == "controlnet-inpaint":
                assert args.external_guidance is not None, "External guidance must be provided for the `ControlNet Inpaint` editor."
                controlnet_inpaint_inference_runner(args)
            elif args.image_editor_type == "stable-diffusion-inpaint":
                stable_diffusion_inpaint_inference_runner(args)
            elif args.image_editor_type == "anydoor":
                anydoor_inference_runner(args)
        print("\n1. Image editing done.\n")
        # 2. Run iterative warping
        # Extract optical flows from source video frames
        raft_flow_extraction_runner(args)
        print("\n2. Optical flow extraction done.\n")

        # Extract depth maps from source video frames
        midas_depth_estimation_runner(args)
        print("\n3. Depth map extraction done.\n")
        # Extract object masks from the source video frames
        if args.input_masks is None:
            u2net_saliency_detection_runner(args)
        else:
            # Copy files in args.input_masks to the object_masks directory
            object_masks_dir = os.path.join(args.outdir, 'iterative_warping', 'object_masks')
            os.makedirs(object_masks_dir, exist_ok=True)
            for mask_file in sorted(os.listdir(args.input_masks))[:args.n_sample_frames]:
                src_path = os.path.join(args.input_masks, mask_file)
                dst_path = os.path.join(object_masks_dir, mask_file)
                if os.path.isfile(src_path) and not os.path.isfile(dst_path):
                    shutil.copy(src_path, dst_path)

        # Extract the editing (shape) mask from the edited first frame
        if args.edited_first_frame is None:
            args.edited_first_frame = os.path.join(args.outdir, 'image_editing_results', f'{args.prompt.lower().replace(" ", "_")}.png')
        u2net_saliency_detection_runner(args, args.edited_first_frame)
        print("\n4. Object and editing masks extraction done.\n")

        # Get edited optical flows and depth maps
        iterative_warping_runner(args)
        print("\n5. Iterative warping done.\n")

        # Run depth completion net to remove extra region
        depth_completion_net_inference_runner(args)
        print("\n6. Depth completion done.\n")

        # 3. Run image-to-video generation with the edited depth maps
        i2vgenxl_ctrl_adapter_inference_runner(args)
        print("\n7. Image-to-video generation done.\n")
        def image_to_video(file_path, output, fps=24):
            img_list = sorted(os.listdir(file_path))
            # Read the first frame to determine the output resolution
            image = cv2.imread(os.path.join(file_path, img_list[0]))
            height, width, _ = image.shape
            fourcc = cv2.VideoWriter_fourcc('M', 'P', '4', 'V')
            videowriter = cv2.VideoWriter(output, fourcc, fps, (width, height))
            for img in img_list:
                path = os.path.join(file_path, img)
                frame = cv2.imread(path)
                videowriter.write(frame)
            videowriter.release()

        # Return the path of the generated MP4 video
        result_path = os.path.join(args.outdir, 'generator_outputs', 'output_frames')
        image_to_video(result_path, os.path.join(args.outdir, 'generator_outputs', 'output_video.mp4'))
        return os.path.join(args.outdir, 'generator_outputs', 'output_video.mp4')


# Create Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Gradio Demo for StableV2V")
    with gr.Row():
        # Left column - Input settings
        with gr.Column(scale=1):
            video_input = gr.Files(label="Input Frames Folder", file_count="directory")
            prompt = gr.Textbox(label="Prompt", value="a high quality video")
            image_editor_type = gr.Dropdown(
                label="Image Editor Type",
                choices=["paint-by-example", "instructpix2pix", "controlnet-inpaint",
                         "stable-diffusion-inpaint", "anydoor"],
                value="paint-by-example"
            )
            submit_btn = gr.Button("Generate")

        # Middle column - Advanced settings
        with gr.Column(scale=1):
            # Guidance inputs
            with gr.Tabs():
                with gr.Tab("Reference Image"):
                    external_guidance_image = gr.Image(
                        label="Required by Paint-by-Example and AnyDoor",
                        type='pil'
                    )
                with gr.Tab("User Instructions"):
                    external_guidance_text = gr.Textbox(
                        label="Required by InstructPix2Pix",
                        type='text'
                    )
                with gr.Tab("First Edited Frame"):
                    edited_first_frame = gr.Image(
                        label="First Edited Frame",
                        type='pil'
                    )
            with gr.Accordion("Advanced Options", open=False):
                mixed_precision = gr.Dropdown(
                    label="Mixed Precision",
                    choices=["fp32", "fp16", "bf16"],
                    value="bf16",
                    type='value'
                )
                negative_prompt = gr.Textbox(
                    label="Negative Prompt",
                    value="Distorted, discontinuous, Ugly, blurry, low resolution, motionless, static, disfigured, disconnected limbs, Ugly faces, incomplete arms"
                )
                guidance_scale = gr.Slider(
                    minimum=1, maximum=20, value=9.0,
                    label="Guidance Scale"
                )
                n_frames = gr.Slider(
                    minimum=1, maximum=32, value=16, step=1,
                    label="Number of Frames"
                )
                output_fps = gr.Slider(
                    minimum=1, maximum=30, value=16, step=1,
                    label="Output FPS"
                )
                height = gr.Slider(
                    minimum=1, maximum=1024, value=512, step=1,
                    label="Height"
                )
                width = gr.Slider(
                    minimum=1, maximum=1024, value=512, step=1,
                    label="Width"
                )
                kernel_size = gr.Slider(
                    minimum=1, maximum=49, value=11, step=2,
                    label="Kernel Size"
                )
                dilation_iteration = gr.Slider(
                    minimum=1, maximum=10, value=3, step=1,
                    label="Dilation Iterations"
                )
                seed = gr.Slider(
                    minimum=1, maximum=1000000, value=42, step=1,
                    label="Seed"
                )

        # Right column - Output
        with gr.Column(scale=1):
            output_video = gr.Video(label="Generated Video")
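
    # The `inputs` list below is passed to `run_stablev2v` positionally, so its order
    # must match the function's parameter order.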
    # Submit button click handler
    submit_btn.click(
        run_stablev2v,
        inputs=[
            video_input,
            prompt,
            image_editor_type,
            external_guidance_text,
            external_guidance_image,
            negative_prompt,
            guidance_scale,
            n_frames,
            output_fps,
            height,
            width,
            edited_first_frame,
            kernel_size,
            dilation_iteration,
            mixed_precision,
            seed,
        ],
        outputs=output_video
    )

if __name__ == "__main__":
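    # Start the local Gradio server; pass `share=True` to expose a public link.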
    demo.launch()