Spaces: Running on Zero
misspoppins #2
by InvisibleFreakKollektiv8020 - opened
- app.py +76 -130
- requirements.txt +4 -7
app.py
CHANGED
@@ -15,8 +15,10 @@ import gc
 from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
 from compel import Compel, ReturnedEmbeddingsType
 
-#
-
+# Import for Wan2.2
+import wan
+from wan.configs import WAN_CONFIGS, SIZE_CONFIGS, MAX_AREA_CONFIGS, SUPPORTED_SIZES
+from wan.utils.utils import cache_video
 
 # --- Global Setup ---
 print("Starting Integrated Text-to-Image-to-Video App...")
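Note for reviewers: with these imports at module scope, the Space now fails fast at startup whenever the `wan` package is missing. The try/except fallback that previously set `WAN_AVAILABLE` and degraded to a text-to-image-only mode is removed throughout this PR; for reference, a guard in the spirit of the deleted code would look like the sketch below (reconstructed from the removed lines, not part of this PR).

    try:
        import wan
        from wan.configs import WAN_CONFIGS, SIZE_CONFIGS, MAX_AREA_CONFIGS, SUPPORTED_SIZES
        from wan.utils.utils import cache_video
        WAN_AVAILABLE = True  # video path enabled
    except ImportError as e:
        # old behavior: keep the app alive with video generation disabled
        print(f"Warning: Wan module not found. Video generation will be disabled. Error: {e}")
        WAN_AVAILABLE = False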
@@ -52,66 +54,35 @@ compel = Compel(
 )
 
 # --- 2. Setup Image-to-Video Model (Wan2.2) ---
-... [old lines 55-83 not rendered in the diff view]
-        dit_fsdp=False,
-        use_sp=False,
-        t5_cpu=False,
-        init_on_cpu=False,
-        convert_model_dtype=True,
-    )
-    WAN_AVAILABLE = True
-    print("Wan model loaded successfully.")
-
-except ImportError as e:
-    print(f"Warning: Wan module not found. Video generation will be disabled. Error: {e}")
-    WAN_AVAILABLE = False
-    # Define default values
-    TASK_NAME = 'ti2v-5B'
-    FIXED_FPS = 24
-    MIN_FRAMES_MODEL = 8
-    MAX_FRAMES_MODEL = 121
-    SUPPORTED_SIZES = {
-        'ti2v-5B': ['704*1280', '576*1024', '512*768', '768*512', '1024*576', '1280*704']
-    }
-    SIZE_CONFIGS = {}
-    MAX_AREA_CONFIGS = {}
-    wan_pipeline = None
-
-    class MockConfig:
-        sample_fps = 24
-        sample_guide_scale = 7.0
-        sample_shift = 3.0
-
-    cfg = MockConfig()
-
+print("Loading Wan 2.2 TI2V-5B model...")
+
+# Download model snapshots
+repo_id = "Wan-AI/Wan2.2-TI2V-5B"
+print(f"Downloading/loading checkpoints for {repo_id}...")
+ckpt_dir = snapshot_download(repo_id, local_dir_use_symlinks=False)
+print(f"Using checkpoints from {ckpt_dir}")
+
+# Load the model configuration
+TASK_NAME = 'ti2v-5B'
+cfg = WAN_CONFIGS[TASK_NAME]
+FIXED_FPS = 24
+MIN_FRAMES_MODEL = 8
+MAX_FRAMES_MODEL = 121
+
+# Instantiate the pipeline
+device_id = 0 if torch.cuda.is_available() else -1
+wan_pipeline = wan.WanTI2V(
+    config=cfg,
+    checkpoint_dir=ckpt_dir,
+    device_id=device_id,
+    rank=0,
+    t5_fsdp=False,
+    dit_fsdp=False,
+    use_sp=False,
+    t5_cpu=False,
+    init_on_cpu=False,
+    convert_model_dtype=True,
+)
 print("All models loaded and ready.")
 
 # --- Constants ---
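Two small notes on the new loading block: `snapshot_download` has to be in scope already (the import hunk above does not add it, so it is presumably imported from `huggingface_hub` elsewhere in app.py), and `local_dir_use_symlinks` only affects downloads that pass a `local_dir`, so here it is effectively a no-op. A minimal sketch of the equivalent call, assuming the default HF cache is acceptable:

    from huggingface_hub import snapshot_download

    # Fetches (or reuses) the full repo snapshot in the local HF cache and
    # returns the directory holding the Wan2.2-TI2V-5B checkpoints.
    ckpt_dir = snapshot_download("Wan-AI/Wan2.2-TI2V-5B")
    print(f"Using checkpoints from {ckpt_dir}")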
@@ -259,9 +230,6 @@ def generate_video(
     progress=gr.Progress(track_tqdm=True)
 ):
     """Generate video from image and prompt"""
-    if not WAN_AVAILABLE:
-        raise gr.Error("Video generation is not available. The Wan module is not installed.")
-
     errors = validate_video_inputs(image, prompt, duration_seconds)
     if errors:
         raise gr.Error("\n".join(errors))
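For context, `validate_video_inputs` works against the frame budget fixed at load time (FIXED_FPS = 24, MIN_FRAMES_MODEL = 8, MAX_FRAMES_MODEL = 121, i.e. roughly 0.3 to 5 seconds). A sketch of the duration-to-frames mapping those constants imply; the helper name is illustrative, not from app.py:

    FIXED_FPS = 24
    MIN_FRAMES_MODEL = 8
    MAX_FRAMES_MODEL = 121

    def duration_to_num_frames(duration_seconds: float) -> int:
        # 24 fps is fixed, so clamp the requested length to 8..121 frames;
        # the longest possible clip is therefore 121 / 24 ≈ 5 seconds.
        frames = int(round(duration_seconds * FIXED_FPS))
        return max(MIN_FRAMES_MODEL, min(frames, MAX_FRAMES_MODEL))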
@@ -305,7 +273,6 @@ def generate_video(
 
     progress(0.9, desc="Saving video...")
 
-    from wan.utils.utils import cache_video
     video_path = cache_video(
         tensor=video_tensor[None],
         save_file=None,
@@ -363,21 +330,17 @@ def generate_image_to_video(
     available_sizes = list(SUPPORTED_SIZES[TASK_NAME])
     best_size = select_best_size_for_image(generated_image, available_sizes)
 
-... [old lines 366-376 not rendered in the diff view]
-        )
-    else:
-        video_path = None
-        gr.Warning("Video generation skipped - Wan module not available")
+    # Then generate video using the generated image
+    video_path = generate_video(
+        generated_image,
+        video_prompt,
+        best_size,  # Use auto-selected size
+        video_duration,
+        video_sampling_steps,
+        video_guide_scale,
+        video_shift,
+        video_seed
+    )
 
     return generated_image, video_path, used_seed, best_size
 
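With the `WAN_AVAILABLE` branch gone, `generate_video` is invoked here as a plain Python function rather than through a Gradio event, relying on its trailing `progress=gr.Progress(track_tqdm=True)` default to stand in for the injected tracker. The assumed positional correspondence (parameter names beyond `image`, `prompt`, and `duration_seconds` are inferred from the click-handler inputs, not shown in this diff):

    video_path = generate_video(
        generated_image,       # image
        video_prompt,          # prompt
        best_size,             # size, auto-selected to match the image
        video_duration,        # duration_seconds
        video_sampling_steps,  # sampling steps
        video_guide_scale,     # guidance scale
        video_shift,           # sample shift
        video_seed,            # seed (-1 for random)
    )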
@@ -412,7 +375,7 @@ with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
 
     Generate images from text and convert them to high-quality videos using:
     - **Stable Diffusion XL** for Text-to-Image generation
-    - **Wan 2.2 5B** for Image-to-Video generation
+    - **Wan 2.2 5B** for Image-to-Video generation
 
     ### ✨ Features:
     - 📝 **Text-to-Image**: Generate images from text descriptions
@@ -437,11 +400,6 @@ with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
     """
     )
 
-    if not WAN_AVAILABLE:
-        gr.Markdown("""
-        ⚠️ **Warning**: The Wan video generation module is not available.
-        Only text-to-image generation will work. Video generation features are disabled.
-        """)
 
     with gr.Tabs() as tabs:
         # Tab 1: Text-to-Image
@@ -477,9 +435,6 @@ with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
 
         # Tab 2: Image-to-Video
         with gr.Tab("Image to Video", id="i2v_tab"):
-            if not WAN_AVAILABLE:
-                gr.Markdown("### ⚠️ Video generation is not available - Wan module not installed")
-
             with gr.Row():
                 with gr.Column(scale=1):
                     i2v_image = gr.Image(type="numpy", label="Input Image", elem_id="input_image")
@@ -515,12 +470,7 @@ with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
                     i2v_shift = gr.Slider(label="Sample Shift", minimum=1.0, maximum=20.0, value=cfg.sample_shift, step=0.1)
                     i2v_seed = gr.Number(label="Seed (-1 for random)", value=-1, precision=0)
 
-                i2v_generate_btn = gr.Button(
-                    "Generate Video",
-                    variant="primary",
-                    size="lg",
-                    interactive=WAN_AVAILABLE
-                )
+                i2v_generate_btn = gr.Button("Generate Video", variant="primary", size="lg")
 
             with gr.Column(scale=1):
                 i2v_output = gr.Video(label="Generated Video", elem_id="output_video")
@@ -529,9 +479,6 @@ with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
     with gr.Tab("Text to Image to Video", id="t2i2v_tab"):
         gr.Markdown("### 🎯 Complete Pipeline: Generate an image from text, then convert it to video")
 
-        if not WAN_AVAILABLE:
-            gr.Markdown("### ⚠️ Note: Video generation is disabled - only image generation will work")
-
         with gr.Row():
             with gr.Column(scale=1):
                 gr.Markdown("#### Step 1: Image Generation Settings")
@@ -607,39 +554,38 @@ with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
     )
 
     # Tab 2: Image-to-Video
-... [old lines 610-615 not rendered in the diff view]
-            outputs=i2v_prompt
-        )
-
-    # Auto-select best size when image is uploaded
-    def handle_image_upload(image):
-        if image is None:
-            return gr.update()
-        pil_image = Image.fromarray(image).convert("RGB")
-        available_sizes = list(SUPPORTED_SIZES[TASK_NAME])
-        best_size = select_best_size_for_image(pil_image, available_sizes)
-        return gr.update(value=best_size)
-
-    i2v_image.upload(
-        fn=handle_image_upload,
-        inputs=[i2v_image],
-        outputs=[i2v_size]
-    )
-
-    i2v_generate_btn.click(
-        fn=generate_video,
-        inputs=[
-            i2v_image, i2v_prompt, i2v_size, i2v_duration,
-            i2v_steps, i2v_guide_scale, i2v_shift, i2v_seed
-        ],
-        outputs=i2v_output
+    # Connect template buttons
+    for name, (btn, template) in template_buttons.items():
+        btn.click(
+            fn=lambda t=template, p=i2v_prompt: apply_template(t, p),
+            inputs=[i2v_prompt],
+            outputs=i2v_prompt
     )
 
+    # Auto-select best size when image is uploaded
+    def handle_image_upload(image):
+        if image is None:
+            return gr.update()
+        pil_image = Image.fromarray(image).convert("RGB")
+        available_sizes = list(SUPPORTED_SIZES[TASK_NAME])
+        best_size = select_best_size_for_image(pil_image, available_sizes)
+        return gr.update(value=best_size)
+
+    i2v_image.upload(
+        fn=handle_image_upload,
+        inputs=[i2v_image],
+        outputs=[i2v_size]
+    )
+
+    i2v_generate_btn.click(
+        fn=generate_video,
+        inputs=[
+            i2v_image, i2v_prompt, i2v_size, i2v_duration,
+            i2v_steps, i2v_guide_scale, i2v_shift, i2v_seed
+        ],
+        outputs=i2v_output
+    )
+
     # Tab 3: Text-to-Image-to-Video
     t2i2v_generate_btn.click(
         fn=generate_image_to_video,
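A note on the template wiring: binding `t=template` as a lambda default argument captures the loop's current value; a bare closure would evaluate `template` late, and every button would apply the last template. A standalone illustration of the pitfall:

    funcs_late = [lambda: i for i in range(3)]
    funcs_bound = [lambda i=i: i for i in range(3)]

    print([f() for f in funcs_late])   # [2, 2, 2] - closures see the final i
    print([f() for f in funcs_bound])  # [0, 1, 2] - defaults freeze each value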
requirements.txt
CHANGED
@@ -1,5 +1,5 @@
-torch
-torchvision
+torch>=2.4.0
+torchvision>=0.19.0
 opencv-python>=4.9.0.80
 diffusers>=0.31.0
 transformers>=4.49.0
@@ -11,12 +11,9 @@ easydict
 ftfy
 dashscope
 imageio-ffmpeg
+https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.0.8/flash_attn-2.7.4.post1+cu126torch2.7-cp310-cp310-linux_x86_64.whl
 numpy>=1.23.5,<2
 compel
 invisible_watermark
 pydantic==2.10.6
-xformers
-gradio
-spaces
-huggingface_hub
-Pillow
+xformers
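One caveat on the prebuilt wheel: its filename pins CPython 3.10, CUDA 12.6, and torch 2.7, so it only installs when the Space's runtime matches, and it implicitly assumes torch 2.7 even though the floor above is torch>=2.4.0. If broader environments matter, the direct-URL requirement can carry a PEP 508 marker, for example:

    flash_attn @ https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.0.8/flash_attn-2.7.4.post1+cu126torch2.7-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" and sys_platform == "linux"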