Files changed (2)
  1. app.py +76 -130
  2. requirements.txt +4 -7
app.py CHANGED
@@ -15,8 +15,10 @@ import gc
 from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
 from compel import Compel, ReturnedEmbeddingsType
 
-# Note: Wan2.2 import will be handled after we ensure the module exists
-# We'll need to handle this carefully since wan is a custom module
+# Import for Wan2.2
+import wan
+from wan.configs import WAN_CONFIGS, SIZE_CONFIGS, MAX_AREA_CONFIGS, SUPPORTED_SIZES
+from wan.utils.utils import cache_video
 
 # --- Global Setup ---
 print("Starting Integrated Text-to-Image-to-Video App...")
@@ -52,66 +54,35 @@ compel = Compel(
 )
 
 # --- 2. Setup Image-to-Video Model (Wan2.2) ---
-# Check if wan module exists, if not, create a mock version for testing
-try:
-    import wan
-    from wan.configs import WAN_CONFIGS, SIZE_CONFIGS, MAX_AREA_CONFIGS, SUPPORTED_SIZES
-    from wan.utils.utils import cache_video
-
-    print("Loading Wan 2.2 TI2V-5B model...")
-
-    # Download model snapshots
-    repo_id = "Wan-AI/Wan2.2-TI2V-5B"
-    print(f"Downloading/loading checkpoints for {repo_id}...")
-    ckpt_dir = snapshot_download(repo_id, local_dir_use_symlinks=False)
-    print(f"Using checkpoints from {ckpt_dir}")
-
-    # Load the model configuration
-    TASK_NAME = 'ti2v-5B'
-    cfg = WAN_CONFIGS[TASK_NAME]
-    FIXED_FPS = 24
-    MIN_FRAMES_MODEL = 8
-    MAX_FRAMES_MODEL = 121
-
-    # Instantiate the pipeline
-    device_id = 0 if torch.cuda.is_available() else -1
-    wan_pipeline = wan.WanTI2V(
-        config=cfg,
-        checkpoint_dir=ckpt_dir,
-        device_id=device_id,
-        rank=0,
-        t5_fsdp=False,
-        dit_fsdp=False,
-        use_sp=False,
-        t5_cpu=False,
-        init_on_cpu=False,
-        convert_model_dtype=True,
-    )
-    WAN_AVAILABLE = True
-    print("Wan model loaded successfully.")
-
-except ImportError as e:
-    print(f"Warning: Wan module not found. Video generation will be disabled. Error: {e}")
-    WAN_AVAILABLE = False
-    # Define default values
-    TASK_NAME = 'ti2v-5B'
-    FIXED_FPS = 24
-    MIN_FRAMES_MODEL = 8
-    MAX_FRAMES_MODEL = 121
-    SUPPORTED_SIZES = {
-        'ti2v-5B': ['704*1280', '576*1024', '512*768', '768*512', '1024*576', '1280*704']
-    }
-    SIZE_CONFIGS = {}
-    MAX_AREA_CONFIGS = {}
-    wan_pipeline = None
-
-    class MockConfig:
-        sample_fps = 24
-        sample_guide_scale = 7.0
-        sample_shift = 3.0
-
-    cfg = MockConfig()
-
+print("Loading Wan 2.2 TI2V-5B model...")
+
+# Download model snapshots
+repo_id = "Wan-AI/Wan2.2-TI2V-5B"
+print(f"Downloading/loading checkpoints for {repo_id}...")
+ckpt_dir = snapshot_download(repo_id, local_dir_use_symlinks=False)
+print(f"Using checkpoints from {ckpt_dir}")
+
+# Load the model configuration
+TASK_NAME = 'ti2v-5B'
+cfg = WAN_CONFIGS[TASK_NAME]
+FIXED_FPS = 24
+MIN_FRAMES_MODEL = 8
+MAX_FRAMES_MODEL = 121
+
+# Instantiate the pipeline
+device_id = 0 if torch.cuda.is_available() else -1
+wan_pipeline = wan.WanTI2V(
+    config=cfg,
+    checkpoint_dir=ckpt_dir,
+    device_id=device_id,
+    rank=0,
+    t5_fsdp=False,
+    dit_fsdp=False,
+    use_sp=False,
+    t5_cpu=False,
+    init_on_cpu=False,
+    convert_model_dtype=True,
+)
 print("All models loaded and ready.")
 
 # --- Constants ---
@@ -259,9 +230,6 @@ def generate_video(
     progress=gr.Progress(track_tqdm=True)
 ):
     """Generate video from image and prompt"""
-    if not WAN_AVAILABLE:
-        raise gr.Error("Video generation is not available. The Wan module is not installed.")
-
     errors = validate_video_inputs(image, prompt, duration_seconds)
     if errors:
         raise gr.Error("\n".join(errors))
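`validate_video_inputs` is called here but is not part of the diff. A plausible sketch, assuming it returns a list of error strings that is empty when the inputs pass; only that return contract is implied by the caller, and the specific checks below are guesses built from the constants above:

```python
# Hypothetical reconstruction of the validator assumed by generate_video().
def validate_video_inputs(image, prompt, duration_seconds,
                          fps=24, min_frames=8, max_frames=121):
    errors = []
    if image is None:
        errors.append("Please provide an input image.")
    if not prompt or not prompt.strip():
        errors.append("Please enter a prompt describing the desired motion.")
    min_s, max_s = min_frames / fps, max_frames / fps  # ~0.33 s to ~5.04 s
    if not (min_s <= duration_seconds <= max_s):
        errors.append(f"Duration must be between {min_s:.2f}s and {max_s:.2f}s.")
    return errors
```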
@@ -305,7 +273,6 @@ def generate_video(
 
     progress(0.9, desc="Saving video...")
 
-    from wan.utils.utils import cache_video
     video_path = cache_video(
         tensor=video_tensor[None],
         save_file=None,
@@ -363,21 +330,17 @@ def generate_image_to_video(
     available_sizes = list(SUPPORTED_SIZES[TASK_NAME])
     best_size = select_best_size_for_image(generated_image, available_sizes)
 
-    if WAN_AVAILABLE:
-        # Then generate video using the generated image
-        video_path = generate_video(
-            generated_image,
-            video_prompt,
-            best_size,  # Use auto-selected size
-            video_duration,
-            video_sampling_steps,
-            video_guide_scale,
-            video_shift,
-            video_seed
-        )
-    else:
-        video_path = None
-        gr.Warning("Video generation skipped - Wan module not available")
+    # Then generate video using the generated image
+    video_path = generate_video(
+        generated_image,
+        video_prompt,
+        best_size,  # Use auto-selected size
+        video_duration,
+        video_sampling_steps,
+        video_guide_scale,
+        video_shift,
+        video_seed
+    )
 
     return generated_image, video_path, used_seed, best_size
 
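`select_best_size_for_image` appears here and in `handle_image_upload` below, but its body is outside the diff. A minimal sketch, assuming it picks the supported "width*height" string (e.g. '704*1280') whose aspect ratio is closest to the input image's:

```python
from PIL import Image

# Sketch under the stated assumption; not the repository's implementation.
def select_best_size_for_image(image: Image.Image, available_sizes: list[str]) -> str:
    target_ratio = image.width / image.height
    def ratio_error(size_str: str) -> float:
        w, h = map(int, size_str.split("*"))
        return abs(w / h - target_ratio)
    return min(available_sizes, key=ratio_error)
```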
@@ -412,7 +375,7 @@ with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
 
     Generate images from text and convert them to high-quality videos using:
     - **Stable Diffusion XL** for Text-to-Image generation
-    - **Wan 2.2 5B** for Image-to-Video generation (if available)
+    - **Wan 2.2 5B** for Image-to-Video generation
 
     ### ✨ Features:
     - 📝 **Text-to-Image**: Generate images from text descriptions
@@ -437,11 +400,6 @@ with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
         """
     )
 
-    if not WAN_AVAILABLE:
-        gr.Markdown("""
-        ⚠️ **Warning**: The Wan video generation module is not available.
-        Only text-to-image generation will work. Video generation features are disabled.
-        """)
 
     with gr.Tabs() as tabs:
         # Tab 1: Text-to-Image
@@ -477,9 +435,6 @@ with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
 
         # Tab 2: Image-to-Video
         with gr.Tab("Image to Video", id="i2v_tab"):
-            if not WAN_AVAILABLE:
-                gr.Markdown("### ⚠️ Video generation is not available - Wan module not installed")
-
             with gr.Row():
                 with gr.Column(scale=1):
                     i2v_image = gr.Image(type="numpy", label="Input Image", elem_id="input_image")
@@ -515,12 +470,7 @@ with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
                     i2v_shift = gr.Slider(label="Sample Shift", minimum=1.0, maximum=20.0, value=cfg.sample_shift, step=0.1)
                     i2v_seed = gr.Number(label="Seed (-1 for random)", value=-1, precision=0)
 
-                i2v_generate_btn = gr.Button(
-                    "Generate Video",
-                    variant="primary",
-                    size="lg",
-                    interactive=WAN_AVAILABLE
-                )
+                i2v_generate_btn = gr.Button("Generate Video", variant="primary", size="lg")
 
             with gr.Column(scale=1):
                 i2v_output = gr.Video(label="Generated Video", elem_id="output_video")
@@ -529,9 +479,6 @@ with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
         with gr.Tab("Text to Image to Video", id="t2i2v_tab"):
            gr.Markdown("### 🎯 Complete Pipeline: Generate an image from text, then convert it to video")
 
-            if not WAN_AVAILABLE:
-                gr.Markdown("### ⚠️ Note: Video generation is disabled - only image generation will work")
-
             with gr.Row():
                 with gr.Column(scale=1):
                     gr.Markdown("#### Step 1: Image Generation Settings")
@@ -607,39 +554,38 @@ with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
     )
 
     # Tab 2: Image-to-Video
-    if WAN_AVAILABLE:
-        # Connect template buttons
-        for name, (btn, template) in template_buttons.items():
-            btn.click(
-                fn=lambda t=template, p=i2v_prompt: apply_template(t, p),
-                inputs=[i2v_prompt],
-                outputs=i2v_prompt
-            )
-
-        # Auto-select best size when image is uploaded
-        def handle_image_upload(image):
-            if image is None:
-                return gr.update()
-            pil_image = Image.fromarray(image).convert("RGB")
-            available_sizes = list(SUPPORTED_SIZES[TASK_NAME])
-            best_size = select_best_size_for_image(pil_image, available_sizes)
-            return gr.update(value=best_size)
-
-        i2v_image.upload(
-            fn=handle_image_upload,
-            inputs=[i2v_image],
-            outputs=[i2v_size]
-        )
-
-        i2v_generate_btn.click(
-            fn=generate_video,
-            inputs=[
-                i2v_image, i2v_prompt, i2v_size, i2v_duration,
-                i2v_steps, i2v_guide_scale, i2v_shift, i2v_seed
-            ],
-            outputs=i2v_output
-        )
+    # Connect template buttons
+    for name, (btn, template) in template_buttons.items():
+        btn.click(
+            fn=lambda t=template, p=i2v_prompt: apply_template(t, p),
+            inputs=[i2v_prompt],
+            outputs=i2v_prompt
+        )
+
+    # Auto-select best size when image is uploaded
+    def handle_image_upload(image):
+        if image is None:
+            return gr.update()
+        pil_image = Image.fromarray(image).convert("RGB")
+        available_sizes = list(SUPPORTED_SIZES[TASK_NAME])
+        best_size = select_best_size_for_image(pil_image, available_sizes)
+        return gr.update(value=best_size)
+
+    i2v_image.upload(
+        fn=handle_image_upload,
+        inputs=[i2v_image],
+        outputs=[i2v_size]
+    )
+
+    i2v_generate_btn.click(
+        fn=generate_video,
+        inputs=[
+            i2v_image, i2v_prompt, i2v_size, i2v_duration,
+            i2v_steps, i2v_guide_scale, i2v_shift, i2v_seed
+        ],
+        outputs=i2v_output
+    )
 
     # Tab 3: Text-to-Image-to-Video
     t2i2v_generate_btn.click(
         fn=generate_image_to_video,
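One detail worth keeping in the dedented wiring above: `fn=lambda t=template, p=i2v_prompt: apply_template(t, p)` binds each button's template through a default argument. Without that, every lambda would close over the loop variable and all buttons would fire with the last template. A self-contained illustration of the difference:

```python
# Late binding: both lambdas see the final value of `name`.
funcs_late = [lambda: name for name in ["zoom", "pan"]]
# Default-argument binding: each lambda captures the current value.
funcs_bound = [lambda n=name: n for name in ["zoom", "pan"]]

print([f() for f in funcs_late])   # ['pan', 'pan']
print([f() for f in funcs_bound])  # ['zoom', 'pan']
```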
 
requirements.txt CHANGED
@@ -1,5 +1,5 @@
-torch==2.4.0
-torchvision==0.19.0
+torch>=2.4.0
+torchvision>=0.19.0
 opencv-python>=4.9.0.80
 diffusers>=0.31.0
 transformers>=4.49.0
@@ -11,12 +11,9 @@ easydict
 ftfy
 dashscope
 imageio-ffmpeg
+https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.0.8/flash_attn-2.7.4.post1+cu126torch2.7-cp310-cp310-linux_x86_64.whl
 numpy>=1.23.5,<2
 compel
 invisible_watermark
 pydantic==2.10.6
-xformers
-gradio
-spaces
-huggingface_hub
-Pillow
+xformers
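A compatibility note on the pinned wheel: its filename encodes the build it was compiled for (CUDA 12.6, PyTorch 2.7, CPython 3.10, Linux x86_64). pip enforces the `cp310`/`linux_x86_64` tags, but `cu126torch2.7` is only part of the version string, so the wheel will install alongside any torch satisfying `torch>=2.4.0` and may then fail to import at runtime if the resolved torch is not 2.7. Pinning torch to the matching version, or choosing a wheel built for the deployed torch/CUDA combination, would be safer.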