Files changed (2)
  1. app.py +76 -130
  2. requirements.txt +4 -7
app.py CHANGED
@@ -15,8 +15,10 @@ import gc
 from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
 from compel import Compel, ReturnedEmbeddingsType
 
-# Note: Wan2.2 import will be handled after we ensure the module exists
-# We'll need to handle this carefully since wan is a custom module
+# Import for Wan2.2
+import wan
+from wan.configs import WAN_CONFIGS, SIZE_CONFIGS, MAX_AREA_CONFIGS, SUPPORTED_SIZES
+from wan.utils.utils import cache_video
 
 # --- Global Setup ---
 print("Starting Integrated Text-to-Image-to-Video App...")
@@ -52,66 +54,35 @@ compel = Compel(
 )
 
 # --- 2. Setup Image-to-Video Model (Wan2.2) ---
-# Check if wan module exists, if not, create a mock version for testing
-try:
-    import wan
-    from wan.configs import WAN_CONFIGS, SIZE_CONFIGS, MAX_AREA_CONFIGS, SUPPORTED_SIZES
-    from wan.utils.utils import cache_video
-
-    print("Loading Wan 2.2 TI2V-5B model...")
-
-    # Download model snapshots
-    repo_id = "Wan-AI/Wan2.2-TI2V-5B"
-    print(f"Downloading/loading checkpoints for {repo_id}...")
-    ckpt_dir = snapshot_download(repo_id, local_dir_use_symlinks=False)
-    print(f"Using checkpoints from {ckpt_dir}")
-
-    # Load the model configuration
-    TASK_NAME = 'ti2v-5B'
-    cfg = WAN_CONFIGS[TASK_NAME]
-    FIXED_FPS = 24
-    MIN_FRAMES_MODEL = 8
-    MAX_FRAMES_MODEL = 121
-
-    # Instantiate the pipeline
-    device_id = 0 if torch.cuda.is_available() else -1
-    wan_pipeline = wan.WanTI2V(
-        config=cfg,
-        checkpoint_dir=ckpt_dir,
-        device_id=device_id,
-        rank=0,
-        t5_fsdp=False,
-        dit_fsdp=False,
-        use_sp=False,
-        t5_cpu=False,
-        init_on_cpu=False,
-        convert_model_dtype=True,
-    )
-    WAN_AVAILABLE = True
-    print("Wan model loaded successfully.")
-
-except ImportError as e:
-    print(f"Warning: Wan module not found. Video generation will be disabled. Error: {e}")
-    WAN_AVAILABLE = False
-    # Define default values
-    TASK_NAME = 'ti2v-5B'
-    FIXED_FPS = 24
-    MIN_FRAMES_MODEL = 8
-    MAX_FRAMES_MODEL = 121
-    SUPPORTED_SIZES = {
-        'ti2v-5B': ['704*1280', '576*1024', '512*768', '768*512', '1024*576', '1280*704']
-    }
-    SIZE_CONFIGS = {}
-    MAX_AREA_CONFIGS = {}
-    wan_pipeline = None
-
-    class MockConfig:
-        sample_fps = 24
-        sample_guide_scale = 7.0
-        sample_shift = 3.0
-
-    cfg = MockConfig()
-
+print("Loading Wan 2.2 TI2V-5B model...")
+
+# Download model snapshots
+repo_id = "Wan-AI/Wan2.2-TI2V-5B"
+print(f"Downloading/loading checkpoints for {repo_id}...")
+ckpt_dir = snapshot_download(repo_id, local_dir_use_symlinks=False)
+print(f"Using checkpoints from {ckpt_dir}")
+
+# Load the model configuration
+TASK_NAME = 'ti2v-5B'
+cfg = WAN_CONFIGS[TASK_NAME]
+FIXED_FPS = 24
+MIN_FRAMES_MODEL = 8
+MAX_FRAMES_MODEL = 121
+
+# Instantiate the pipeline
+device_id = 0 if torch.cuda.is_available() else -1
+wan_pipeline = wan.WanTI2V(
+    config=cfg,
+    checkpoint_dir=ckpt_dir,
+    device_id=device_id,
+    rank=0,
+    t5_fsdp=False,
+    dit_fsdp=False,
+    use_sp=False,
+    t5_cpu=False,
+    init_on_cpu=False,
+    convert_model_dtype=True,
+)
 print("All models loaded and ready.")
 
 # --- Constants ---
@@ -259,9 +230,6 @@ def generate_video(
     progress=gr.Progress(track_tqdm=True)
 ):
     """Generate video from image and prompt"""
-    if not WAN_AVAILABLE:
-        raise gr.Error("Video generation is not available. The Wan module is not installed.")
-
     errors = validate_video_inputs(image, prompt, duration_seconds)
     if errors:
         raise gr.Error("\n".join(errors))
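`validate_video_inputs` is called here but is not part of the diff. A plausible sketch, assuming it returns a list of error strings that is empty when the inputs pass; only that return contract is implied by the caller, and the specific checks below are guesses built from the constants above:

```python
# Hypothetical reconstruction of the validator assumed by generate_video().
def validate_video_inputs(image, prompt, duration_seconds,
                          fps=24, min_frames=8, max_frames=121):
    errors = []
    if image is None:
        errors.append("Please provide an input image.")
    if not prompt or not prompt.strip():
        errors.append("Please enter a prompt describing the desired motion.")
    min_s, max_s = min_frames / fps, max_frames / fps  # ~0.33 s to ~5.04 s
    if not (min_s <= duration_seconds <= max_s):
        errors.append(f"Duration must be between {min_s:.2f}s and {max_s:.2f}s.")
    return errors
```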
@@ -305,7 +273,6 @@ def generate_video(
 
     progress(0.9, desc="Saving video...")
 
-    from wan.utils.utils import cache_video
     video_path = cache_video(
         tensor=video_tensor[None],
         save_file=None,
@@ -363,21 +330,17 @@ def generate_image_to_video(
     available_sizes = list(SUPPORTED_SIZES[TASK_NAME])
     best_size = select_best_size_for_image(generated_image, available_sizes)
 
-    if WAN_AVAILABLE:
-        # Then generate video using the generated image
-        video_path = generate_video(
-            generated_image,
-            video_prompt,
-            best_size,  # Use auto-selected size
-            video_duration,
-            video_sampling_steps,
-            video_guide_scale,
-            video_shift,
-            video_seed
-        )
-    else:
-        video_path = None
-        gr.Warning("Video generation skipped - Wan module not available")
+    # Then generate video using the generated image
+    video_path = generate_video(
+        generated_image,
+        video_prompt,
+        best_size,  # Use auto-selected size
+        video_duration,
+        video_sampling_steps,
+        video_guide_scale,
+        video_shift,
+        video_seed
+    )
 
     return generated_image, video_path, used_seed, best_size
 
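`select_best_size_for_image` appears here and in `handle_image_upload` below, but its body is outside the diff. A minimal sketch, assuming it picks the supported "width*height" string (e.g. '704*1280') whose aspect ratio is closest to the input image's:

```python
from PIL import Image

# Sketch under the stated assumption; not the repository's implementation.
def select_best_size_for_image(image: Image.Image, available_sizes: list[str]) -> str:
    target_ratio = image.width / image.height
    def ratio_error(size_str: str) -> float:
        w, h = map(int, size_str.split("*"))
        return abs(w / h - target_ratio)
    return min(available_sizes, key=ratio_error)
```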
@@ -412,7 +375,7 @@ with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
 
     Generate images from text and convert them to high-quality videos using:
     - **Stable Diffusion XL** for Text-to-Image generation
-    - **Wan 2.2 5B** for Image-to-Video generation (if available)
+    - **Wan 2.2 5B** for Image-to-Video generation
 
     ### ✨ Features:
     - 📝 **Text-to-Image**: Generate images from text descriptions
@@ -437,11 +400,6 @@ with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
         """
     )
 
-    if not WAN_AVAILABLE:
-        gr.Markdown("""
-        ⚠️ **Warning**: The Wan video generation module is not available.
-        Only text-to-image generation will work. Video generation features are disabled.
-        """)
 
     with gr.Tabs() as tabs:
         # Tab 1: Text-to-Image
@@ -477,9 +435,6 @@ with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
 
         # Tab 2: Image-to-Video
         with gr.Tab("Image to Video", id="i2v_tab"):
-            if not WAN_AVAILABLE:
-                gr.Markdown("### ⚠️ Video generation is not available - Wan module not installed")
-
             with gr.Row():
                 with gr.Column(scale=1):
                     i2v_image = gr.Image(type="numpy", label="Input Image", elem_id="input_image")
@@ -515,12 +470,7 @@ with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
                     i2v_shift = gr.Slider(label="Sample Shift", minimum=1.0, maximum=20.0, value=cfg.sample_shift, step=0.1)
                     i2v_seed = gr.Number(label="Seed (-1 for random)", value=-1, precision=0)
 
-                i2v_generate_btn = gr.Button(
-                    "Generate Video",
-                    variant="primary",
-                    size="lg",
-                    interactive=WAN_AVAILABLE
-                )
+                i2v_generate_btn = gr.Button("Generate Video", variant="primary", size="lg")
 
             with gr.Column(scale=1):
                 i2v_output = gr.Video(label="Generated Video", elem_id="output_video")
@@ -529,9 +479,6 @@ with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
         with gr.Tab("Text to Image to Video", id="t2i2v_tab"):
            gr.Markdown("### 🎯 Complete Pipeline: Generate an image from text, then convert it to video")
 
-            if not WAN_AVAILABLE:
-                gr.Markdown("### ⚠️ Note: Video generation is disabled - only image generation will work")
-
             with gr.Row():
                 with gr.Column(scale=1):
                     gr.Markdown("#### Step 1: Image Generation Settings")
@@ -607,39 +554,38 @@ with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
     )
 
     # Tab 2: Image-to-Video
-    if WAN_AVAILABLE:
-        # Connect template buttons
-        for name, (btn, template) in template_buttons.items():
-            btn.click(
-                fn=lambda t=template, p=i2v_prompt: apply_template(t, p),
-                inputs=[i2v_prompt],
-                outputs=i2v_prompt
-            )
-
-        # Auto-select best size when image is uploaded
-        def handle_image_upload(image):
-            if image is None:
-                return gr.update()
-            pil_image = Image.fromarray(image).convert("RGB")
-            available_sizes = list(SUPPORTED_SIZES[TASK_NAME])
-            best_size = select_best_size_for_image(pil_image, available_sizes)
-            return gr.update(value=best_size)
-
-        i2v_image.upload(
-            fn=handle_image_upload,
-            inputs=[i2v_image],
-            outputs=[i2v_size]
-        )
-
-        i2v_generate_btn.click(
-            fn=generate_video,
-            inputs=[
-                i2v_image, i2v_prompt, i2v_size, i2v_duration,
-                i2v_steps, i2v_guide_scale, i2v_shift, i2v_seed
-            ],
-            outputs=i2v_output
-        )
+    # Connect template buttons
+    for name, (btn, template) in template_buttons.items():
+        btn.click(
+            fn=lambda t=template, p=i2v_prompt: apply_template(t, p),
+            inputs=[i2v_prompt],
+            outputs=i2v_prompt
+        )
+
+    # Auto-select best size when image is uploaded
+    def handle_image_upload(image):
+        if image is None:
+            return gr.update()
+        pil_image = Image.fromarray(image).convert("RGB")
+        available_sizes = list(SUPPORTED_SIZES[TASK_NAME])
+        best_size = select_best_size_for_image(pil_image, available_sizes)
+        return gr.update(value=best_size)
+
+    i2v_image.upload(
+        fn=handle_image_upload,
+        inputs=[i2v_image],
+        outputs=[i2v_size]
+    )
+
+    i2v_generate_btn.click(
+        fn=generate_video,
+        inputs=[
+            i2v_image, i2v_prompt, i2v_size, i2v_duration,
+            i2v_steps, i2v_guide_scale, i2v_shift, i2v_seed
+        ],
+        outputs=i2v_output
+    )
 
     # Tab 3: Text-to-Image-to-Video
     t2i2v_generate_btn.click(
         fn=generate_image_to_video,
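One detail worth keeping in the dedented wiring above: `fn=lambda t=template, p=i2v_prompt: apply_template(t, p)` binds each button's template through a default argument. Without that, every lambda would close over the loop variable and all buttons would fire with the last template. A self-contained illustration of the difference:

```python
# Late binding: both lambdas see the final value of `name`.
funcs_late = [lambda: name for name in ["zoom", "pan"]]
# Default-argument binding: each lambda captures the current value.
funcs_bound = [lambda n=name: n for name in ["zoom", "pan"]]

print([f() for f in funcs_late])   # ['pan', 'pan']
print([f() for f in funcs_bound])  # ['zoom', 'pan']
```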
 
requirements.txt CHANGED
@@ -1,5 +1,5 @@
-torch==2.4.0
-torchvision==0.19.0
+torch>=2.4.0
+torchvision>=0.19.0
 opencv-python>=4.9.0.80
 diffusers>=0.31.0
 transformers>=4.49.0
@@ -11,12 +11,9 @@ easydict
 ftfy
 dashscope
 imageio-ffmpeg
+https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.0.8/flash_attn-2.7.4.post1+cu126torch2.7-cp310-cp310-linux_x86_64.whl
 numpy>=1.23.5,<2
 compel
 invisible_watermark
 pydantic==2.10.6
-xformers
-gradio
-spaces
-huggingface_hub
-Pillow
+xformers
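A compatibility note on the pinned wheel: its filename encodes the build it was compiled for (CUDA 12.6, PyTorch 2.7, CPython 3.10, Linux x86_64). pip enforces the `cp310`/`linux_x86_64` tags, but `cu126torch2.7` is only part of the version string, so the wheel will install alongside any torch satisfying `torch>=2.4.0` and may then fail to import at runtime if the resolved torch is not 2.7. Pinning torch to the matching version, or choosing a wheel built for the deployed torch/CUDA combination, would be safer.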