Update app_t2v.py
app_t2v.py  +584 -176  CHANGED
@@ -1,214 +1,622 @@
- # PyTorch 2.8 (temporary hack)
  import os
- # Actual demo code
- import spaces
- import torch
- from diffusers.pipelines.wan.pipeline_wan_i2v import WanImageToVideoPipeline
- from diffusers.models.transformers.transformer_wan import WanTransformer3DModel
- from diffusers.utils.export_utils import export_to_video
  import gradio as gr
- import …
  from PIL import Image
  import random
  FIXED_FPS = 24
- MIN_FRAMES_MODEL = …
  MAX_FRAMES_MODEL = 121
- NUM_FRAMES_DEFAULT = 81
- pipe = WanImageToVideoPipeline.from_pretrained(MODEL_ID,
-     transformer=WanTransformer3DModel.from_pretrained('cbensimon/Wan2.2-I2V-A14B-bf16-Diffusers',
-         subfolder='transformer',
-         torch_dtype=torch.bfloat16,
-         device_map='cuda',
-     ),
-     transformer_2=WanTransformer3DModel.from_pretrained('cbensimon/Wan2.2-I2V-A14B-bf16-Diffusers',
-         subfolder='transformer_2',
-         torch_dtype=torch.bfloat16,
-         device_map='cuda',
-     ),
-     torch_dtype=torch.bfloat16,
- ).to('cuda')
- optimize_pipeline_(pipe,
-     image=Image.new('RGB', (LANDSCAPE_WIDTH, LANDSCAPE_HEIGHT)),
-     prompt='prompt',
-     height=LANDSCAPE_HEIGHT,
-     width=LANDSCAPE_WIDTH,
-     num_frames=MAX_FRAMES_MODEL,
- )
- def …
-     return …
- def …
-     if …
  prompt,
  negative_prompt,
- num_frames,
- guidance_scale,
- steps,
  seed,
  randomize_seed,
  ):
  def generate_video(
  prompt,
- progress=gr.Progress(track_tqdm=True)
  ):
- """…
- with gr.…
  gr.Examples(
- examples=[
-     […
-     ],
  ],
- inputs=[…
  )
  if __name__ == "__main__":
- demo.…
import os
import sys
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

import gradio as gr
import torch
from huggingface_hub import snapshot_download
from PIL import Image
import random
import numpy as np
import spaces
import gc

# Import for Stable Diffusion XL
from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
from compel import Compel, ReturnedEmbeddingsType

# Import for Wan2.2
import wan
from wan.configs import WAN_CONFIGS, SIZE_CONFIGS, MAX_AREA_CONFIGS, SUPPORTED_SIZES
from wan.utils.utils import cache_video

# --- Global Setup ---
print("Starting Integrated Text-to-Image-to-Video App...")

# --- 1. Setup Text-to-Image Model (SDXL) ---
print("Loading Stable Diffusion XL model...")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize SDXL pipeline
sdxl_pipe = StableDiffusionXLPipeline.from_pretrained(
    "votepurchase/pornmasterPro_noobV3VAE",
    torch_dtype=torch.float16,
    variant="fp16",
    use_safetensors=True
)

sdxl_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(sdxl_pipe.scheduler.config)
sdxl_pipe.to(device)

# Force all components to use the same dtype
sdxl_pipe.text_encoder.to(torch.float16)
sdxl_pipe.text_encoder_2.to(torch.float16)
sdxl_pipe.vae.to(torch.float16)
sdxl_pipe.unet.to(torch.float16)

# Initialize Compel for long prompt processing
compel = Compel(
    tokenizer=[sdxl_pipe.tokenizer, sdxl_pipe.tokenizer_2],
    text_encoder=[sdxl_pipe.text_encoder, sdxl_pipe.text_encoder_2],
    returned_embeddings_type=ReturnedEmbeddingsType.PENULTIMATE_HIDDEN_STATES_NON_NORMALIZED,
    requires_pooled=[False, True],
    truncate_long_prompts=False
)
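# Note: Compel parses attention-weighting syntax (e.g. "a cat++") and, with
# truncate_long_prompts=False, handles prompts beyond CLIP's 77-token window.
# Because requires_pooled=[False, True], a batched call such as
# compel([prompt, negative_prompt]) returns (conditioning, pooled) tensors
# whose rows 0 and 1 hold the positive and negative embeddings, which is why
# generate_image() below slices conditioning[0:1] / conditioning[1:2].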
# --- 2. Setup Image-to-Video Model (Wan2.2) ---
print("Loading Wan 2.2 TI2V-5B model...")

# Download model snapshots
repo_id = "Wan-AI/Wan2.2-TI2V-5B"
print(f"Downloading/loading checkpoints for {repo_id}...")
ckpt_dir = snapshot_download(repo_id, local_dir_use_symlinks=False)
print(f"Using checkpoints from {ckpt_dir}")

# Load the model configuration
TASK_NAME = 'ti2v-5B'
cfg = WAN_CONFIGS[TASK_NAME]
FIXED_FPS = 24
MIN_FRAMES_MODEL = 8
MAX_FRAMES_MODEL = 121

# Instantiate the pipeline
device_id = 0 if torch.cuda.is_available() else -1
wan_pipeline = wan.WanTI2V(
    config=cfg,
    checkpoint_dir=ckpt_dir,
    device_id=device_id,
    rank=0,
    t5_fsdp=False,
    dit_fsdp=False,
    use_sp=False,
    t5_cpu=False,
    init_on_cpu=False,
    convert_model_dtype=True,
)
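# The t5_fsdp / dit_fsdp / use_sp flags presumably toggle FSDP sharding and
# sequence parallelism in the upstream wan package; all are disabled here
# since this Space runs the 5B checkpoint on a single GPU.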
print("All models loaded and ready.")

# --- Constants ---
MAX_SEED = np.iinfo(np.int32).max
MAX_IMAGE_SIZE = 1216

# --- Helper Functions ---
def clear_gpu_memory():
    """Clear GPU memory more thoroughly"""
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()
    gc.collect()

def process_long_prompt(prompt, negative_prompt=""):
    """Simple long prompt processing using Compel"""
    try:
        conditioning, pooled = compel([prompt, negative_prompt])
        return conditioning, pooled
    except Exception as e:
        print(f"Long prompt processing failed: {e}, falling back to standard processing")
        return None, None

def select_best_size_for_image(image, available_sizes):
    """Select the size option with aspect ratio closest to the input image.

    Size options are "height*width" strings. Worked example: a 1024x768 image
    has aspect ratio 768/1024 = 0.75, so "704*1280" (0.55) would be preferred
    over a hypothetical "1280*704" (1.82).
    """
    if image is None:
        return available_sizes[0]

    img_width, img_height = image.size
    img_aspect_ratio = img_height / img_width

    best_size = available_sizes[0]
    best_diff = float('inf')

    for size_str in available_sizes:
        height, width = map(int, size_str.split('*'))
        size_aspect_ratio = height / width
        diff = abs(img_aspect_ratio - size_aspect_ratio)

        if diff < best_diff:
            best_diff = diff
            best_size = size_str

    return best_size

def validate_video_inputs(image, prompt, duration_seconds):
    """Validate user inputs for video generation"""
    errors = []

    if not prompt or len(prompt.strip()) < 5:
        errors.append("Prompt must be at least 5 characters long.")

    if image is not None:
        if isinstance(image, np.ndarray):
            img = Image.fromarray(image)
        else:
            img = image
        if img.size[0] * img.size[1] > 4096 * 4096:
            errors.append("Image size is too large (maximum 4096x4096).")

    if duration_seconds > 5.0 and image is None:
        errors.append("Videos longer than 5 seconds require an input image.")

    return errors

# --- Text-to-Image Generation Function ---
@spaces.GPU(duration=30)
def generate_image(
    prompt,
    negative_prompt,
    seed,
    randomize_seed,
    width,
    height,
    guidance_scale,
    num_inference_steps,
    progress=gr.Progress(track_tqdm=True)
):
    """Generate image from text prompt"""
    progress(0, desc="Initializing image generation...")

    # Heuristic: route prompts roughly past the CLIP token limit through Compel
    use_long_prompt = len(prompt.split()) > 60 or len(prompt) > 300

    if randomize_seed:
        seed = random.randint(0, MAX_SEED)

    generator = torch.Generator(device=device).manual_seed(seed)

    try:
        progress(0.3, desc="Processing prompt...")

        if use_long_prompt:
            print("Using long prompt processing...")
            conditioning, pooled = process_long_prompt(prompt, negative_prompt)

            if conditioning is not None:
                progress(0.5, desc="Generating image...")
                # Row 0 of the batched embeddings is the prompt, row 1 the negative prompt
                output_image = sdxl_pipe(
                    prompt_embeds=conditioning[0:1],
                    pooled_prompt_embeds=pooled[0:1],
                    negative_prompt_embeds=conditioning[1:2],
                    negative_pooled_prompt_embeds=pooled[1:2],
                    guidance_scale=guidance_scale,
                    num_inference_steps=num_inference_steps,
                    width=width,
                    height=height,
                    generator=generator
                ).images[0]
                progress(1.0, desc="Complete!")
                return output_image, seed

        # Fall back to standard processing
        progress(0.5, desc="Generating image...")
        output_image = sdxl_pipe(
            prompt=prompt,
            negative_prompt=negative_prompt,
            guidance_scale=guidance_scale,
            num_inference_steps=num_inference_steps,
            width=width,
            height=height,
            generator=generator
        ).images[0]

        progress(1.0, desc="Complete!")
        return output_image, seed

    except RuntimeError as e:
        print(f"Error during generation: {e}")
        error_img = Image.new('RGB', (width, height), color=(0, 0, 0))
        return error_img, seed
    finally:
        clear_gpu_memory()

# --- Image-to-Video Generation Function ---
def get_video_duration(image, prompt, size, duration_seconds, sampling_steps, guide_scale, shift, seed, progress):
    """Calculate dynamic GPU duration for video generation"""
    if sampling_steps > 35 and duration_seconds >= 2:
        return 120
    elif sampling_steps < 35 or duration_seconds < 2:
        return 105
    else:
        return 90
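# Note: spaces.GPU also accepts a callable for `duration`; ZeroGPU invokes it
# with the same arguments as the decorated function below to size the GPU
# reservation, e.g. sampling_steps=40 for a 3 s clip reserves 120 s, while a
# lighter sampling_steps=20 run reserves 105 s.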
@spaces.GPU(duration=get_video_duration)
def generate_video(
    image,
    prompt,
    size,
    duration_seconds,
    sampling_steps,
    guide_scale,
    shift,
    seed,
    progress=gr.Progress(track_tqdm=True)
):
    """Generate video from image and prompt"""
    errors = validate_video_inputs(image, prompt, duration_seconds)
    if errors:
        raise gr.Error("\n".join(errors))

    progress(0, desc="Setting up video generation...")

    if seed == -1:
        seed = random.randint(0, sys.maxsize)

    progress(0.1, desc="Processing image...")

    input_image = None
    if image is not None:
        if isinstance(image, np.ndarray):
            input_image = Image.fromarray(image).convert("RGB")
        else:
            input_image = image.convert("RGB")
        # Resize image to match selected size
        target_height, target_width = map(int, size.split('*'))
        input_image = input_image.resize((target_width, target_height))

    # Calculate number of frames based on duration
    num_frames = np.clip(int(round(duration_seconds * FIXED_FPS)), MIN_FRAMES_MODEL, MAX_FRAMES_MODEL)
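    # Worked example: duration_seconds=2.0 at FIXED_FPS=24 gives round(48.0) = 48
    # frames, already inside [8, 121]; requests beyond 121/24 ≈ 5.04 s are
    # clipped to MAX_FRAMES_MODEL = 121 frames.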
    progress(0.2, desc="Generating video...")

    try:
        video_tensor = wan_pipeline.generate(
            input_prompt=prompt,
            img=input_image,
            size=SIZE_CONFIGS[size],
            max_area=MAX_AREA_CONFIGS[size],
            frame_num=num_frames,
            shift=shift,
            sample_solver='unipc',
            sampling_steps=int(sampling_steps),
            guide_scale=guide_scale,
            seed=seed,
            offload_model=True
        )

        progress(0.9, desc="Saving video...")

        video_path = cache_video(
            tensor=video_tensor[None],
            save_file=None,
            fps=cfg.sample_fps,
            normalize=True,
            value_range=(-1, 1)
        )

        progress(1.0, desc="Complete!")

    except torch.cuda.OutOfMemoryError:
        clear_gpu_memory()
        raise gr.Error("GPU out of memory. Please try with lower settings.")
    except Exception as e:
        raise gr.Error(f"Video generation failed: {str(e)}")
    finally:
        if 'video_tensor' in locals():
            del video_tensor
        clear_gpu_memory()

    return video_path

# --- Combined Generation Function ---
def generate_image_to_video(
    img_prompt,
    img_negative_prompt,
    img_seed,
    img_randomize_seed,
    img_width,
    img_height,
    img_guidance_scale,
    img_num_inference_steps,
    video_prompt,
    video_size,
    video_duration,
    video_sampling_steps,
    video_guide_scale,
    video_shift,
    video_seed
):
    """Generate image from text, then use it to generate video"""
    # First generate image
    generated_image, used_seed = generate_image(
        img_prompt,
        img_negative_prompt,
        img_seed,
        img_randomize_seed,
        img_width,
        img_height,
        img_guidance_scale,
        img_num_inference_steps
    )

    # Update the best video size based on generated image
    available_sizes = list(SUPPORTED_SIZES[TASK_NAME])
    best_size = select_best_size_for_image(generated_image, available_sizes)

    # Then generate video using the generated image
    video_path = generate_video(
        generated_image,
        video_prompt,
        best_size,  # Use auto-selected size
        video_duration,
        video_sampling_steps,
        video_guide_scale,
        video_shift,
        video_seed
    )

    return generated_image, video_path, used_seed, best_size

# --- Gradio Interface ---
css = """
.gradio-container {max-width: 1400px !important; margin: 0 auto}
#output_video {height: 500px;}
#input_image {height: 400px;}
#generated_image {height: 400px;}
.tab-nav button {font-size: 18px !important; padding: 10px 20px !important;}
"""

# Prompt templates
video_templates = {
    "Cinematic": "cinematic shot of {subject}, professional lighting, smooth camera movement, 4k quality",
    "Animation": "animated style {subject}, vibrant colors, fluid motion, dynamic movement",
    "Nature": "nature documentary footage of {subject}, wildlife photography, natural movement",
    "Slow Motion": "slow motion capture of {subject}, high speed camera, detailed motion",
    "Action": "dynamic action shot of {subject}, fast paced movement, energetic motion"
}

def apply_template(template, current_prompt):
    """Apply prompt template"""
    if "{subject}" in template:
        subject = current_prompt.split(",")[0] if "," in current_prompt else current_prompt
        return template.replace("{subject}", subject)
    return template + " " + current_prompt
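# For illustration: apply_template(video_templates["Cinematic"], "a lion, golden savannah")
# returns "cinematic shot of a lion, professional lighting, smooth camera movement,
# 4k quality"; the text before the first comma is substituted for {subject}.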
with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🎨 Integrated Text-to-Image-to-Video Generator

    Generate images from text and convert them to high-quality videos using:
    - **Stable Diffusion XL** for Text-to-Image generation
    - **Wan 2.2 5B** for Image-to-Video generation

    ### ✨ Features:
    - 📝 **Text-to-Image**: Generate images from text descriptions
    - 🎬 **Image-to-Video**: Convert images (uploaded or generated) to videos
    - 🔄 **Text-to-Image-to-Video**: Complete pipeline from text to video
    """)

    # Badge section
    gr.HTML(
        """
        <div style="display: flex; justify-content: center; align-items: center; gap: 20px; margin: 20px 0;">
            <a href="https://huggingface.co/spaces/Heartsync/Wan-2.2-ADULT" target="_blank">
                <img src="https://img.shields.io/static/v1?label=T2I%20%26%20TI2V&message=Wan-2.2-ADULT&color=%230000ff&labelColor=%23800080&logo=huggingface&logoColor=white&style=for-the-badge" alt="badge">
            </a>
            <a href="https://huggingface.co/spaces/Heartsync/PornHUB" target="_blank">
                <img src="https://img.shields.io/static/v1?label=T2I%20&message=PornHUB&color=%230000ff&labelColor=%23800080&logo=huggingface&logoColor=white&style=for-the-badge" alt="badge">
            </a>
            <a href="https://huggingface.co/spaces/Heartsync/Hentai-Adult" target="_blank">
                <img src="https://img.shields.io/static/v1?label=T2I%20&message=Hentai-Adult&color=%230000ff&labelColor=%23800080&logo=huggingface&logoColor=white&style=for-the-badge" alt="badge">
            </a>
        </div>
        """
    )

    with gr.Tabs() as tabs:
        # Tab 1: Text-to-Image
        with gr.Tab("Text to Image", id="t2i_tab"):
            with gr.Row():
                with gr.Column(scale=1):
                    t2i_prompt = gr.Textbox(
                        label="Prompt",
                        placeholder="Describe the image you want to generate...",
                        lines=3
                    )
                    t2i_negative_prompt = gr.Textbox(
                        label="Negative Prompt",
                        value="nsfw, (low quality, worst quality:1.2), very displeasing, 3d, watermark, signature, ugly, poorly drawn",
                        lines=2
                    )

                    with gr.Row():
                        t2i_width = gr.Slider(label="Width", minimum=256, maximum=MAX_IMAGE_SIZE, step=32, value=1024)
                        t2i_height = gr.Slider(label="Height", minimum=256, maximum=MAX_IMAGE_SIZE, step=32, value=1024)

                    with gr.Accordion("Advanced Settings", open=False):
                        t2i_seed = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=0)
                        t2i_randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
                        t2i_guidance_scale = gr.Slider(label="Guidance Scale", minimum=0.0, maximum=20.0, step=0.1, value=7)
                        t2i_num_steps = gr.Slider(label="Inference Steps", minimum=1, maximum=50, step=1, value=28)

                    t2i_generate_btn = gr.Button("Generate Image", variant="primary", size="lg")

                with gr.Column(scale=1):
                    t2i_output = gr.Image(label="Generated Image", elem_id="generated_image")
                    t2i_seed_output = gr.Number(label="Used Seed", interactive=False)

        # Tab 2: Image-to-Video
        with gr.Tab("Image to Video", id="i2v_tab"):
            with gr.Row():
                with gr.Column(scale=1):
                    i2v_image = gr.Image(type="numpy", label="Input Image", elem_id="input_image")
                    i2v_prompt = gr.Textbox(
                        label="Video Prompt",
                        value="Generate a video with smooth and natural movement. Objects should have visible motion while maintaining fluid transitions.",
                        lines=3
                    )

                    with gr.Accordion("Prompt Templates", open=False):
                        gr.Markdown("Click a template to apply it to your prompt:")
                        template_buttons = {}
                        for name, template in video_templates.items():
                            btn = gr.Button(name, size="sm")
                            template_buttons[name] = (btn, template)

                    i2v_duration = gr.Slider(
                        label="Duration (seconds)",
                        minimum=round(MIN_FRAMES_MODEL/FIXED_FPS, 1),
                        maximum=round(MAX_FRAMES_MODEL/FIXED_FPS, 1),
                        step=0.1,
                        value=2.0
                    )
                    i2v_size = gr.Dropdown(
                        label="Output Resolution",
                        choices=list(SUPPORTED_SIZES[TASK_NAME]),
                        value="704*1280"
                    )

                    with gr.Accordion("Advanced Settings", open=False):
                        i2v_steps = gr.Slider(label="Sampling Steps", minimum=10, maximum=50, value=38, step=1)
                        i2v_guide_scale = gr.Slider(label="Guidance Scale", minimum=1.0, maximum=10.0, value=cfg.sample_guide_scale, step=0.1)
                        i2v_shift = gr.Slider(label="Sample Shift", minimum=1.0, maximum=20.0, value=cfg.sample_shift, step=0.1)
                        i2v_seed = gr.Number(label="Seed (-1 for random)", value=-1, precision=0)

                    i2v_generate_btn = gr.Button("Generate Video", variant="primary", size="lg")

                with gr.Column(scale=1):
                    i2v_output = gr.Video(label="Generated Video", elem_id="output_video")

        # Tab 3: Text-to-Image-to-Video
        with gr.Tab("Text to Image to Video", id="t2i2v_tab"):
            gr.Markdown("### 🎯 Complete Pipeline: Generate an image from text, then convert it to video")

            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("#### Step 1: Image Generation Settings")
                    t2i2v_img_prompt = gr.Textbox(
                        label="Image Prompt",
                        placeholder="Describe the image to generate...",
                        lines=3
                    )
                    t2i2v_img_negative = gr.Textbox(
                        label="Negative Prompt",
                        value="nsfw, (low quality, worst quality:1.2), very displeasing, 3d, watermark, signature, ugly, poorly drawn",
                        lines=2
                    )

                    with gr.Row():
                        t2i2v_img_width = gr.Slider(label="Width", minimum=256, maximum=MAX_IMAGE_SIZE, step=32, value=1024)
                        t2i2v_img_height = gr.Slider(label="Height", minimum=256, maximum=MAX_IMAGE_SIZE, step=32, value=1024)

                    with gr.Accordion("Image Advanced Settings", open=False):
                        t2i2v_img_seed = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=0)
                        t2i2v_img_randomize = gr.Checkbox(label="Randomize seed", value=True)
                        t2i2v_img_guidance = gr.Slider(label="Guidance Scale", minimum=0.0, maximum=20.0, step=0.1, value=7)
                        t2i2v_img_steps = gr.Slider(label="Inference Steps", minimum=1, maximum=50, step=1, value=28)

                    gr.Markdown("#### Step 2: Video Generation Settings")
                    t2i2v_video_prompt = gr.Textbox(
                        label="Video Prompt",
                        value="Generate a video with smooth and natural movement. Objects should have visible motion while maintaining fluid transitions.",
                        lines=3
                    )
                    t2i2v_video_duration = gr.Slider(
                        label="Duration (seconds)",
                        minimum=round(MIN_FRAMES_MODEL/FIXED_FPS, 1),
                        maximum=round(MAX_FRAMES_MODEL/FIXED_FPS, 1),
                        step=0.1,
                        value=2.0
                    )

                    # Video size dropdown (auto-adjusted after image generation)
                    t2i2v_video_size = gr.Dropdown(
                        label="Video Output Resolution",
                        choices=list(SUPPORTED_SIZES[TASK_NAME]),
                        value="704*1280",
                        info="This will be auto-adjusted based on generated image aspect ratio"
                    )

                    with gr.Accordion("Video Advanced Settings", open=False):
                        t2i2v_video_steps = gr.Slider(label="Sampling Steps", minimum=10, maximum=50, value=38, step=1)
                        t2i2v_video_guide = gr.Slider(label="Guidance Scale", minimum=1.0, maximum=10.0, value=cfg.sample_guide_scale, step=0.1)
                        t2i2v_video_shift = gr.Slider(label="Sample Shift", minimum=1.0, maximum=20.0, value=cfg.sample_shift, step=0.1)
                        t2i2v_video_seed = gr.Number(label="Seed (-1 for random)", value=-1, precision=0)

                    t2i2v_generate_btn = gr.Button("Generate Image → Video", variant="primary", size="lg")

                with gr.Column(scale=1):
                    gr.Markdown("#### Results")
                    t2i2v_image_output = gr.Image(label="Generated Image", elem_id="generated_image")
                    t2i2v_video_output = gr.Video(label="Generated Video", elem_id="output_video")
                    with gr.Row():
                        t2i2v_seed_output = gr.Number(label="Image Seed Used", interactive=False)
                        t2i2v_size_output = gr.Textbox(label="Video Size Used", interactive=False)

    # Event handlers

    # Tab 1: Text-to-Image
    t2i_generate_btn.click(
        fn=generate_image,
        inputs=[
            t2i_prompt, t2i_negative_prompt, t2i_seed, t2i_randomize_seed,
            t2i_width, t2i_height, t2i_guidance_scale, t2i_num_steps
        ],
        outputs=[t2i_output, t2i_seed_output]
    )

    # Tab 2: Image-to-Video
    # Connect template buttons. The click handler receives the current prompt
    # text as its positional argument, so the template must be bound as a
    # keyword default: the original `lambda t=template, p=i2v_prompt: ...` let
    # the incoming prompt value overwrite `t` and passed the component object
    # itself as the prompt.
    for name, (btn, template) in template_buttons.items():
        btn.click(
            fn=lambda p, t=template: apply_template(t, p),
            inputs=[i2v_prompt],
            outputs=i2v_prompt
        )

    # Auto-select best size when image is uploaded
    def handle_image_upload(image):
        if image is None:
            return gr.update()
        pil_image = Image.fromarray(image).convert("RGB")
        available_sizes = list(SUPPORTED_SIZES[TASK_NAME])
        best_size = select_best_size_for_image(pil_image, available_sizes)
        return gr.update(value=best_size)

    i2v_image.upload(
        fn=handle_image_upload,
        inputs=[i2v_image],
        outputs=[i2v_size]
    )

    i2v_generate_btn.click(
        fn=generate_video,
        inputs=[
            i2v_image, i2v_prompt, i2v_size, i2v_duration,
            i2v_steps, i2v_guide_scale, i2v_shift, i2v_seed
        ],
        outputs=i2v_output
    )

    # Tab 3: Text-to-Image-to-Video
    t2i2v_generate_btn.click(
        fn=generate_image_to_video,
        inputs=[
            t2i2v_img_prompt, t2i2v_img_negative, t2i2v_img_seed, t2i2v_img_randomize,
            t2i2v_img_width, t2i2v_img_height, t2i2v_img_guidance, t2i2v_img_steps,
            t2i2v_video_prompt, t2i2v_video_size, t2i2v_video_duration,
            t2i2v_video_steps, t2i2v_video_guide, t2i2v_video_shift, t2i2v_video_seed
        ],
        outputs=[t2i2v_image_output, t2i2v_video_output, t2i2v_seed_output, t2i2v_size_output]
    )

    # Examples
    gr.Examples(
        examples=[
            ["A majestic lion sitting on a rock at sunset, golden hour lighting, photorealistic", "Generate a video with the lion slowly turning its head and mane flowing in the wind"],
            ["A futuristic cyberpunk city with neon lights and flying cars", "Cinematic shot with smooth camera movement through the city streets"],
            ["A serene Japanese garden with cherry blossoms and a koi pond", "Gentle breeze causing cherry blossoms to fall, ripples in the pond"],
        ],
        inputs=[t2i2v_img_prompt, t2i2v_video_prompt],
        label="Example Prompts"
    )

if __name__ == "__main__":
    demo.launch()