import gradio as gr
import os
import subprocess
import gc
import random
from datetime import datetime

import torch
import ffmpeg
from moviepy.editor import VideoFileClip
from diffusers import AutoencoderKLCogVideoX, CogVideoXImageToVideoPipeline, CogVideoXTransformer3DModel
from diffusers.utils import export_to_video, load_image
from transformers import T5EncoderModel, T5Tokenizer
from huggingface_hub import hf_hub_download

# Help the CUDA allocator cope with fragmentation (must be set before the first CUDA allocation)
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

# Running as the public shared Space (inference disabled) vs. a user's own duplicate
is_shared_ui = "fffiloni/DimensionX" in os.environ.get('SPACE_ID', '')
is_gpu_associated = torch.cuda.is_available()

# Ensure 'checkpoints' directory exists
os.makedirs("checkpoints", exist_ok=True)

if not is_shared_ui and is_gpu_associated:
    # Download LoRA weights
    hf_hub_download(
        repo_id="wenqsun/DimensionX",
        filename="orbit_left_lora_weights.safetensors",
        local_dir="checkpoints"
    )
    hf_hub_download(
        repo_id="wenqsun/DimensionX",
        filename="orbit_up_lora_weights.safetensors",
        local_dir="checkpoints"
    )

    # Load models in the global scope, kept on CPU until inference
    model_id = "THUDM/CogVideoX-5b-I2V"
    transformer = CogVideoXTransformer3DModel.from_pretrained(model_id, subfolder="transformer", torch_dtype=torch.float16).to("cpu")
    text_encoder = T5EncoderModel.from_pretrained(model_id, subfolder="text_encoder", torch_dtype=torch.float16).to("cpu")
    vae = AutoencoderKLCogVideoX.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float16).to("cpu")
    tokenizer = T5Tokenizer.from_pretrained(model_id, subfolder="tokenizer")
    pipe = CogVideoXImageToVideoPipeline.from_pretrained(
        model_id,
        tokenizer=tokenizer,
        text_encoder=text_encoder,
        transformer=transformer,
        vae=vae,
        torch_dtype=torch.float16
    )

def calculate_resize_dimensions(width, height, max_width=1024):
    """Calculate new dimensions maintaining aspect ratio"""
    if width <= max_width:
        return width, height

    aspect_ratio = height / width
    new_width = max_width
    new_height = int(max_width * aspect_ratio)
    # Make height an even number for video encoding
    new_height = new_height - (new_height % 2)
    return new_width, new_height

def infer(image_path, prompt, orbit_type, progress=gr.Progress(track_tqdm=True)):
    # Move everything to CPU initially
    pipe.to("cpu")
    torch.cuda.empty_cache()

    # Load the image and get its original dimensions
    image = load_image(image_path)
    original_width, original_height = image.size
    print(f"IMAGE INPUT SIZE: {original_width} x {original_height}")

    # Calculate target dimensions maintaining aspect ratio
    target_width, target_height = calculate_resize_dimensions(original_width, original_height)
    print(f"TARGET SIZE: {target_width} x {target_height}")

    lora_path = "checkpoints/"
    weight_name = "orbit_left_lora_weights.safetensors" if orbit_type == "Left" else "orbit_up_lora_weights.safetensors"
    lora_rank = 256
    adapter_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Load LoRA weights on CPU
    pipe.load_lora_weights(lora_path, weight_name=weight_name, adapter_name=f"adapter_{adapter_timestamp}")
    pipe.fuse_lora(lora_scale=1 / lora_rank)

    try:
        # Move to GPU just before inference
        pipe.to("cuda")
        torch.cuda.empty_cache()

        prompt = f"{prompt}. High quality, ultrarealistic detail and breath-taking movie-like camera shot."
        seed = random.randint(0, 2**8 - 1)

        with torch.inference_mode():
            video = pipe(
                image,
                prompt,
                num_inference_steps=50,
                guidance_scale=7.0,
                use_dynamic_cfg=True,
                generator=torch.Generator(device="cpu").manual_seed(seed)
            )
    finally:
        # Ensure cleanup happens even if inference fails
        pipe.to("cpu")
        pipe.unfuse_lora()
        pipe.unload_lora_weights()
        torch.cuda.empty_cache()
        gc.collect()

    # Generate the initial output video
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    temp_path = f"output_{timestamp}_temp.mp4"
    final_path = f"output_{timestamp}.mp4"

    # First export the original video
    export_to_video(video.frames[0], temp_path, fps=8)

    try:
        # Resize to the target dimensions using ffmpeg via subprocess
        cmd = [
            'ffmpeg',
            '-i', temp_path,
            '-vf', f'scale={target_width}:{target_height}',
            '-c:v', 'libx264',
            '-preset', 'medium',
            '-crf', '23',
            '-y',  # Overwrite output file if it exists
            final_path
        ]
        subprocess.run(cmd, check=True, capture_output=True)
    except subprocess.CalledProcessError as e:
        print(f"FFmpeg error: {e.stderr.decode()}")
        raise e
    finally:
        if os.path.exists(temp_path):
            os.remove(temp_path)

    return final_path


# Set up Gradio UI
css = """
div#warning-duplicate {
    background-color: #ebf5ff;
    padding: 0 16px 16px;
    margin: 0px 0;
    color: #030303!important;
}
div#warning-duplicate > .gr-prose > h2, div#warning-duplicate > .gr-prose > p {
    color: #0f4592!important;
}
div#warning-duplicate strong {
    color: #0f4592;
}
p.actions {
    display: flex;
    align-items: center;
    margin: 20px 0;
}
div#warning-duplicate .actions a {
    display: inline-block;
    margin-right: 10px;
}
div#warning-setgpu {
    background-color: #fff4eb;
    padding: 0 16px 16px;
    margin: 0px 0;
    color: #030303!important;
}
div#warning-setgpu > .gr-prose > h2, div#warning-setgpu > .gr-prose > p {
    color: #92220f!important;
}
div#warning-setgpu a, div#warning-setgpu b {
    color: #91230f;
}
div#warning-setgpu p.actions > a {
    display: inline-block;
    background: #1f1f23;
    border-radius: 40px;
    padding: 6px 24px;
    color: antiquewhite;
    text-decoration: none;
    font-weight: 600;
    font-size: 1.2em;
}
div#warning-ready {
    background-color: #ecfdf5;
    padding: 0 16px 16px;
    margin: 0px 0;
    color: #030303!important;
}
div#warning-ready > .gr-prose > h2, div#warning-ready > .gr-prose > p {
    color: #057857!important;
}
.custom-color {
    color: #030303 !important;
}
"""

with gr.Blocks(css=css, analytics_enabled=False) as demo:
    with gr.Column(elem_id="col-container"):
        gr.Markdown("# DimensionX")
        gr.Markdown("### Create Any 3D and 4D Scenes from a Single Image with Controllable Video Diffusion")
        gr.HTML("""
            <p style="display: flex; column-gap: 8px;">
                <a href="https://huggingface.co/spaces/fffiloni/DimensionX?duplicate=true">Duplicate this Space</a>
                <a href="https://huggingface.co/fffiloni">Follow me on HF</a>
            </p>
""") with gr.Row(): with gr.Column(scale=1): if is_shared_ui: top_description = gr.HTML(f'''

Attention: this Space need to be duplicated to work

To make it work, duplicate the Space and run it on your own profile using a private GPU (L40s recommended).
A L40s costs US$1.80/h.

Duplicate this Space to start experimenting with this demo

''', elem_id="warning-duplicate") else: if(is_gpu_associated): top_description = gr.HTML(f'''

                        <div class="gr-prose">
                            <h2>You have successfully associated a GPU to this Space 🎉</h2>
                            <p class="custom-color">
                                You will be billed by the minute from when you activated the GPU until when it is turned off.
                            </p>
                        </div>
                    ''', elem_id="warning-ready")
                else:
                    top_description = gr.HTML(f'''
                        <div class="gr-prose">
                            <h2>You have successfully duplicated the DimensionX Space 🎉</h2>
                            <p class="custom-color">
                                There's only one step left before you can properly play with this demo: assign a GPU to it (via the Settings tab) and run the app below.
                                You will be billed by the minute from when you activate the GPU until when it is turned off.
                            </p>
                            <p class="actions">
                                <a href="https://huggingface.co/spaces/{os.environ.get("SPACE_ID", "")}/settings">🔥 &nbsp; Set recommended GPU</a>
                            </p>
                        </div>
                    ''', elem_id="warning-setgpu")

                image_in = gr.Image(label="Image Input", type="filepath")
                prompt = gr.Textbox(label="Prompt")
                orbit_type = gr.Radio(label="Orbit type", choices=["Left", "Up"], value="Left", interactive=True)
                submit_btn = gr.Button("Submit", interactive=False if is_shared_ui else True)

            with gr.Column(scale=2):
                video_out = gr.Video(label="Video output")
                examples = gr.Examples(
                    examples=[
                        [
                            "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/astronaut.jpg",
                            "An astronaut hatching from an egg, on the surface of the moon, the darkness and depth of space realised in the background.",
                            "Left",
                            "./examples/output_astronaut_left.mp4"
                        ],
                        [
                            "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/astronaut.jpg",
                            "An astronaut hatching from an egg, on the surface of the moon, the darkness and depth of space realised in the background.",
                            "Up",
                            "./examples/output_astronaut_up.mp4"
                        ]
                    ],
                    inputs=[image_in, prompt, orbit_type, video_out]
                )

        submit_btn.click(
            fn=infer,
            inputs=[image_in, prompt, orbit_type],
            outputs=[video_out]
        )

demo.queue().launch(show_error=True, show_api=False, ssr_mode=False)