import gradio as gr
import numpy as np
import torch

from video_diffusion.stable_diffusion_video.stable_diffusion_pipeline import StableDiffusionWalkPipeline
from video_diffusion.utils.model_list import stable_model_list


class StableDiffusionText2VideoGenerator:
    def __init__(self):
        self.pipe = None

    def load_model(self, model_path):
        # Load the pipeline once and cache it; later calls reuse self.pipe.
        if self.pipe is None:
            self.pipe = StableDiffusionWalkPipeline.from_pretrained(
                model_path,
                torch_dtype=torch.float16,
                revision="fp16",
            )

        self.pipe.to("cuda")
        self.pipe.enable_xformers_memory_efficient_attention()
        self.pipe.enable_attention_slicing()
        return self.pipe

    def generate_video(
        self,
        model_path: str,
        first_prompts: str,
        second_prompts: str,
        negative_prompt: str,
        num_interpolation_steps: int,
        guidance_scale: float,
        num_inference_step: int,
        height: int,
        width: int,
        upsample: bool,
        fps: int,
    ):
        # Draw a random seed for each endpoint prompt so every run walks a
        # different path through latent space.
        first_seed = np.random.randint(0, 100000)
        second_seed = np.random.randint(0, 100000)
        seeds = [first_seed, second_seed]
        prompts = [first_prompts, second_prompts]

        pipe = self.load_model(model_path=model_path)
        output_video = pipe.walk(
            prompts=prompts,
            num_interpolation_steps=int(num_interpolation_steps),
            height=height,
            width=width,
            guidance_scale=guidance_scale,
            num_inference_steps=num_inference_step,
            negative_prompt=negative_prompt,
            seeds=seeds,
            upsample=upsample,
            fps=fps,
        )
        return output_video
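
# Headless usage sketch (assumption, not part of the original file): the
# generator can also be driven without the Gradio UI. The prompt strings and
# settings below are illustrative only.
#
#   generator = StableDiffusionText2VideoGenerator()
#   video_path = generator.generate_video(
#       model_path=stable_model_list[0],
#       first_prompts="a sunrise over the ocean",
#       second_prompts="a starry night sky",
#       negative_prompt="blurry, low quality",
#       num_interpolation_steps=10,
#       guidance_scale=8.5,
#       num_inference_step=30,
#       height=512,
#       width=512,
#       upsample=False,
#       fps=10,
#   )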


def app():
    with gr.Blocks() as demo:
        with gr.Row():
            with gr.Column():
                stable_text2video_first_prompt = gr.Textbox(
                    lines=1,
                    placeholder="First Prompt",
                    show_label=False,
                )
                stable_text2video_second_prompt = gr.Textbox(
                    lines=1,
                    placeholder="Second Prompt",
                    show_label=False,
                )
                stable_text2video_negative_prompt = gr.Textbox(
                    lines=1,
                    placeholder="Negative Prompt",
                    show_label=False,
                )
                with gr.Row():
                    with gr.Column():
                        stable_text2video_model_path = gr.Dropdown(
                            choices=stable_model_list,
                            label="Stable Model List",
                            value=stable_model_list[0],
                        )
                        stable_text2video_guidance_scale = gr.Slider(
                            minimum=0,
                            maximum=15,
                            step=0.5,
                            value=8.5,
                            label="Guidance Scale",
                        )
                        stable_text2video_num_inference_steps = gr.Slider(
                            minimum=1,
                            maximum=100,
                            step=1,
                            value=30,
                            label="Number of Inference Steps",
                        )
                        stable_text2video_fps = gr.Slider(
                            minimum=1,
                            maximum=60,
                            step=1,
                            value=10,
                            label="FPS",
                        )
                with gr.Row():
                    with gr.Column():
                        stable_text2video_num_interpolation_steps = gr.Number(
                            value=10,
                            label="Number of Interpolation Steps",
                        )
                        stable_text2video_height = gr.Slider(
                            minimum=1,
                            maximum=1000,
                            step=1,
                            value=512,
                            label="Height",
                        )
                        stable_text2video_width = gr.Slider(
                            minimum=1,
                            maximum=1000,
                            step=1,
                            value=512,
                            label="Width",
                        )
                        stable_text2video_upsample = gr.Checkbox(
                            label="Upsample",
                            value=False,
                        )

                text2video_generate = gr.Button(value="Generate")

            with gr.Column():
                text2video_output = gr.Video(label="Output")

        text2video_generate.click(
            fn=StableDiffusionText2VideoGenerator().generate_video,
            inputs=[
                stable_text2video_model_path,
                stable_text2video_first_prompt,
                stable_text2video_second_prompt,
                stable_text2video_negative_prompt,
                stable_text2video_num_interpolation_steps,
                stable_text2video_guidance_scale,
                stable_text2video_num_inference_steps,
                stable_text2video_height,
                stable_text2video_width,
                stable_text2video_upsample,
                stable_text2video_fps,
            ],
            outputs=text2video_output,
        )

    return demo
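

# Usage sketch (assumption: the __main__ guard below is not part of the
# original file). app() returns the Blocks it builds, so the interface can
# be launched standalone or mounted inside a larger Gradio app.
if __name__ == "__main__":
    app().launch()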