Commit e562afd · Parent(s): 3af5e96

update app.py

app.py CHANGED
@@ -130,7 +130,7 @@ class FoleyController:
         prompt_textbox,
         negative_prompt_textbox,
         ip_adapter_scale,
-
+        temporal_scale,
         sampler_dropdown,
         sample_step_slider,
         cfg_scale_slider,
@@ -154,7 +154,7 @@ class FoleyController:
         if seed_textbox != "":
             torch.manual_seed(int(seed_textbox))
             generator.manual_seed(int(seed_textbox))
-        max_frame_nums =
+        max_frame_nums = 150
         frames, duration  = read_frames_with_moviepy(input_video, max_frame_nums=max_frame_nums)
         if duration >= 10:
             duration = 10
@@ -169,7 +169,9 @@ class FoleyController:
         time_condition = time_condition + [-1] * (1024 - len(time_condition))
         # w -> b c h w
         time_condition = torch.FloatTensor(time_condition).unsqueeze(0).unsqueeze(0).unsqueeze(0).repeat(1, 1, 256, 1)
-
+
+        # Note that clip need fewer frames
+        frames = frames[::10]
         images = self.image_processor(images=frames, return_tensors="pt").to(device)
         image_embeddings = self.image_encoder(**images).image_embeds
         image_embeddings = torch.mean(image_embeddings, dim=0, keepdim=True).unsqueeze(0).unsqueeze(0)
@@ -253,18 +255,20 @@ with gr.Blocks(css=css) as demo:
                     negative_prompt_textbox = gr.Textbox(value=N_PROMPT, label="Negative prompt", lines=1)

                 with gr.Row():
-
-
-
-
-                        )
-
-
-
-
-
-
-
+                    ip_adapter_scale = gr.Slider(label="Visual Content Scale", value=1.0, minimum=0, maximum=1)
+                    temporal_scale = gr.Slider(label="Temporal Align Scale", value=0.2, minimum=0., maximum=1.0)
+
+                with gr.Accordion("Sampling Settings", open=False):
+                    with gr.Row():
+                        sampler_dropdown = gr.Dropdown(
+                            label="Sampling method",
+                            choices=list(scheduler_dict.keys()),
+                            value=list(scheduler_dict.keys())[0],
+                        )
+                        sample_step_slider = gr.Slider(
+                            label="Sampling steps", value=25, minimum=10, maximum=100, step=1
+                        )
+                    cfg_scale_slider = gr.Slider(label="CFG Scale", value=7.5, minimum=0, maximum=20)

                 with gr.Row():
                     seed_textbox = gr.Textbox(label="Seed", value=42)
@@ -273,7 +277,12 @@ with gr.Blocks(css=css) as demo:

                 generate_button = gr.Button(value="Generate", variant="primary")

-
+            with gr.Column():
+                result_video = gr.Video(label="Generated Audio", interactive=False)
+                gr.Markdown('**Tips**: <br> \
+                            1. With strong temporal visual cues in input video, you can scale up the **Temporal Align Scale**. <br>\
+                            2. **Visual content scale** is the level of semantic alignment with visual content. \
+                ')

         generate_button.click(
             fn=controller.foley,
@@ -282,7 +291,7 @@ with gr.Blocks(css=css) as demo:
                 prompt_textbox,
                 negative_prompt_textbox,
                 ip_adapter_scale,
-
+                temporal_scale,
                 sampler_dropdown,
                 sample_step_slider,
                 cfg_scale_slider,
@@ -292,13 +301,22 @@ with gr.Blocks(css=css) as demo:
         )

         gr.Examples(
-            examples= [
-                ['examples/videos/51701454.mp4', 'seagulls', '', 1.0, 'DDIM', 25, 7.5, 10014024412012338098],
-                ['examples/videos/42.mp4', '', '', 1.0, 'DDIM', 25, 7.5, 42],
-                ['examples/videos/1.mp4', '', '', 1.0, 'DDIM', 25, 7.5, 93493458],
-                ['examples/videos/2.mp4', '', '', 1.0, 'DDIM', 25, 7.5, 16520432],
+            # examples= [
+            #     ['examples/videos/51701454.mp4', 'seagulls', '', 1.0, 'DDIM', 25, 7.5, 10014024412012338098],
+            #     ['examples/videos/42.mp4', '', '', 1.0, 'DDIM', 25, 7.5, 42],
+            #     ['examples/videos/1.mp4', '', '', 1.0, 'DDIM', 25, 7.5, 93493458],
+            #     ['examples/videos/2.mp4', '', '', 1.0, 'DDIM', 25, 7.5, 16520432],
+            # ],
+            examples=[
+                ['examples/input/case1.mp4', '', '', 1.0, 0.2, 'DDIM', 25, 7.5, 33817921],
+                ['examples/input/case3.mp4', '', '', 1.0, 0.2,'DDIM', 25, 7.5, 94667578],
+                ['examples/input/case5.mp4', '', '', 0.75, 0.2,'DDIM', 25, 7.5, 92890876],
+                ['examples/input/case6.mp4', '', '', 1.0, 0.2, 'DDIM', 25, 7.5, 77015909],
             ],
-            inputs=[init_img,prompt_textbox,negative_prompt_textbox,ip_adapter_scale,sampler_dropdown,sample_step_slider,cfg_scale_slider,seed_textbox],
+            inputs=[init_img,prompt_textbox,negative_prompt_textbox,ip_adapter_scale,temporal_scale,sampler_dropdown,sample_step_slider,cfg_scale_slider,seed_textbox],
+            cache_examples=True,
+            outputs=[result_video],
+            fn=controller.foley,
         )

     demo.queue(10)
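The main functional change in the controller is the frame subsampling before image encoding: the video reader now caps at 150 frames for the audio branch, but only every tenth frame is passed to the image encoder, whose per-frame embeddings are then mean-pooled. A minimal sketch of that path, assuming the image_processor / image_encoder pair are the standard CLIP components from transformers (the diff shows only the attribute names, so the classes and checkpoint below are assumptions):

import torch
from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection

# Assumed CLIP components; app.py only exposes self.image_processor / self.image_encoder.
device = "cuda" if torch.cuda.is_available() else "cpu"
processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-base-patch32")
encoder = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-base-patch32").to(device)

def pooled_clip_embedding(frames):
    """frames: list of H x W x 3 uint8 arrays read from the input video."""
    frames = frames[::10]  # CLIP needs far fewer frames than the audio branch
    inputs = processor(images=frames, return_tensors="pt").to(device)
    with torch.no_grad():
        embeds = encoder(**inputs).image_embeds  # (n_frames, proj_dim)
    # mean-pool over frames, then add the two leading dims used downstream
    return embeds.mean(dim=0, keepdim=True).unsqueeze(0).unsqueeze(0)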

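The gr.Examples change is also worth noting: once cache_examples=True is set, Gradio needs fn and outputs so it can run each example once and serve the stored result instead of recomputing it per click. A standalone sketch of the same pattern (the stand-in function is hypothetical; the example path is taken from the diff):

import gradio as gr

def fake_foley(video, prompt):
    # Hypothetical stand-in for controller.foley: just echoes the input video.
    return video

with gr.Blocks() as demo:
    video_in = gr.Video(label="Input video")
    prompt = gr.Textbox(label="Prompt")
    video_out = gr.Video(label="Generated Audio", interactive=False)
    gr.Examples(
        examples=[["examples/input/case1.mp4", ""]],
        inputs=[video_in, prompt],
        outputs=[video_out],
        fn=fake_foley,
        cache_examples=True,  # run fn on each example once and cache the outputs
    )

demo.launch()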