Merge branch 'main' of https://huggingface.co/spaces/doodle-med/Audio2KineticVid
Files changed:
- README.md            +0  -8
- app.py               +18 -20
- requirements.txt     +1  -1
- utils/prompt_gen.py  +2  -0
- utils/transcribe.py  +2  -0
- utils/video_gen.py   +3  -0
README.md
CHANGED
@@ -1,11 +1,3 @@
----
-license: apache-2.0
-title: Audio2KineticVid
-sdk: gradio
-emoji: π
-colorFrom: yellow
-colorTo: red
----
 # Audio2KineticVid
 
 Audio2KineticVid is a comprehensive tool that converts an audio track (e.g., a song) into a dynamic music video with AI-generated scenes and synchronized kinetic typography (animated subtitles). Everything runs locally using open-source models – no external APIs or paid services required.
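For orientation, the three utils modules touched below form the pipeline the README describes. What follows is a minimal, hypothetical wiring of the functions whose signatures appear in these diffs; the real orchestration (process_audio in app.py) adds model selection, styling, and subtitle rendering on top, and the return-value plumbing here is an assumption:

# Hypothetical end-to-end wiring, using only functions visible in
# this commit's diffs. Return-value plumbing is assumed.
from utils.transcribe import transcribe_audio
from utils.prompt_gen import generate_scene_prompts
from utils.video_gen import create_video_segments

def make_music_video(audio_path):
    # 1. Lyric transcription with timing (OpenAI Whisper).
    result = transcribe_audio(audio_path, model_size="medium.en")
    segments = result["segments"]  # standard Whisper result layout
    # 2. One visual scene description per lyric segment (local LLM).
    prompts = generate_scene_prompts(segments)
    # 3. Generate an image per scene and animate it into a clip.
    return create_video_segments(segments, prompts)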
app.py
CHANGED
@@ -7,7 +7,6 @@ import gradio as gr
 import torch
 from PIL import Image
 import time
-import spaces
 
 # Import pipeline modules
 from utils.transcribe import transcribe_audio, list_available_whisper_models
@@ -71,7 +70,6 @@ DEFAULT_STYLE_SUFFIX = "cinematic, 35 mm, shallow depth of field, film grain"
 IMAGE_MODES = ["Independent", "Consistent (Img2Img)"]
 DEFAULT_IMAGE_MODE = "Independent"
 
-@spaces.GPU
 def process_audio(
     audio_path,
     whisper_model,
@@ -310,7 +308,7 @@ with gr.Blocks(title="Audio → Kinetic-Subtitle Music Video", theme=gr.themes.S
         audio_input = gr.Audio(
             label="🎵 Upload Audio Track",
             type="filepath",
-
+            info="Upload your music file. For best results, use clear audio with distinct vocals."
         )
     with gr.Column():
         # Quick settings panel
@@ -319,7 +317,7 @@ with gr.Blocks(title="Audio → Kinetic-Subtitle Music Video", theme=gr.themes.S
             choices=["Fast (512x288)", "Balanced (1024x576)", "High Quality (1280x720)"],
            value="Balanced (1024x576)",
             label="Quality Preset",
-
+            info="Higher quality = better results but slower generation"
         )
 
     # Model selection tabs
@@ -332,26 +330,26 @@ with gr.Blocks(title="Audio → Kinetic-Subtitle Music Video", theme=gr.themes.S
             label="🎤 Transcription Model (Whisper)",
             choices=WHISPER_MODELS,
             value=DEFAULT_WHISPER_MODEL,
-
+            info="Larger models are more accurate but slower. 'medium.en' is recommended for English."
         )
         llm_dropdown = gr.Dropdown(
             label="🧠 Scene Description Model (LLM)",
             choices=LLM_MODELS,
             value=DEFAULT_LLM_MODEL,
-
+            info="Language model to generate visual scene descriptions from lyrics."
         )
     with gr.Column():
         image_dropdown = gr.Dropdown(
             label="🎨 Image Generation Model",
             choices=IMAGE_MODELS,
             value=DEFAULT_IMAGE_MODEL,
-
+            info="Stable Diffusion model for generating scene images."
         )
         video_dropdown = gr.Dropdown(
             label="🎬 Video Animation Model",
             choices=VIDEO_MODELS,
             value=DEFAULT_VIDEO_MODEL,
-
+            info="Model to animate still images into video clips."
         )
 
 with gr.TabItem("✍️ Scene Prompting"):
@@ -361,7 +359,7 @@ with gr.Blocks(title="Audio → Kinetic-Subtitle Music Video", theme=gr.themes.S
         label="LLM Prompt Template",
         value=DEFAULT_PROMPT_TEMPLATE,
         lines=6,
-
+        info="Template for generating scene descriptions. Use {lyrics}, {max_words}, and {max_sentences} as placeholders."
     )
     with gr.Row():
         max_words_input = gr.Slider(
@@ -370,7 +368,7 @@ with gr.Blocks(title="Audio → Kinetic-Subtitle Music Video", theme=gr.themes.S
             maximum=100,
             step=5,
             value=DEFAULT_MAX_WORDS,
-
+            info="Limit words in each scene description (more words = more detailed scenes)."
         )
         max_sentences_input = gr.Slider(
             label="Max Sentences per Scene",
@@ -378,12 +376,12 @@ with gr.Blocks(title="Audio → Kinetic-Subtitle Music Video", theme=gr.themes.S
             maximum=5,
             step=1,
             value=DEFAULT_MAX_SENTENCES,
-
+            info="Limit sentences per scene (1-2 recommended for music videos)."
        )
     style_suffix_input = gr.Textbox(
         label="Visual Style Keywords",
         value=DEFAULT_STYLE_SUFFIX,
-
+        info="Style keywords added to all scenes for consistent visual style (e.g., 'cinematic, vibrant colors')."
     )
 
 with gr.TabItem("🎬 Video Settings"):
@@ -394,32 +392,32 @@ with gr.Blocks(title="Audio → Kinetic-Subtitle Music Video", theme=gr.themes.S
         label="💪 Subtitle Animation Style",
         choices=template_choices,
         value=DEFAULT_TEMPLATE,
-
+        info="Choose the kinetic subtitle animation style."
     )
     res_dropdown = gr.Dropdown(
         label="📺 Video Resolution",
         choices=["512x288", "1024x576", "1280x720"],
         value=DEFAULT_RESOLUTION,
-
+        info="Higher resolution = better quality but much slower generation."
     )
     with gr.Row():
         fps_input = gr.Textbox(
             label="🎞️ Video FPS",
             value=DEFAULT_FPS_MODE,
-
+            info="Frames per second. Use 'Auto' to match lyric timing, or set fixed value (e.g., '24', '30')."
         )
         seed_input = gr.Number(
             label="🌱 Random Seed",
             value=DEFAULT_SEED,
             precision=0,
-
+            info="Set seed for reproducible results (0 = random). Use same seed to recreate results."
         )
     with gr.Row():
         image_mode_input = gr.Radio(
             label="🖼️ Scene Generation Mode",
             choices=IMAGE_MODES,
             value=DEFAULT_IMAGE_MODE,
-
+            info="Independent: each scene is unique. Consistent: scenes influence each other for style continuity."
         )
         strength_slider = gr.Slider(
             label="🎯 Style Consistency Strength",
@@ -428,7 +426,7 @@ with gr.Blocks(title="Audio → Kinetic-Subtitle Music Video", theme=gr.themes.S
             step=0.05,
             value=0.5,
             visible=False,
-
+            info="How much each scene influences the next (lower = more influence, higher = more variety)."
         )
         crossfade_slider = gr.Slider(
             label="🔀 Scene Transition Duration",
@@ -436,7 +434,7 @@ with gr.Blocks(title="Audio → Kinetic-Subtitle Music Video", theme=gr.themes.S
             maximum=1.0,
             step=0.05,
             value=DEFAULT_CROSSFADE,
-
+            info="Smooth crossfade between scenes in seconds (0 = hard cuts, 0.25 = subtle blend)."
         )
 
 # Quick preset handling
@@ -711,4 +709,4 @@ with gr.Blocks(title="Audio → Kinetic-Subtitle Music Video", theme=gr.themes.S
 if __name__ == "__main__":
     # Uncomment for custom hosting options
     # demo.launch(server_name='0.0.0.0', server_port=7860)
-    demo.launch(
+    demo.launch()
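Two patterns account for nearly all of the app.py churn: the module-level import spaces and the @spaces.GPU decorator on process_audio are dropped (the decorator reappears on the finer-grained GPU functions in utils/, below), and every control gains Gradio's info= keyword, which renders helper text under the component label. A minimal self-contained sketch of the info= pattern, reusing values taken from the diff:

import gradio as gr

with gr.Blocks() as demo:
    # info= renders small helper text beneath the label; this commit
    # applies the same pattern to every control in app.py.
    quality = gr.Dropdown(
        label="Quality Preset",
        choices=["Fast (512x288)", "Balanced (1024x576)", "High Quality (1280x720)"],
        value="Balanced (1024x576)",
        info="Higher quality = better results but slower generation",
    )

if __name__ == "__main__":
    demo.launch()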
requirements.txt
CHANGED
@@ -5,7 +5,7 @@ accelerate>=0.30
 diffusers>=0.34
 torchaudio
 openai-whisper
-pyannote.audio==3.2.
+pyannote.audio==3.2.1
 pycaps @ git+https://github.com/francozanardi/pycaps.git
 ffmpeg-python
 auto-gptq==0.7.1
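The only functional change here is completing a broken version pin: "pyannote.audio==3.2." ends in a trailing dot, which is not a valid PEP 440 specifier, so pip would reject the whole requirements file; "==3.2.1" restores an installable pin.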
utils/prompt_gen.py
CHANGED
@@ -6,6 +6,7 @@ try:
 except ImportError:
     AutoGPTQForCausalLM = None
     from transformers import AutoModelForCausalLM
+import spaces
 
 # Cache models and tokenizers
 _llm_cache = {}  # {model_name: (model, tokenizer)}
@@ -51,6 +52,7 @@ def _load_llm(model_name):
 
     return _llm_cache[model_name]
 
+@spaces.GPU
 def generate_scene_prompts(
     segments,
     llm_model="TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ",
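The context lines show the optional-dependency pattern prompt_gen.py relies on: auto-gptq is preferred for the quantized Mixtral checkpoint, with plain transformers as a fallback. A sketch of that pattern follows; the _load body is an assumption about how the two loaders are typically called (the module's real _load_llm also caches into _llm_cache):

try:
    from auto_gptq import AutoGPTQForCausalLM
except ImportError:
    AutoGPTQForCausalLM = None
    from transformers import AutoModelForCausalLM

def _load(model_name):
    # Prefer the GPTQ loader when auto-gptq is installed; otherwise
    # fall back to a standard transformers checkpoint load.
    if AutoGPTQForCausalLM is not None:
        return AutoGPTQForCausalLM.from_quantized(model_name, device="cuda:0")
    return AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")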
utils/transcribe.py
CHANGED
@@ -1,4 +1,5 @@
 import whisper
+import spaces
 
 # Cache loaded whisper models to avoid reloading for each request
 _model_cache = {}
@@ -7,6 +8,7 @@ def list_available_whisper_models():
     """Return list of available Whisper models"""
     return ["tiny", "base", "small", "medium", "medium.en", "large", "large-v2"]
 
+@spaces.GPU
 def transcribe_audio(audio_path: str, model_size: str = "medium.en"):
     """
     Transcribe the given audio file using OpenAI Whisper and return the result dictionary.
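With the decorator on transcribe_audio rather than on all of process_audio, a GPU is requested only while transcription itself runs. A minimal sketch of the caching pattern the module's context lines imply; word_timestamps=True is an assumption (kinetic subtitles need word-level timing), not something this diff shows:

import whisper
import spaces

_model_cache = {}

@spaces.GPU  # a GPU is attached only for the duration of this call
def transcribe_audio(audio_path, model_size="medium.en"):
    # Load each Whisper model once and reuse it across requests.
    if model_size not in _model_cache:
        _model_cache[model_size] = whisper.load_model(model_size)
    return _model_cache[model_size].transcribe(audio_path, word_timestamps=True)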
utils/video_gen.py
CHANGED
@@ -11,6 +11,7 @@ from diffusers import (
 from PIL import Image
 import numpy as np
 import time
+import spaces
 
 # Global pipelines cache
 _model_cache = {}
@@ -96,6 +97,7 @@ def _load_video_pipeline(model_name):
 
     return _model_cache[model_name]
 
+@spaces.GPU
 def preview_image_generation(prompt, image_model="stabilityai/stable-diffusion-xl-base-1.0", width=1024, height=576, seed=None):
     """
     Generate a preview image from a prompt
@@ -125,6 +127,7 @@ def preview_image_generation(prompt, image_model="stabilityai/stable-diffusion-x
 
     return image
 
+@spaces.GPU
 def create_video_segments(
     segments,
     scene_prompts,
|