Merge branch 'main' of https://huggingface.co/spaces/doodle-med/Audio2KineticVid
Files changed:
- README.md            +0  -8
- app.py               +18 -20
- requirements.txt     +1  -1
- utils/prompt_gen.py  +2  -0
- utils/transcribe.py  +2  -0
- utils/video_gen.py   +3  -0
README.md
CHANGED
@@ -1,11 +1,3 @@
----
-license: apache-2.0
-title: Audio2KineticVid
-sdk: gradio
-emoji: π
-colorFrom: yellow
-colorTo: red
----
 # Audio2KineticVid
 
 Audio2KineticVid is a comprehensive tool that converts an audio track (e.g., a song) into a dynamic music video with AI-generated scenes and synchronized kinetic typography (animated subtitles). Everything runs locally using open-source models – no external APIs or paid services required.
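For orientation, the three utils modules touched below form the pipeline the README describes. What follows is a minimal, hypothetical wiring of the functions whose signatures appear in these diffs; the real orchestration (process_audio in app.py) adds model selection, styling, and subtitle rendering on top, and the return-value plumbing here is an assumption:

# Hypothetical end-to-end wiring, using only functions visible in
# this commit's diffs. Return-value plumbing is assumed.
from utils.transcribe import transcribe_audio
from utils.prompt_gen import generate_scene_prompts
from utils.video_gen import create_video_segments

def make_music_video(audio_path):
    # 1. Lyric transcription with timing (OpenAI Whisper).
    result = transcribe_audio(audio_path, model_size="medium.en")
    segments = result["segments"]  # standard Whisper result layout
    # 2. One visual scene description per lyric segment (local LLM).
    prompts = generate_scene_prompts(segments)
    # 3. Generate an image per scene and animate it into a clip.
    return create_video_segments(segments, prompts)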
app.py
CHANGED
@@ -7,7 +7,6 @@ import gradio as gr
 import torch
 from PIL import Image
 import time
-import spaces
 
 # Import pipeline modules
 from utils.transcribe import transcribe_audio, list_available_whisper_models
@@ -71,7 +70,6 @@ DEFAULT_STYLE_SUFFIX = "cinematic, 35 mm, shallow depth of field, film grain"
 IMAGE_MODES = ["Independent", "Consistent (Img2Img)"]
 DEFAULT_IMAGE_MODE = "Independent"
 
-@spaces.GPU
 def process_audio(
     audio_path,
     whisper_model,
@@ -310,7 +308,7 @@ with gr.Blocks(title="Audio → Kinetic-Subtitle Music Video", theme=gr.themes.S
         audio_input = gr.Audio(
             label="🎵 Upload Audio Track",
             type="filepath",
-
+            info="Upload your music file. For best results, use clear audio with distinct vocals."
         )
     with gr.Column():
         # Quick settings panel
@@ -319,7 +317,7 @@ with gr.Blocks(title="Audio → Kinetic-Subtitle Music Video", theme=gr.themes.S
             choices=["Fast (512x288)", "Balanced (1024x576)", "High Quality (1280x720)"],
            value="Balanced (1024x576)",
             label="Quality Preset",
-
+            info="Higher quality = better results but slower generation"
         )
 
     # Model selection tabs
@@ -332,26 +330,26 @@ with gr.Blocks(title="Audio → Kinetic-Subtitle Music Video", theme=gr.themes.S
             label="🎤 Transcription Model (Whisper)",
             choices=WHISPER_MODELS,
             value=DEFAULT_WHISPER_MODEL,
-
+            info="Larger models are more accurate but slower. 'medium.en' is recommended for English."
         )
         llm_dropdown = gr.Dropdown(
             label="🧠 Scene Description Model (LLM)",
             choices=LLM_MODELS,
             value=DEFAULT_LLM_MODEL,
-
+            info="Language model to generate visual scene descriptions from lyrics."
         )
     with gr.Column():
         image_dropdown = gr.Dropdown(
             label="🎨 Image Generation Model",
             choices=IMAGE_MODELS,
             value=DEFAULT_IMAGE_MODEL,
-
+            info="Stable Diffusion model for generating scene images."
         )
         video_dropdown = gr.Dropdown(
             label="🎬 Video Animation Model",
             choices=VIDEO_MODELS,
             value=DEFAULT_VIDEO_MODEL,
-
+            info="Model to animate still images into video clips."
         )
 
 with gr.TabItem("✍️ Scene Prompting"):
@@ -361,7 +359,7 @@ with gr.Blocks(title="Audio → Kinetic-Subtitle Music Video", theme=gr.themes.S
         label="LLM Prompt Template",
         value=DEFAULT_PROMPT_TEMPLATE,
         lines=6,
-
+        info="Template for generating scene descriptions. Use {lyrics}, {max_words}, and {max_sentences} as placeholders."
     )
     with gr.Row():
         max_words_input = gr.Slider(
@@ -370,7 +368,7 @@ with gr.Blocks(title="Audio → Kinetic-Subtitle Music Video", theme=gr.themes.S
             maximum=100,
             step=5,
             value=DEFAULT_MAX_WORDS,
-
+            info="Limit words in each scene description (more words = more detailed scenes)."
         )
         max_sentences_input = gr.Slider(
             label="Max Sentences per Scene",
@@ -378,12 +376,12 @@ with gr.Blocks(title="Audio → Kinetic-Subtitle Music Video", theme=gr.themes.S
             maximum=5,
             step=1,
             value=DEFAULT_MAX_SENTENCES,
-
+            info="Limit sentences per scene (1-2 recommended for music videos)."
        )
     style_suffix_input = gr.Textbox(
         label="Visual Style Keywords",
         value=DEFAULT_STYLE_SUFFIX,
-
+        info="Style keywords added to all scenes for consistent visual style (e.g., 'cinematic, vibrant colors')."
     )
 
 with gr.TabItem("🎬 Video Settings"):
@@ -394,32 +392,32 @@ with gr.Blocks(title="Audio → Kinetic-Subtitle Music Video", theme=gr.themes.S
         label="💪 Subtitle Animation Style",
         choices=template_choices,
         value=DEFAULT_TEMPLATE,
-
+        info="Choose the kinetic subtitle animation style."
     )
     res_dropdown = gr.Dropdown(
         label="📺 Video Resolution",
         choices=["512x288", "1024x576", "1280x720"],
         value=DEFAULT_RESOLUTION,
-
+        info="Higher resolution = better quality but much slower generation."
     )
     with gr.Row():
         fps_input = gr.Textbox(
             label="🎞️ Video FPS",
             value=DEFAULT_FPS_MODE,
-
+            info="Frames per second. Use 'Auto' to match lyric timing, or set fixed value (e.g., '24', '30')."
         )
         seed_input = gr.Number(
             label="🌱 Random Seed",
             value=DEFAULT_SEED,
             precision=0,
-
+            info="Set seed for reproducible results (0 = random). Use same seed to recreate results."
         )
     with gr.Row():
         image_mode_input = gr.Radio(
             label="🖼️ Scene Generation Mode",
             choices=IMAGE_MODES,
             value=DEFAULT_IMAGE_MODE,
-
+            info="Independent: each scene is unique. Consistent: scenes influence each other for style continuity."
         )
         strength_slider = gr.Slider(
             label="🎯 Style Consistency Strength",
@@ -428,7 +426,7 @@ with gr.Blocks(title="Audio → Kinetic-Subtitle Music Video", theme=gr.themes.S
             step=0.05,
             value=0.5,
             visible=False,
-
+            info="How much each scene influences the next (lower = more influence, higher = more variety)."
         )
         crossfade_slider = gr.Slider(
             label="🔀 Scene Transition Duration",
@@ -436,7 +434,7 @@ with gr.Blocks(title="Audio → Kinetic-Subtitle Music Video", theme=gr.themes.S
             maximum=1.0,
             step=0.05,
             value=DEFAULT_CROSSFADE,
-
+            info="Smooth crossfade between scenes in seconds (0 = hard cuts, 0.25 = subtle blend)."
         )
 
 # Quick preset handling
@@ -711,4 +709,4 @@ with gr.Blocks(title="Audio → Kinetic-Subtitle Music Video", theme=gr.themes.S
 if __name__ == "__main__":
     # Uncomment for custom hosting options
     # demo.launch(server_name='0.0.0.0', server_port=7860)
-    demo.launch(
+    demo.launch()
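Two patterns account for nearly all of the app.py churn: the module-level import spaces and the @spaces.GPU decorator on process_audio are dropped (the decorator reappears on the finer-grained GPU functions in utils/, below), and every control gains Gradio's info= keyword, which renders helper text under the component label. A minimal self-contained sketch of the info= pattern, reusing values taken from the diff:

import gradio as gr

with gr.Blocks() as demo:
    # info= renders small helper text beneath the label; this commit
    # applies the same pattern to every control in app.py.
    quality = gr.Dropdown(
        label="Quality Preset",
        choices=["Fast (512x288)", "Balanced (1024x576)", "High Quality (1280x720)"],
        value="Balanced (1024x576)",
        info="Higher quality = better results but slower generation",
    )

if __name__ == "__main__":
    demo.launch()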
requirements.txt
CHANGED
@@ -5,7 +5,7 @@ accelerate>=0.30
 diffusers>=0.34
 torchaudio
 openai-whisper
-pyannote.audio==3.2.
+pyannote.audio==3.2.1
 pycaps @ git+https://github.com/francozanardi/pycaps.git
 ffmpeg-python
 auto-gptq==0.7.1
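The only functional change here is completing a broken version pin: "pyannote.audio==3.2." ends in a trailing dot, which is not a valid PEP 440 specifier, so pip would reject the whole requirements file; "==3.2.1" restores an installable pin.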
utils/prompt_gen.py
CHANGED
@@ -6,6 +6,7 @@ try:
 except ImportError:
     AutoGPTQForCausalLM = None
     from transformers import AutoModelForCausalLM
+import spaces
 
 # Cache models and tokenizers
 _llm_cache = {}  # {model_name: (model, tokenizer)}
@@ -51,6 +52,7 @@ def _load_llm(model_name):
 
     return _llm_cache[model_name]
 
+@spaces.GPU
 def generate_scene_prompts(
     segments,
     llm_model="TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ",
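The context lines show the optional-dependency pattern prompt_gen.py relies on: auto-gptq is preferred for the quantized Mixtral checkpoint, with plain transformers as a fallback. A sketch of that pattern follows; the _load body is an assumption about how the two loaders are typically called (the module's real _load_llm also caches into _llm_cache):

try:
    from auto_gptq import AutoGPTQForCausalLM
except ImportError:
    AutoGPTQForCausalLM = None
    from transformers import AutoModelForCausalLM

def _load(model_name):
    # Prefer the GPTQ loader when auto-gptq is installed; otherwise
    # fall back to a standard transformers checkpoint load.
    if AutoGPTQForCausalLM is not None:
        return AutoGPTQForCausalLM.from_quantized(model_name, device="cuda:0")
    return AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")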
utils/transcribe.py
CHANGED
@@ -1,4 +1,5 @@
 import whisper
+import spaces
 
 # Cache loaded whisper models to avoid reloading for each request
 _model_cache = {}
@@ -7,6 +8,7 @@ def list_available_whisper_models():
     """Return list of available Whisper models"""
     return ["tiny", "base", "small", "medium", "medium.en", "large", "large-v2"]
 
+@spaces.GPU
 def transcribe_audio(audio_path: str, model_size: str = "medium.en"):
     """
     Transcribe the given audio file using OpenAI Whisper and return the result dictionary.
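With the decorator on transcribe_audio rather than on all of process_audio, a GPU is requested only while transcription itself runs. A minimal sketch of the caching pattern the module's context lines imply; word_timestamps=True is an assumption (kinetic subtitles need word-level timing), not something this diff shows:

import whisper
import spaces

_model_cache = {}

@spaces.GPU  # a GPU is attached only for the duration of this call
def transcribe_audio(audio_path, model_size="medium.en"):
    # Load each Whisper model once and reuse it across requests.
    if model_size not in _model_cache:
        _model_cache[model_size] = whisper.load_model(model_size)
    return _model_cache[model_size].transcribe(audio_path, word_timestamps=True)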
utils/video_gen.py
CHANGED
@@ -11,6 +11,7 @@ from diffusers import (
 from PIL import Image
 import numpy as np
 import time
+import spaces
 
 # Global pipelines cache
 _model_cache = {}
@@ -96,6 +97,7 @@ def _load_video_pipeline(model_name):
 
     return _model_cache[model_name]
 
+@spaces.GPU
 def preview_image_generation(prompt, image_model="stabilityai/stable-diffusion-xl-base-1.0", width=1024, height=576, seed=None):
     """
     Generate a preview image from a prompt
@@ -125,6 +127,7 @@ def preview_image_generation(prompt, image_model="stabilityai/stable-diffusion-x
 
     return image
 
+@spaces.GPU
 def create_video_segments(
     segments,
     scene_prompts,
|