doodle-med committed on
Commit
44fd71c
·
2 Parent(s): fb68449 2576aa4

Merge branch 'main' of https://huggingface.co/spaces/doodle-med/Audio2KineticVid

Files changed (6)
  1. README.md +0 -8
  2. app.py +18 -20
  3. requirements.txt +1 -1
  4. utils/prompt_gen.py +2 -0
  5. utils/transcribe.py +2 -0
  6. utils/video_gen.py +3 -0
README.md CHANGED
@@ -1,11 +1,3 @@
- ---
- license: apache-2.0
- title: Audio2KineticVid
- sdk: gradio
- emoji: 🚀
- colorFrom: yellow
- colorTo: red
- ---
  # Audio2KineticVid

  Audio2KineticVid is a comprehensive tool that converts an audio track (e.g., a song) into a dynamic music video with AI-generated scenes and synchronized kinetic typography (animated subtitles). Everything runs locally using open-source models – no external APIs or paid services required.
app.py CHANGED
@@ -7,7 +7,6 @@ import gradio as gr
  import torch
  from PIL import Image
  import time
- import spaces

  # Import pipeline modules
  from utils.transcribe import transcribe_audio, list_available_whisper_models
@@ -71,7 +70,6 @@ DEFAULT_STYLE_SUFFIX = "cinematic, 35 mm, shallow depth of field, film grain"
  IMAGE_MODES = ["Independent", "Consistent (Img2Img)"]
  DEFAULT_IMAGE_MODE = "Independent"

- @spaces.GPU
  def process_audio(
  audio_path,
  whisper_model,
@@ -310,7 +308,7 @@ with gr.Blocks(title="Audio → Kinetic-Subtitle Music Video", theme=gr.themes.S
  audio_input = gr.Audio(
  label="🎡 Upload Audio Track",
  type="filepath",
-
+ info="Upload your music file. For best results, use clear audio with distinct vocals."
  )
  with gr.Column():
  # Quick settings panel
@@ -319,7 +317,7 @@ with gr.Blocks(title="Audio → Kinetic-Subtitle Music Video", theme=gr.themes.S
  choices=["Fast (512x288)", "Balanced (1024x576)", "High Quality (1280x720)"],
  value="Balanced (1024x576)",
  label="Quality Preset",
-
+ info="Higher quality = better results but slower generation"
  )

  # Model selection tabs
@@ -332,26 +330,26 @@ with gr.Blocks(title="Audio → Kinetic-Subtitle Music Video", theme=gr.themes.S
  label="🎀 Transcription Model (Whisper)",
  choices=WHISPER_MODELS,
  value=DEFAULT_WHISPER_MODEL,
-
+ info="Larger models are more accurate but slower. 'medium.en' is recommended for English."
  )
  llm_dropdown = gr.Dropdown(
  label="🧠 Scene Description Model (LLM)",
  choices=LLM_MODELS,
  value=DEFAULT_LLM_MODEL,
-
+ info="Language model to generate visual scene descriptions from lyrics."
  )
  with gr.Column():
  image_dropdown = gr.Dropdown(
  label="🎨 Image Generation Model",
  choices=IMAGE_MODELS,
  value=DEFAULT_IMAGE_MODEL,
-
+ info="Stable Diffusion model for generating scene images."
  )
  video_dropdown = gr.Dropdown(
  label="🎬 Video Animation Model",
  choices=VIDEO_MODELS,
  value=DEFAULT_VIDEO_MODEL,
-
+ info="Model to animate still images into video clips."
  )

  with gr.TabItem("✍️ Scene Prompting"):
@@ -361,7 +359,7 @@ with gr.Blocks(title="Audio → Kinetic-Subtitle Music Video", theme=gr.themes.S
  label="LLM Prompt Template",
  value=DEFAULT_PROMPT_TEMPLATE,
  lines=6,
-
+ info="Template for generating scene descriptions. Use {lyrics}, {max_words}, and {max_sentences} as placeholders."
  )
  with gr.Row():
  max_words_input = gr.Slider(
@@ -370,7 +368,7 @@ with gr.Blocks(title="Audio → Kinetic-Subtitle Music Video", theme=gr.themes.S
  maximum=100,
  step=5,
  value=DEFAULT_MAX_WORDS,
-
+ info="Limit words in each scene description (more words = more detailed scenes)."
  )
  max_sentences_input = gr.Slider(
  label="Max Sentences per Scene",
@@ -378,12 +376,12 @@ with gr.Blocks(title="Audio → Kinetic-Subtitle Music Video", theme=gr.themes.S
  maximum=5,
  step=1,
  value=DEFAULT_MAX_SENTENCES,
-
+ info="Limit sentences per scene (1-2 recommended for music videos)."
  )
  style_suffix_input = gr.Textbox(
  label="Visual Style Keywords",
  value=DEFAULT_STYLE_SUFFIX,
-
+ info="Style keywords added to all scenes for consistent visual style (e.g., 'cinematic, vibrant colors')."
  )

  with gr.TabItem("🎬 Video Settings"):
@@ -394,32 +392,32 @@ with gr.Blocks(title="Audio → Kinetic-Subtitle Music Video", theme=gr.themes.S
  label="🎪 Subtitle Animation Style",
  choices=template_choices,
  value=DEFAULT_TEMPLATE,
-
+ info="Choose the kinetic subtitle animation style."
  )
  res_dropdown = gr.Dropdown(
  label="📺 Video Resolution",
  choices=["512x288", "1024x576", "1280x720"],
  value=DEFAULT_RESOLUTION,
-
+ info="Higher resolution = better quality but much slower generation."
  )
  with gr.Row():
  fps_input = gr.Textbox(
  label="🎞️ Video FPS",
  value=DEFAULT_FPS_MODE,
-
+ info="Frames per second. Use 'Auto' to match lyric timing, or set fixed value (e.g., '24', '30')."
  )
  seed_input = gr.Number(
  label="🌱 Random Seed",
  value=DEFAULT_SEED,
  precision=0,
-
+ info="Set seed for reproducible results (0 = random). Use same seed to recreate results."
  )
  with gr.Row():
  image_mode_input = gr.Radio(
  label="🖼️ Scene Generation Mode",
  choices=IMAGE_MODES,
  value=DEFAULT_IMAGE_MODE,
-
+ info="Independent: each scene is unique. Consistent: scenes influence each other for style continuity."
  )
  strength_slider = gr.Slider(
  label="🎯 Style Consistency Strength",
@@ -428,7 +426,7 @@ with gr.Blocks(title="Audio → Kinetic-Subtitle Music Video", theme=gr.themes.S
  step=0.05,
  value=0.5,
  visible=False,
-
+ info="How much each scene influences the next (lower = more influence, higher = more variety)."
  )
  crossfade_slider = gr.Slider(
  label="🔄 Scene Transition Duration",
@@ -436,7 +434,7 @@ with gr.Blocks(title="Audio → Kinetic-Subtitle Music Video", theme=gr.themes.S
  maximum=1.0,
  step=0.05,
  value=DEFAULT_CROSSFADE,
-
+ info="Smooth crossfade between scenes in seconds (0 = hard cuts, 0.25 = subtle blend)."
  )

  # Quick preset handling
@@ -711,4 +709,4 @@ with gr.Blocks(title="Audio → Kinetic-Subtitle Music Video", theme=gr.themes.S
  if __name__ == "__main__":
  # Uncomment for custom hosting options
  # demo.launch(server_name='0.0.0.0', server_port=7860)
- demo.launch(server_name="0.0.0.0", server_port=7860, share=False)
+ demo.launch()
requirements.txt CHANGED
@@ -5,7 +5,7 @@ accelerate>=0.30
  diffusers>=0.34
  torchaudio
  openai-whisper
- pyannote.audio==3.2.0
+ pyannote.audio==3.2.1
  pycaps @ git+https://github.com/francozanardi/pycaps.git
  ffmpeg-python
  auto-gptq==0.7.1
utils/prompt_gen.py CHANGED
@@ -6,6 +6,7 @@ try:
  except ImportError:
  AutoGPTQForCausalLM = None
  from transformers import AutoModelForCausalLM
+ import spaces

  # Cache models and tokenizers
  _llm_cache = {} # {model_name: (model, tokenizer)}
@@ -51,6 +52,7 @@ def _load_llm(model_name):

  return _llm_cache[model_name]

+ @spaces.GPU
  def generate_scene_prompts(
  segments,
  llm_model="TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ",
utils/transcribe.py CHANGED
@@ -1,4 +1,5 @@
  import whisper
+ import spaces

  # Cache loaded whisper models to avoid reloading for each request
  _model_cache = {}
@@ -7,6 +8,7 @@ def list_available_whisper_models():
  """Return list of available Whisper models"""
  return ["tiny", "base", "small", "medium", "medium.en", "large", "large-v2"]

+ @spaces.GPU
  def transcribe_audio(audio_path: str, model_size: str = "medium.en"):
  """
  Transcribe the given audio file using OpenAI Whisper and return the result dictionary.
utils/video_gen.py CHANGED
@@ -11,6 +11,7 @@ from diffusers import (
  from PIL import Image
  import numpy as np
  import time
+ import spaces

  # Global pipelines cache
  _model_cache = {}
@@ -96,6 +97,7 @@ def _load_video_pipeline(model_name):

  return _model_cache[model_name]

+ @spaces.GPU
  def preview_image_generation(prompt, image_model="stabilityai/stable-diffusion-xl-base-1.0", width=1024, height=576, seed=None):
  """
  Generate a preview image from a prompt
@@ -125,6 +127,7 @@ def preview_image_generation(prompt, image_model="stabilityai/stable-diffusion-x

  return image

+ @spaces.GPU
  def create_video_segments(
  segments,
  scene_prompts,
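
The `@spaces.GPU` decorator applied in the utils modules above comes from the Hugging Face `spaces` package; on ZeroGPU Spaces it requests a GPU only for the duration of the decorated call. A minimal sketch of that pattern, where `run_model` and the import fallback are hypothetical stand-ins rather than code from this repository:

```python
# Sketch of the ZeroGPU decorator pattern; `run_model` is a hypothetical example.
import torch

try:
    import spaces  # provided on Hugging Face Spaces (ZeroGPU hardware)
except ImportError:
    class spaces:  # assumed no-op fallback so the module still imports locally
        @staticmethod
        def GPU(fn=None, **kwargs):
            return fn if fn is not None else (lambda f: f)


@spaces.GPU  # a GPU is attached only while this function executes
def run_model(prompt: str) -> str:
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # load or run a model on `device` here
    return f"{prompt} (ran on {device})"


if __name__ == "__main__":
    print(run_model("test"))
```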