#!/usr/bin/env python3
import os
import shutil
import uuid
import json
import gradio as gr
import torch
from PIL import Image
import time

# Import pipeline modules
from utils.transcribe import transcribe_audio, list_available_whisper_models
from utils.segment import segment_lyrics
from utils.prompt_gen import generate_scene_prompts, list_available_llm_models
from utils.video_gen import (
    create_video_segments,
    list_available_image_models,
    list_available_video_models,
    preview_image_generation
)
from utils.glue import stitch_and_caption

# Create output directories if not existing
os.makedirs("templates", exist_ok=True)
os.makedirs("templates/minimalist", exist_ok=True)
os.makedirs("tmp", exist_ok=True)

# Load available model options
WHISPER_MODELS = list_available_whisper_models()
DEFAULT_WHISPER_MODEL = "medium.en"
LLM_MODELS = list_available_llm_models()
DEFAULT_LLM_MODEL = "TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ"
IMAGE_MODELS = list_available_image_models()
DEFAULT_IMAGE_MODEL = "stabilityai/stable-diffusion-xl-base-1.0"
VIDEO_MODELS = list_available_video_models()
DEFAULT_VIDEO_MODEL = "stabilityai/stable-video-diffusion-img2vid-xt"

# Default prompt template
DEFAULT_PROMPT_TEMPLATE = """You are a cinematographer generating a scene for a music video.
Describe one vivid visual scene ({max_words} words max) that matches the mood and imagery of these lyrics.
Focus on setting, atmosphere, lighting, and framing. Do not mention the artist or singing.
Use only {max_sentences} sentence(s).

Lyrics: "{lyrics}"

Scene description:"""

# Prepare style template options by scanning templates/ directory
TEMPLATE_DIR = "templates"
template_choices = []
for name in os.listdir(TEMPLATE_DIR):
    if os.path.isdir(os.path.join(TEMPLATE_DIR, name)):
        template_choices.append(name)
template_choices = sorted(template_choices)
DEFAULT_TEMPLATE = "minimalist" if "minimalist" in template_choices else (template_choices[0] if template_choices else None)

# Advanced settings defaults
DEFAULT_RESOLUTION = "1024x576"   # default resolution
DEFAULT_FPS_MODE = "Auto"         # auto-match lyric timing
DEFAULT_SEED = 0                  # 0 means random seed
DEFAULT_MAX_WORDS = 30            # default word limit for scene descriptions
DEFAULT_MAX_SENTENCES = 1         # default sentence limit
DEFAULT_CROSSFADE = 0.25          # default crossfade duration
DEFAULT_STYLE_SUFFIX = "cinematic, 35 mm, shallow depth of field, film grain"

# Mode for image generation
IMAGE_MODES = ["Independent", "Consistent (Img2Img)"]
DEFAULT_IMAGE_MODE = "Independent"


def process_audio(
    audio_path, whisper_model, llm_model, image_model, video_model,
    template_name, resolution, fps_mode, seed, prompt_template,
    max_words, max_sentences, style_suffix, image_mode, strength,
    crossfade_duration
):
    """
    End-to-end processing function to generate the music video with kinetic subtitles.
    Returns final video file path for preview and download.
    """
    # Default progress function just prints to console
    def progress(percent, desc=""):
        print(f"Progress: {percent}% - {desc}")

    # Input validation
    if not audio_path or not os.path.exists(audio_path):
        raise ValueError("Please provide a valid audio file")
    if not template_name or template_name not in template_choices:
        template_name = DEFAULT_TEMPLATE or "minimalist"

    # Prepare a unique temp directory for this run (to avoid conflicts between parallel jobs)
    session_id = str(uuid.uuid4())[:8]
    work_dir = os.path.join("tmp", f"run_{session_id}")
    os.makedirs(work_dir, exist_ok=True)

    # Save parameter settings for debugging
    params = {
        "whisper_model": whisper_model,
        "llm_model": llm_model,
        "image_model": image_model,
        "video_model": video_model,
        "template": template_name,
        "resolution": resolution,
        "fps_mode": fps_mode,
        "seed": seed,
        "max_words": max_words,
        "max_sentences": max_sentences,
        "style_suffix": style_suffix,
        "image_mode": image_mode,
        "strength": strength,
        "crossfade_duration": crossfade_duration
    }
    with open(os.path.join(work_dir, "params.json"), "w") as f:
        json.dump(params, f, indent=2)

    try:
        # 1. Transcription
        progress(0, desc="Transcribing audio with Whisper...")
        try:
            result = transcribe_audio(audio_path, whisper_model)
            if not result or 'segments' not in result:
                raise ValueError("Transcription failed - no speech detected")
        except Exception as e:
            raise RuntimeError(f"Audio transcription failed: {str(e)}")

        progress(15, desc="Transcription completed. Segmenting lyrics...")

        # 2. Segmentation
        try:
            segments = segment_lyrics(result)
            if not segments:
                raise ValueError("No valid segments found in transcription")
        except Exception as e:
            raise RuntimeError(f"Audio segmentation failed: {str(e)}")

        progress(25, desc=f"Detected {len(segments)} lyric segments. Generating scene prompts...")

        # 3. Scene-prompt generation
        try:
            # Format the prompt template with the limits
            formatted_prompt_template = prompt_template.format(
                max_words=max_words,
                max_sentences=max_sentences,
                lyrics="{lyrics}"  # This placeholder will be filled for each segment
            )
            prompts = generate_scene_prompts(
                segments,
                llm_model=llm_model,
                prompt_template=formatted_prompt_template,
                style_suffix=style_suffix
            )
            if len(prompts) != len(segments):
                raise ValueError(f"Prompt generation mismatch: {len(prompts)} prompts for {len(segments)} segments")
        except Exception as e:
            raise RuntimeError(f"Scene prompt generation failed: {str(e)}")

        # Save generated prompts for display or debugging
        with open(os.path.join(work_dir, "prompts.txt"), "w", encoding="utf-8") as f:
            for i, p in enumerate(prompts):
                f.write(f"Segment {i+1}: {p}\n")

        progress(35, desc="Scene prompts ready. Generating video segments...")

        # Parse resolution with validation
        try:
            if resolution and "x" in resolution.lower():
                width, height = map(int, resolution.lower().split("x"))
                if width <= 0 or height <= 0:
                    raise ValueError("Invalid resolution values")
            else:
                width, height = 1024, 576  # default high resolution
        except (ValueError, TypeError) as e:
            print(f"Warning: Invalid resolution '{resolution}', using default 1024x576")
            width, height = 1024, 576

        # Determine FPS handling
        fps_value = None
        dynamic_fps = True
        if fps_mode and fps_mode.lower() != "auto":
            try:
                fps_value = float(fps_mode)
                if fps_value <= 0:
                    raise ValueError("FPS must be positive")
                dynamic_fps = False
            except (ValueError, TypeError):
                print(f"Warning: Invalid FPS '{fps_mode}', using auto mode")
                fps_value = None
                dynamic_fps = True
        # 4. Image→video generation for each segment
        try:
            segment_videos = create_video_segments(
                segments,
                prompts,
                image_model=image_model,
                video_model=video_model,
                width=width,
                height=height,
                dynamic_fps=dynamic_fps,
                base_fps=fps_value,
                seed=seed,
                work_dir=work_dir,
                image_mode=image_mode,
                strength=strength,
                progress_callback=None
            )
            if not segment_videos:
                raise ValueError("No video segments were generated")
        except Exception as e:
            raise RuntimeError(f"Video generation failed: {str(e)}")

        progress(80, desc="Video segments generated. Stitching and adding subtitles...")

        # 5. Concatenation & audio syncing, plus kinetic subtitles overlay
        try:
            final_video_path = stitch_and_caption(
                segment_videos,
                audio_path,
                segments,
                template_name,
                work_dir=work_dir,
                crossfade_duration=crossfade_duration
            )
            if not final_video_path or not os.path.exists(final_video_path):
                raise ValueError("Final video file was not created")
        except Exception as e:
            raise RuntimeError(f"Video stitching and captioning failed: {str(e)}")

        progress(100, desc="✅ Generation complete!")
        return final_video_path, work_dir

    except Exception as e:
        # Enhanced error reporting
        error_msg = str(e)
        if "CUDA" in error_msg or "GPU" in error_msg:
            error_msg += "\n\n💡 Tip: This application requires a CUDA-compatible GPU with sufficient VRAM."
        elif "model" in error_msg.lower():
            error_msg += "\n\n💡 Tip: Model loading failed. Check your internet connection and try again."
        elif "audio" in error_msg.lower():
            error_msg += "\n\n💡 Tip: Please ensure your audio file is in a supported format (MP3, WAV, M4A)."
        print(f"Error during processing: {error_msg}")
        raise RuntimeError(error_msg)
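
# Illustrative, non-executed sketch of driving the pipeline without the UI.
# The file name below is a placeholder and the strength value (0.5) is only an
# example; the argument order follows the process_audio signature above:
#
#   final_path, work_dir = process_audio(
#       "song.mp3", DEFAULT_WHISPER_MODEL, DEFAULT_LLM_MODEL, DEFAULT_IMAGE_MODEL,
#       DEFAULT_VIDEO_MODEL, DEFAULT_TEMPLATE, DEFAULT_RESOLUTION, DEFAULT_FPS_MODE,
#       DEFAULT_SEED, DEFAULT_PROMPT_TEMPLATE, DEFAULT_MAX_WORDS, DEFAULT_MAX_SENTENCES,
#       DEFAULT_STYLE_SUFFIX, DEFAULT_IMAGE_MODE, 0.5, DEFAULT_CROSSFADE)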

# Define Gradio UI components
with gr.Blocks(title="Audio → Kinetic-Subtitle Music Video", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🎵 Audio → Kinetic-Subtitle Music Video

    Transform your audio tracks into dynamic music videos with AI-generated scenes and animated subtitles.

    **✨ Features:**
    - 🎤 **Whisper Transcription** - Accurate speech-to-text with word-level timing
    - 🧠 **AI Scene Generation** - LLM-powered visual descriptions from lyrics
    - 🎨 **Image & Video AI** - Stable Diffusion + Video Diffusion models
    - 🎬 **Kinetic Subtitles** - Animated text synchronized with audio
    - ⚡ **Fully Local** - No API keys required, runs on your GPU

    **📋 Quick Start:**
    1. Upload an audio file (MP3, WAV, M4A)
    2. Choose your AI models (or keep defaults)
    3. Customize style and settings
    4. Click "Generate Music Video"
    """)

    # System requirements info
    with gr.Accordion("💻 System Requirements & Tips", open=False):
        gr.Markdown("""
        **Hardware Requirements:**
        - NVIDIA GPU with 8GB+ VRAM (recommended: RTX 3080/4070 or better)
        - 16GB+ system RAM
        - Fast storage (SSD recommended)

        **Supported Audio Formats:**
        - MP3, WAV, M4A, FLAC, OGG
        - Recommended: Clear vocals, 30 seconds to 3 minutes

        **Performance Tips:**
        - Use lower resolution (512x288) for faster generation
        - Choose smaller models for quicker processing
        - Ensure stable power supply for GPU-intensive tasks
        """)

    # Main configuration
    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(
                label="🎵 Upload Audio Track",
                type="filepath",
                info="Upload your music file. For best results, use clear audio with distinct vocals."
            )
        with gr.Column():
            # Quick settings panel
            gr.Markdown("### ⚡ Quick Settings")
            quick_quality = gr.Radio(
                choices=["Fast (512x288)", "Balanced (1024x576)", "High Quality (1280x720)"],
                value="Balanced (1024x576)",
                label="Quality Preset",
                info="Higher quality = better results but slower generation"
            )

    # Model selection tabs
    with gr.Tabs():
        with gr.TabItem("🤖 AI Models"):
            gr.Markdown("**Choose the AI models for each processing step:**")
            with gr.Row():
                with gr.Column():
                    whisper_dropdown = gr.Dropdown(
                        label="🎤 Transcription Model (Whisper)",
                        choices=WHISPER_MODELS,
                        value=DEFAULT_WHISPER_MODEL,
                        info="Larger models are more accurate but slower. 'medium.en' is recommended for English."
                    )
                    llm_dropdown = gr.Dropdown(
                        label="🧠 Scene Description Model (LLM)",
                        choices=LLM_MODELS,
                        value=DEFAULT_LLM_MODEL,
                        info="Language model to generate visual scene descriptions from lyrics."
                    )
                with gr.Column():
                    image_dropdown = gr.Dropdown(
                        label="🎨 Image Generation Model",
                        choices=IMAGE_MODELS,
                        value=DEFAULT_IMAGE_MODEL,
                        info="Stable Diffusion model for generating scene images."
                    )
                    video_dropdown = gr.Dropdown(
                        label="🎬 Video Animation Model",
                        choices=VIDEO_MODELS,
                        value=DEFAULT_VIDEO_MODEL,
                        info="Model to animate still images into video clips."
                    )

        with gr.TabItem("✍️ Scene Prompting"):
            gr.Markdown("**Customize how AI generates scene descriptions:**")
            with gr.Column():
                prompt_template_input = gr.Textbox(
                    label="LLM Prompt Template",
                    value=DEFAULT_PROMPT_TEMPLATE,
                    lines=6,
                    info="Template for generating scene descriptions. Use {lyrics}, {max_words}, and {max_sentences} as placeholders."
                )
                with gr.Row():
                    max_words_input = gr.Slider(
                        label="Max Words per Scene",
                        minimum=10, maximum=100, step=5,
                        value=DEFAULT_MAX_WORDS,
                        info="Limit words in each scene description (more words = more detailed scenes)."
                    )
                    max_sentences_input = gr.Slider(
                        label="Max Sentences per Scene",
                        minimum=1, maximum=5, step=1,
                        value=DEFAULT_MAX_SENTENCES,
                        info="Limit sentences per scene (1-2 recommended for music videos)."
                    )
                style_suffix_input = gr.Textbox(
                    label="Visual Style Keywords",
                    value=DEFAULT_STYLE_SUFFIX,
                    info="Style keywords added to all scenes for consistent visual style (e.g., 'cinematic, vibrant colors')."
                )

        with gr.TabItem("🎬 Video Settings"):
            gr.Markdown("**Configure video output and subtitle styling:**")
            with gr.Column():
                with gr.Row():
                    template_dropdown = gr.Dropdown(
                        label="🎪 Subtitle Animation Style",
                        choices=template_choices,
                        value=DEFAULT_TEMPLATE,
                        info="Choose the kinetic subtitle animation style."
                    )
                    res_dropdown = gr.Dropdown(
                        label="📺 Video Resolution",
                        choices=["512x288", "1024x576", "1280x720"],
                        value=DEFAULT_RESOLUTION,
                        info="Higher resolution = better quality but much slower generation."
                    )
                with gr.Row():
                    fps_input = gr.Textbox(
                        label="🎞️ Video FPS",
                        value=DEFAULT_FPS_MODE,
                        info="Frames per second. Use 'Auto' to match lyric timing, or set fixed value (e.g., '24', '30')."
                    )
                    seed_input = gr.Number(
                        label="🌱 Random Seed",
                        value=DEFAULT_SEED,
                        precision=0,
                        info="Set seed for reproducible results (0 = random). Use same seed to recreate results."
                    )
                with gr.Row():
                    image_mode_input = gr.Radio(
                        label="🖼️ Scene Generation Mode",
                        choices=IMAGE_MODES,
                        value=DEFAULT_IMAGE_MODE,
                        info="Independent: each scene is unique. Consistent: scenes influence each other for style continuity."
                    )
                    strength_slider = gr.Slider(
                        label="🎯 Style Consistency Strength",
                        minimum=0.1, maximum=0.9, step=0.05,
                        value=0.5,
                        visible=False,
                        info="How much each scene influences the next (lower = more influence, higher = more variety)."
                    )
                crossfade_slider = gr.Slider(
                    label="🔄 Scene Transition Duration",
                    minimum=0.0, maximum=1.0, step=0.05,
                    value=DEFAULT_CROSSFADE,
                    info="Smooth crossfade between scenes in seconds (0 = hard cuts, 0.25 = subtle blend)."
                )

    # Quick preset handling
    def apply_quality_preset(preset):
        if preset == "Fast (512x288)":
            return gr.update(value="512x288"), gr.update(value="tiny"), gr.update(value="stabilityai/sdxl-turbo")
        elif preset == "High Quality (1280x720)":
            return gr.update(value="1280x720"), gr.update(value="large"), gr.update(value="stabilityai/stable-diffusion-xl-base-1.0")
        else:  # Balanced
            return gr.update(value="1024x576"), gr.update(value="medium.en"), gr.update(value="stabilityai/stable-diffusion-xl-base-1.0")

    quick_quality.change(
        apply_quality_preset,
        inputs=[quick_quality],
        outputs=[res_dropdown, whisper_dropdown, image_dropdown]
    )

    # Make strength slider visible only when Consistent mode is selected
    def update_strength_visibility(mode):
        return gr.update(visible=(mode == "Consistent (Img2Img)"))

    image_mode_input.change(update_strength_visibility, inputs=image_mode_input, outputs=strength_slider)

    # Enhanced preview section
    with gr.Row():
        with gr.Column(scale=1):
            preview_btn = gr.Button("🔍 Preview First Scene", variant="secondary", size="lg")
            gr.Markdown("*Generate a quick preview of the first scene to test your settings*")
        with gr.Column(scale=2):
            generate_btn = gr.Button("🎬 Generate Complete Music Video", variant="primary", size="lg")
            gr.Markdown("*Start the full video generation process (this may take several minutes)*")

    # Preview results
    with gr.Row(visible=False) as preview_row:
        with gr.Column():
            preview_img = gr.Image(label="Preview Scene", type="pil", height=300)
        with gr.Column():
            preview_prompt = gr.Textbox(label="Generated Scene Description", lines=3)
            preview_info = gr.Markdown("")

    # Progress and status
    progress_bar = gr.Progress()
    status_text = gr.Textbox(
        label="📊 Generation Status",
        value="Ready to start...",
        interactive=False,
        lines=2
    )

    # Results section with better organization
    with gr.Tabs():
        with gr.TabItem("🎥 Final Video"):
            output_video = gr.Video(label="Generated Music Video", format="mp4", height=400)
            with gr.Row():
                download_file = gr.File(label="📥 Download Video File", file_count="single")
                video_info = gr.Textbox(label="Video Information", lines=2, interactive=False)

        with gr.TabItem("🖼️ Generated Images"):
            image_gallery = gr.Gallery(
                label="Scene Images from Video Generation",
                columns=3, rows=2, height="auto",
                object_fit="contain",
                show_label=True
            )
            gallery_info = gr.Markdown("*Scene images will appear here after generation*")

        with gr.TabItem("📝 Scene Descriptions"):
            with gr.Accordion("Generated Scene Prompts", open=True):
                prompt_text = gr.Markdown("", elem_id="prompt_markdown")
            segment_info = gr.Textbox(
                label="Segmentation Summary",
                lines=3,
                interactive=False,
                placeholder="Segment analysis will appear here..."
            )
    # Preview function
    def on_preview(
        audio, whisper_model, llm_model, image_model,
        prompt_template, max_words, max_sentences, style_suffix, resolution
    ):
        if not audio:
            return (gr.update(visible=False), None, "Please upload audio first",
                    "⚠️ **No audio file provided**\n\nPlease upload an audio file to generate a preview.")

        # Quick transcription and segmentation of first few seconds
        try:
            # Extract first 10 seconds of audio for quick preview
            import subprocess
            import tempfile

            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
                temp_audio_path = temp_audio.name

            # Use ffmpeg to extract first 10 seconds
            subprocess.run([
                "ffmpeg", "-y", "-i", audio,
                "-ss", "0", "-t", "10",
                "-acodec", "pcm_s16le",
                temp_audio_path
            ], check=True, capture_output=True)

            # Transcribe with fastest model for preview
            result = transcribe_audio(temp_audio_path, "tiny")
            segments = segment_lyrics(result)
            os.unlink(temp_audio_path)

            if not segments:
                return (gr.update(visible=False), None, "No speech detected in first 10 seconds",
                        "⚠️ **No speech detected**\n\nTry with audio that has clear vocals at the beginning.")

            first_segment = segments[0]

            # Format prompt template (the first segment's lyrics are substituted
            # directly here, since only a single preview scene is generated)
            formatted_prompt = prompt_template.format(
                max_words=max_words,
                max_sentences=max_sentences,
                lyrics=first_segment["text"]
            )

            # Generate prompt
            scene_prompt = generate_scene_prompts(
                [first_segment],
                llm_model=llm_model,
                prompt_template=formatted_prompt,
                style_suffix=style_suffix
            )[0]

            # Generate image
            if resolution and "x" in resolution.lower():
                width, height = map(int, resolution.lower().split("x"))
            else:
                width, height = 1024, 576

            image = preview_image_generation(
                scene_prompt,
                image_model=image_model,
                width=width,
                height=height
            )

            # Create info text
            duration = first_segment['end'] - first_segment['start']
            info_text = f"""
✅ **Preview Generated Successfully**

**Detected Lyrics:** "{first_segment['text'][:100]}{'...' if len(first_segment['text']) > 100 else ''}"

**Scene Duration:** {duration:.1f} seconds

**Generated Description:** {scene_prompt[:150]}{'...' if len(scene_prompt) > 150 else ''}

**Image Resolution:** {width}x{height}
"""

            return gr.update(visible=True), image, scene_prompt, info_text

        except subprocess.CalledProcessError:
            return (gr.update(visible=False), None, "Audio processing failed",
                    "❌ **Audio Processing Error**\n\nFFmpeg failed to process the audio file. Please check the format.")
        except Exception as e:
            print(f"Preview error: {e}")
            return (gr.update(visible=False), None, f"Preview failed: {str(e)}",
                    f"❌ **Preview Error**\n\n{str(e)}\n\nPlease check your audio file and model settings.")
    # Bind button click to processing function
    def on_generate(
        audio, whisper_model, llm_model, image_model, video_model,
        template_name, resolution, fps, seed, prompt_template,
        max_words, max_sentences, style_suffix, image_mode, strength,
        crossfade_duration
    ):
        if not audio:
            return (None, None,
                    gr.update(value="**No audio file provided**\n\nPlease upload an audio file to start generation.", visible=True),
                    [], "Ready to start...", "", "")

        try:
            # Start generation
            start_time = time.time()
            final_path, work_dir = process_audio(
                audio, whisper_model, llm_model, image_model, video_model,
                template_name, resolution, fps, int(seed), prompt_template,
                max_words, max_sentences, style_suffix, image_mode, strength,
                crossfade_duration
            )
            generation_time = time.time() - start_time
            # Compute the output size up front so it is still available for the
            # video info string even if the ffprobe summary below fails
            file_size = os.path.getsize(final_path) / (1024 * 1024)  # MB

            # Load prompts from file to display
            prompts_file = os.path.join(work_dir, "prompts.txt")
            prompts_markdown = ""
            try:
                with open(prompts_file, 'r', encoding='utf-8') as pf:
                    content = pf.read()
                # Format prompts as numbered list
                prompts_lines = content.strip().splitlines()
                prompts_markdown = "\n".join([f"**{line}**" for line in prompts_lines])
            except Exception:
                prompts_markdown = "Scene prompts not available"

            # Load segment information
            segment_summary = ""
            try:
                # Get audio duration and file info
                import subprocess
                duration_cmd = ["ffprobe", "-v", "error", "-show_entries", "format=duration",
                                "-of", "default=noprint_wrappers=1:nokey=1", audio]
                audio_duration = float(subprocess.check_output(duration_cmd, text=True).strip())
                segment_summary = f"""📊 **Generation Summary:**
• Audio Duration: {audio_duration:.1f} seconds
• Processing Time: {generation_time/60:.1f} minutes
• Final Video Size: {file_size:.1f} MB
• Resolution: {resolution}
• Template: {template_name}"""
            except Exception:
                segment_summary = f"Generation completed in {generation_time/60:.1f} minutes"

            # Load generated images for the gallery
            images = []
            try:
                import glob
                image_files = glob.glob(os.path.join(work_dir, "*_img.png"))
                for img_file in sorted(image_files):
                    try:
                        img = Image.open(img_file)
                        images.append(img)
                    except Exception:
                        pass
            except Exception as e:
                print(f"Error loading images for gallery: {e}")

            # Create video info (local name avoids shadowing the video_info Textbox component)
            video_info_text = f"✅ Video generated successfully!\nFile: {os.path.basename(final_path)}\nSize: {file_size:.1f} MB"
            gallery_info_text = f"**{len(images)} scene images generated**" if images else "No images available"

            return (final_path, final_path,
                    gr.update(value=prompts_markdown, visible=True),
                    images,
                    f"✅ Generation complete! ({generation_time/60:.1f} minutes)",
                    video_info_text, segment_summary)
        except Exception as e:
            error_msg = str(e)
            print(f"Generation error: {e}")
            import traceback
            traceback.print_exc()
            return (None, None,
                    gr.update(value=f"**❌ Generation Failed**\n\n{error_msg}", visible=True),
                    [], f"❌ Error: {error_msg}", "", "")

    preview_btn.click(
        on_preview,
        inputs=[
            audio_input, whisper_dropdown, llm_dropdown, image_dropdown,
            prompt_template_input, max_words_input, max_sentences_input,
            style_suffix_input, res_dropdown
        ],
        outputs=[preview_row, preview_img, preview_prompt, preview_info]
    )

    generate_btn.click(
        on_generate,
        inputs=[
            audio_input, whisper_dropdown, llm_dropdown, image_dropdown, video_dropdown,
            template_dropdown, res_dropdown, fps_input, seed_input, prompt_template_input,
            max_words_input, max_sentences_input, style_suffix_input, image_mode_input,
            strength_slider, crossfade_slider
        ],
        outputs=[output_video, download_file, prompt_text, image_gallery, status_text, video_info, segment_info]
    )


if __name__ == "__main__":
    # Uncomment for custom hosting options
    # demo.launch(server_name='0.0.0.0', server_port=7860)
    demo.launch()
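
# Optional, untested sketch: because a full generation can take several minutes,
# it may help to enable Gradio's request queue before launching. The max_size
# value below is only illustrative:
#
#   demo.queue(max_size=4)
#   demo.launch()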