import datetime
import os
import subprocess
import tempfile

import streamlit as st
import whisper


# Helper: Format seconds into ASS time format (H:MM:SS.cs)
def format_ass_time(seconds):
    """Return *seconds* formatted as an ASS timestamp (``H:MM:SS.cs``).

    Args:
        seconds: Offset from the start of the media, in seconds. Negative
            values are clamped to zero because the format below has no
            representation for them.

    Returns:
        A string such as ``"1:01:01.50"``: unpadded hours, two-digit minutes
        and seconds, and two-digit centiseconds (truncated, not rounded).

    >>> format_ass_time(0.57)
    '0:00:00.57'
    >>> format_ass_time(3661.5)
    '1:01:01.50'
    """
    td = datetime.timedelta(seconds=max(seconds, 0))
    # Work from timedelta's integer fields. Deriving the fraction with float
    # arithmetic loses a centisecond on inputs such as 0.57, because
    # 0.57 * 100 evaluates to 56.99999999999999.
    whole_seconds = td.days * 86400 + td.seconds
    hours, remainder = divmod(whole_seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    cs = td.microseconds // 10_000
    return f"{hours}:{minutes:02d}:{secs:02d}.{cs:02d}"


# Load Whisper model (cached for performance)
@st.cache_resource
def load_model():
    """Load and return the Whisper "small" model.

    Decorated with ``st.cache_resource`` so the loaded model is reused
    instead of being loaded again on each call.
    """
    return whisper.load_model("small")  # Using "small" for free usage


# Transcribe video using Whisper and return segments
def transcribe_video(video_path):
    """Transcribe the media file at *video_path* and return its segments.

    Args:
        video_path: Path of the media file handed to Whisper.

    Returns:
        The ``"segments"`` entry of Whisper's transcription result.
    """
    model = load_model()
    result = model.transcribe(video_path)
    return result["segments"]


# Create an ASS file with karaoke-style effects.
# Each segment is split into words; each word gets an ASS karaoke tag (\k)
# which gradually reveals it. The unrevealed text is made fully transparent.
def create_ass(segments, ass_path):
    """Write *segments* to *ass_path* as an ASS subtitle file with karaoke tags.

    Each segment becomes one ``Dialogue`` event. Its text is split on
    whitespace and every word is prefixed with a ``{\\k<cs>}`` tag, where
    ``<cs>`` is that word's share of the segment duration in centiseconds.
    The segment duration is divided evenly between the words; centiseconds
    left over by the division go to the first words, so the tags add up to
    the full segment duration whenever it is at least one centisecond per
    word. Every word gets at least 1 cs.

    Args:
        segments: Iterable of dicts with numeric ``"start"`` and ``"end"``
            values (seconds) and an optional ``"text"`` string. Segments
            whose text is missing, empty, or only whitespace are skipped.
        ass_path: Destination path; the file is written as UTF-8 and
            overwritten if it already exists.
    """
    # Static preamble. The Default style's SecondaryColour (&HFF000000) has
    # alpha FF, which is what keeps not-yet-reached karaoke words invisible.
    # NOTE(review): the styles "Format" line is commented out with ';' in the
    # original script and is kept that way here - confirm the target renderer
    # accepts a [V4+ Styles] section without it.
    header = "\n".join((
        "[Script Info]",
        "Title: AI Captioning",
        "ScriptType: v4.00+",
        "Collisions: Normal",
        "PlayResX: 1280",
        "PlayResY: 720",
        "Timer: 100.0000",
        "",
        "[V4+ Styles]",
        "; Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding",
        "Style: Default,Arial,36,&H00FFFFFF,&HFF000000,&H00000000,&H64000000,0,0,0,0,100,100,0,0,1,2,0,2,10,10,30,1",
        "",
        "[Events]",
        "Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text",
        "",  # trailing newline so the first Dialogue starts on its own line
    ))

    events = []
    for segment in segments:
        words = segment.get("text", "").split()
        if not words:
            continue

        start_time = format_ass_time(segment["start"])
        end_time = format_ass_time(segment["end"])

        # round() rather than int(): float error would otherwise shave a
        # centisecond off durations such as 2.3 - 1.0.
        total_cs = max(round((segment["end"] - segment["start"]) * 100), 0)
        base_cs, extra_cs = divmod(total_cs, len(words))

        # Build karaoke text: each word preceded by its \k tag. The first
        # `extra_cs` words take one extra centisecond each.
        ass_text = " ".join(
            r"{\k" + str(max(base_cs + (1 if index < extra_cs else 0), 1)) + "}" + word
            for index, word in enumerate(words)
        )
        events.append(
            f"Dialogue: 0,{start_time},{end_time},Default,,0,0,0,,{ass_text}\n"
        )

    with open(ass_path, "w", encoding="utf-8") as f:
        f.write(header + "".join(events))


# Burn the ASS subtitles into the video using FFmpeg.
def burn_captions(video_path, ass_path, output_path):
    """Render the subtitles in *ass_path* onto *video_path* using FFmpeg.

    The video stream is re-encoded through the ``subtitles`` filter and the
    audio stream is copied unchanged.

    Args:
        video_path: Path of the input video.
        ass_path: Path of the ASS subtitle file to burn in.
        output_path: Path of the file to write; overwritten if it exists.

    Raises:
        subprocess.CalledProcessError: FFmpeg exited with a non-zero status.
        FileNotFoundError: The ``ffmpeg`` executable could not be found.
    """
    command = [
        "ffmpeg",
        "-y",  # never block on the interactive "overwrite?" prompt
        "-i", video_path,
        # Name the option explicitly and escape the path so characters that
        # are special in a filtergraph (':', '\\', "'", ',', ...) survive.
        "-vf", f"subtitles=filename={_escape_filter_value(ass_path)}",
        "-c:a", "copy",
        output_path,
    ]
    subprocess.run(command, check=True)


def _escape_filter_value(value):
    """Escape *value* for use as an option value in an FFmpeg filtergraph."""
    # Level 1: escaping within a filter option value.
    for char in ("\\", "'", ":"):
        value = value.replace(char, "\\" + char)
    # Level 2: escaping within the filtergraph description as a whole.
    for char in ("\\", "'", "[", "]", ",", ";"):
        value = value.replace(char, "\\" + char)
    return value


# Streamlit UI
st.title("🎥 AI Video Captioning App with Karaoke-Style Captions")

uploaded_file = st.file_uploader("Upload a video file", type=["mp4", "mkv", "avi", "mov"])

if uploaded_file:
    # Preview straight from the uploaded bytes so that no temporary file is
    # created on script runs that only redraw the page.
    video_bytes = uploaded_file.getvalue()
    st.video(video_bytes)

    if st.button("Generate Captions & Burn into Video"):
        # Keep the upload's own extension for the working copy.
        suffix = os.path.splitext(uploaded_file.name)[1] or ".mp4"
        captioned_bytes = None

        # Every intermediate file lives in one directory that is removed when
        # the block exits, whether processing succeeded or not.
        with tempfile.TemporaryDirectory() as work_dir:
            video_path = os.path.join(work_dir, f"input{suffix}")
            ass_path = os.path.join(work_dir, "captions.ass")
            output_video_path = os.path.join(work_dir, "captioned.mp4")

            with open(video_path, "wb") as temp_video:
                temp_video.write(video_bytes)

            with st.spinner("Generating captions..."):
                segments = transcribe_video(video_path)
                # Create ASS subtitle file
                create_ass(segments, ass_path)

            try:
                with st.spinner("Burning captions into video..."):
                    burn_captions(video_path, ass_path, output_video_path)
            except FileNotFoundError:
                st.error("FFmpeg was not found. Install FFmpeg and make sure it is on your PATH.")
            except subprocess.CalledProcessError as err:
                st.error(f"FFmpeg failed to burn the captions (exit code {err.returncode}).")
            else:
                # Load the result before the working directory is deleted.
                with open(output_video_path, "rb") as file:
                    captioned_bytes = file.read()

        if captioned_bytes is not None:
            st.success("Processing complete! Download your video below.")
            st.download_button(
                "📥 Download Captioned Video",
                captioned_bytes,
                file_name="captioned_video.mp4",
                mime="video/mp4",
            )