CaptionFlow / app.py
Hasnain-Ali's picture
Update app.py
6ec0137 verified
import streamlit as st
import whisper
import os
import tempfile
import subprocess
import datetime
# Helper: Format seconds into ASS time format (H:MM:SS.cs)
def format_ass_time(seconds):
    """Format an offset in seconds as an ASS timestamp (``H:MM:SS.cs``).

    The value is rounded to the nearest centisecond *before* being split
    into fields. Truncating the fractional part instead loses a centisecond
    to binary floating-point error (``0.29 * 100`` is ``28.999...``), and
    rounding first also lets a value such as ``59.999`` carry cleanly into
    the minutes field.

    Args:
        seconds: Offset in seconds (int or float). Negative values are
            clamped to zero, since ASS timestamps cannot be negative.

    Returns:
        The timestamp string, with hours unpadded and minutes, seconds and
        centiseconds zero-padded to two digits.
    """
    # Work in whole centiseconds so every field comes from integer math.
    total_cs = max(int(round(float(seconds) * 100)), 0)
    total_seconds, cs = divmod(total_cs, 100)
    hours, leftover = divmod(total_seconds, 3600)
    minutes, secs = divmod(leftover, 60)
    return f"{hours}:{minutes:02d}:{secs:02d}.{cs:02d}"
# Load Whisper model (cached for performance)
@st.cache_resource
def load_model():
    """Load and return the Whisper ``"small"`` model.

    The function is decorated with ``st.cache_resource``, so the model
    object is meant to be created once and reused rather than reloaded on
    every call.
    """
    # Using "small" for free usage
    whisper_model = whisper.load_model("small")
    return whisper_model
# Transcribe video using Whisper and return segments
def transcribe_video(video_path):
    """Transcribe the media file at *video_path* and return its segments.

    Args:
        video_path: Path of the file handed to the model's ``transcribe``.

    Returns:
        The ``"segments"`` entry of the transcription result.
    """
    transcription = load_model().transcribe(video_path)
    return transcription["segments"]
# Create an ASS file with karaoke-style effects.
# Each segment is split into words; each word gets an ASS karaoke tag (\k)
# which gradually reveals it. The unrevealed text is made fully transparent.
def create_ass(segments, ass_path):
    """Write *segments* to *ass_path* as an ASS subtitle file with karaoke tags.

    Every non-empty segment becomes one ``Dialogue`` line. The segment text
    is split on whitespace and each word is prefixed with a ``{\\k<cs>}``
    tag. The segment's duration (rounded to centiseconds) is shared evenly
    between the words, and any remainder is handed out one centisecond at a
    time to the leading words so that the tag durations add up to the
    segment duration exactly.

    Args:
        segments: Iterable of mappings with ``"start"`` and ``"end"`` (in
            seconds) and an optional ``"text"`` entry. Segments whose text
            is missing or blank are skipped.
        ass_path: Destination path; the file is written as UTF-8 and
            overwritten if it already exists.
    """
    header = """[Script Info]
Title: AI Captioning
ScriptType: v4.00+
Collisions: Normal
PlayResX: 1280
PlayResY: 720
Timer: 100.0000
[V4+ Styles]
; Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
Style: Default,Arial,36,&H00FFFFFF,&HFF000000,&H00000000,&H64000000,0,0,0,0,100,100,0,0,1,2,0,2,10,10,30,1
[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
"""
    dialogue_lines = []
    for segment in segments:
        # split() with no arguments already ignores leading/trailing blanks.
        words = segment.get("text", "").split()
        if not words:
            continue
        start_time = format_ass_time(segment["start"])
        end_time = format_ass_time(segment["end"])
        # Segment duration in centiseconds; rounded (not truncated) so float
        # noise cannot shave a centisecond off, and never negative.
        total_cs = max(int(round((segment["end"] - segment["start"]) * 100)), 0)
        # Even share per word, plus one extra centisecond for the first
        # `extra_cs` words so nothing is lost to integer division.
        base_cs, extra_cs = divmod(total_cs, len(words))
        # Build karaoke text: each word preceded by its \k tag.
        ass_text = " ".join(
            r"{\k" + str(base_cs + (1 if index < extra_cs else 0)) + "}" + word
            for index, word in enumerate(words)
        )
        dialogue_lines.append(
            f"Dialogue: 0,{start_time},{end_time},Default,,0,0,0,,{ass_text}\n"
        )
    with open(ass_path, "w", encoding="utf-8") as f:
        f.write(header + "".join(dialogue_lines))
# Escape a value so it survives FFmpeg's filtergraph parsing unchanged.
def _escape_filter_value(value):
    """Escape *value* for use as a filter option value inside ``-vf``.

    FFmpeg parses a filtergraph string in two passes, so the value is
    backslash-escaped twice: first for the option-value level (``\\``,
    ``'`` and ``:``), then for the filtergraph level (``\\``, ``'``,
    ``[``, ``]``, ``,`` and ``;``). The backslash is handled first in each
    pass so that escapes added for the other characters are not doubled.
    """
    for special in ("\\", "'", ":"):
        value = value.replace(special, "\\" + special)
    for special in ("\\", "'", "[", "]", ",", ";"):
        value = value.replace(special, "\\" + special)
    return value


# Burn the ASS subtitles into the video using FFmpeg.
def burn_captions(video_path, ass_path, output_path):
    """Render the subtitles in *ass_path* onto *video_path* with FFmpeg.

    The video stream is re-encoded through the ``subtitles`` filter and the
    audio stream is copied as-is.

    Args:
        video_path: Input video file.
        ass_path: ASS subtitle file to burn in. The path is escaped, so
            characters that are special in a filtergraph (such as the
            ``:`` and ``\\`` of a Windows path) are safe.
        output_path: Destination file; overwritten if it already exists.

    Raises:
        subprocess.CalledProcessError: If FFmpeg exits with a non-zero status.
    """
    subtitles_filter = f"subtitles={_escape_filter_value(str(ass_path))}"
    command = [
        "ffmpeg",
        "-y",  # overwrite the output instead of prompting or aborting
        "-i", video_path,
        "-vf", subtitles_filter,
        "-c:a", "copy",
        output_path,
    ]
    subprocess.run(command, check=True)
# Streamlit UI
st.title("🎥 AI Video Captioning App with Karaoke-Style Captions")
uploaded_file = st.file_uploader("Upload a video file", type=["mp4", "mkv", "avi", "mov"])
if uploaded_file:
    # Save the uploaded video to a temporary file. getvalue() returns the
    # whole upload regardless of the stream position, whereas read() only
    # returns what is left after any earlier read.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as temp_video:
        temp_video.write(uploaded_file.getvalue())
        video_path = temp_video.name
    # NOTE(review): this temporary input is never removed because st.video
    # was handed its path - confirm when it is safe to delete.
    st.video(video_path)
    if st.button("Generate Captions & Burn into Video"):
        with st.spinner("Generating captions..."):
            segments = transcribe_video(video_path)
        # Derive sibling paths from the extension only; str.replace would
        # rewrite every ".mp4" occurrence anywhere in the path.
        base_path = os.path.splitext(video_path)[0]
        ass_path = f"{base_path}.ass"
        output_video_path = f"{base_path}_captioned.mp4"
        captioned_bytes = None
        try:
            # Create ASS subtitle file
            create_ass(segments, ass_path)
            with st.spinner("Burning captions into video..."):
                burn_captions(video_path, ass_path, output_video_path)
            # Load the result into memory so the file can be removed below.
            with open(output_video_path, "rb") as file:
                captioned_bytes = file.read()
        except subprocess.CalledProcessError as exc:
            st.error(f"FFmpeg failed while burning captions (exit code {exc.returncode}).")
        finally:
            # Intermediate artifacts are no longer needed once the bytes
            # are in memory (or the run has failed).
            for leftover in (ass_path, output_video_path):
                if os.path.exists(leftover):
                    os.remove(leftover)
        if captioned_bytes is not None:
            st.success("Processing complete! Download your video below.")
            st.download_button(
                "📥 Download Captioned Video",
                captioned_bytes,
                file_name="captioned_video.mp4",
                mime="video/mp4",
            )