Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -3,49 +3,90 @@ import whisper
|
|
3 |
import os
|
4 |
import tempfile
|
5 |
import subprocess
|
6 |
-
import srt
|
7 |
import datetime
|
8 |
|
9 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
@st.cache_resource
|
11 |
def load_model():
|
12 |
-
return whisper.load_model("small") #
|
13 |
|
14 |
-
# Transcribe video using Whisper
|
15 |
def transcribe_video(video_path):
|
16 |
model = load_model()
|
17 |
result = model.transcribe(video_path)
|
18 |
return result["segments"]
|
19 |
|
20 |
-
# Create an
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
command = [
|
39 |
"ffmpeg",
|
40 |
"-i", video_path,
|
41 |
-
"-vf", f"subtitles={
|
42 |
"-c:a", "copy",
|
43 |
output_path
|
44 |
]
|
45 |
subprocess.run(command, check=True)
|
46 |
|
47 |
-
# Streamlit
|
48 |
-
st.title("🎥 AI Video Captioning App")
|
49 |
|
50 |
uploaded_file = st.file_uploader("Upload a video file", type=["mp4", "mkv", "avi", "mov"])
|
51 |
|
@@ -59,15 +100,16 @@ if uploaded_file:
|
|
59 |
|
60 |
if st.button("Generate Captions & Burn into Video"):
|
61 |
with st.spinner("Generating captions..."):
|
62 |
-
|
63 |
|
64 |
-
|
65 |
-
|
|
|
66 |
|
67 |
output_video_path = video_path.replace(".mp4", "_captioned.mp4")
|
68 |
|
69 |
with st.spinner("Burning captions into video..."):
|
70 |
-
burn_captions(video_path,
|
71 |
|
72 |
st.success("Processing complete! Download your video below.")
|
73 |
with open(output_video_path, "rb") as file:
|
|
|
3 |
import os
|
4 |
import tempfile
|
5 |
import subprocess
|
|
|
6 |
import datetime
|
7 |
|
8 |
+
# Helper: format seconds as an ASS timestamp (H:MM:SS.cs).
def format_ass_time(seconds):
    """Format a time offset in seconds as an ASS timestamp (H:MM:SS.cs).

    The value is rounded to the nearest centisecond *before* it is split
    into fields. Truncating the fractional part instead loses a centisecond
    to binary floating-point noise (0.29 * 100 == 28.999999999999996), and
    rounding up front also lets a fraction that rounds to a full second
    carry into the seconds/minutes/hours fields.

    Args:
        seconds: Offset in seconds (int or float). Negative values are
            clamped to zero.

    Returns:
        The timestamp string, e.g. ``"1:01:01.50"`` for 3661.5 seconds.
    """
    # Work in whole centiseconds from here on so every field is an exact int.
    total_cs = max(int(round(seconds * 100)), 0)
    total_secs, cs = divmod(total_cs, 100)
    total_minutes, secs = divmod(total_secs, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours}:{minutes:02d}:{secs:02d}.{cs:02d}"
|
18 |
+
|
19 |
+
# Whisper model loader (cached for performance).
@st.cache_resource
def load_model():
    """Return the Whisper "small" model obtained from ``whisper.load_model``.

    The function is wrapped in ``st.cache_resource``.
    """
    # Using "small" for free usage.
    model = whisper.load_model("small")
    return model
|
23 |
|
24 |
+
# Run Whisper on a video file and hand back its segments.
def transcribe_video(video_path):
    """Transcribe the file at *video_path* with the model from ``load_model``.

    Returns:
        The value stored under the ``"segments"`` key of the transcription
        result.
    """
    transcription = load_model().transcribe(video_path)
    return transcription["segments"]
|
29 |
|
30 |
+
# Fixed ASS preamble: script info, one "Default" style, and the event format line.
_ASS_HEADER = """[Script Info]
Title: AI Captioning
ScriptType: v4.00+
Collisions: Normal
PlayResX: 1280
PlayResY: 720
Timer: 100.0000

[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
Style: Default,Arial,36,&H00FFFFFF,&H000000FF,&H00000000,&H64000000,0,0,0,0,100,100,0,0,1,2,0,2,10,10,30,1

[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
"""


def _karaoke_durations(total_cs, word_count):
    """Split *total_cs* centiseconds across *word_count* words.

    The remainder of the integer division goes to the leading words, one
    centisecond each, so the durations add up to ``total_cs``. Every word
    still gets at least 1 cs, so a segment shorter than one centisecond per
    word yields a sum larger than ``total_cs``.
    """
    base, extra = divmod(max(total_cs, 0), word_count)
    return [max(base + (1 if index < extra else 0), 1) for index in range(word_count)]


# Create an ASS file with karaoke-style effects.
# Each segment is split into words; each word gets an ASS karaoke tag (\k)
# which reveals it gradually over the segment's duration.
def create_ass(segments, ass_path):
    """Write *segments* to *ass_path* as an ASS subtitle file with karaoke tags.

    Each segment becomes one ``Dialogue`` line in the ``Default`` style. Its
    text is split on whitespace and every word is prefixed with a ``{\\kN}``
    tag, where N is that word's share of the segment duration in
    centiseconds. Segments whose text is missing or blank are skipped.

    Args:
        segments: Iterable of dicts. ``"start"`` and ``"end"`` (seconds) are
            required; ``"text"`` is optional.
        ass_path: Destination path; the file is written as UTF-8 and
            overwritten if it exists.

    Raises:
        KeyError: If a segment with text lacks ``"start"`` or ``"end"``.
    """
    dialogue_lines = []
    for segment in segments:
        words = segment.get("text", "").split()
        if not words:
            continue

        start, end = segment["start"], segment["end"]
        # Round instead of truncating so float noise cannot drop a centisecond.
        total_cs = int(round((end - start) * 100))
        durations = _karaoke_durations(total_cs, len(words))

        # Build karaoke text: each word preceded by its \k tag.
        ass_text = " ".join(
            f"{{\\k{duration}}}{word}" for duration, word in zip(durations, words)
        )
        dialogue_lines.append(
            f"Dialogue: 0,{format_ass_time(start)},{format_ass_time(end)},"
            f"Default,,0,0,0,,{ass_text}\n"
        )

    with open(ass_path, "w", encoding="utf-8") as f:
        f.write(_ASS_HEADER + "".join(dialogue_lines))
|
76 |
+
|
77 |
+
def _escape_filter_value(value):
    """Escape *value* for use as an option value in an FFmpeg filtergraph.

    Applies FFmpeg's two escaping levels in order: first the option-value
    level (backslash before ``\\``, ``'`` and ``:``), then the filtergraph
    level (backslash before ``\\``, ``'``, ``[``, ``]``, ``,`` and ``;``).
    No shell-level escaping is added because the command runs without a shell.
    """
    option_level = "".join("\\" + ch if ch in "\\':" else ch for ch in str(value))
    return "".join("\\" + ch if ch in "\\'[],;" else ch for ch in option_level)


# Burn the ASS subtitles into the video using FFmpeg.
def burn_captions(video_path, ass_path, output_path):
    """Render the subtitles in *ass_path* onto *video_path* with FFmpeg.

    The video is re-encoded through the ``subtitles`` filter; the audio
    stream is copied unchanged.

    Args:
        video_path: Input video file.
        ass_path: Subtitle file to burn in. The path is escaped before being
            placed in the filtergraph, so characters such as ``:`` or ``,``
            are treated as part of the file name.
        output_path: Destination file; overwritten if it already exists.

    Raises:
        subprocess.CalledProcessError: If FFmpeg exits with a non-zero status.
    """
    command = [
        "ffmpeg",
        # Never wait on the interactive "overwrite?" prompt.
        "-y",
        "-i", video_path,
        # An explicit option name keeps a path containing "=" from being
        # parsed as a key=value pair.
        "-vf", f"subtitles=filename={_escape_filter_value(ass_path)}",
        "-c:a", "copy",
        output_path,
    ]
    subprocess.run(command, check=True)
|
87 |
|
88 |
+
# Streamlit UI
|
89 |
+
st.title("🎥 AI Video Captioning App with Karaoke-Style Captions")
|
90 |
|
91 |
uploaded_file = st.file_uploader("Upload a video file", type=["mp4", "mkv", "avi", "mov"])
|
92 |
|
|
|
100 |
|
101 |
if st.button("Generate Captions & Burn into Video"):
|
102 |
with st.spinner("Generating captions..."):
|
103 |
+
segments = transcribe_video(video_path)
|
104 |
|
105 |
+
# Create ASS subtitle file
|
106 |
+
ass_path = video_path.replace(".mp4", ".ass")
|
107 |
+
create_ass(segments, ass_path)
|
108 |
|
109 |
output_video_path = video_path.replace(".mp4", "_captioned.mp4")
|
110 |
|
111 |
with st.spinner("Burning captions into video..."):
|
112 |
+
burn_captions(video_path, ass_path, output_video_path)
|
113 |
|
114 |
st.success("Processing complete! Download your video below.")
|
115 |
with open(output_video_path, "rb") as file:
|