Update subtitle.py
subtitle.py (+86 −27)
@@ -1,3 +1,4 @@
+# Code written by me, organized with the help of AI.
 """
 A comprehensive toolkit for generating and translating subtitles from media files.
 
@@ -207,13 +208,13 @@ def whisper_subtitle(uploaded_file, source_language):
     # 1. Configure device and model
     device = "cuda" if torch.cuda.is_available() else "cpu"
     compute_type = "float16" if torch.cuda.is_available() else "int8"
-
-
-
-
-
-
-    model = WhisperModel("deepdml/faster-whisper-large-v3-turbo-ct2", device=device, compute_type=compute_type)
+    model_dir = download_model(
+        "deepdml/faster-whisper-large-v3-turbo-ct2",
+        download_folder="./",
+        redownload=False
+    )
+    model = WhisperModel(model_dir, device=device, compute_type=compute_type)
+    # model = WhisperModel("deepdml/faster-whisper-large-v3-turbo-ct2", device=device, compute_type=compute_type)
 
 
     # 2. Process audio file
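`download_model` here is the script's own caching helper (it returns a local directory that `WhisperModel` can load); faster-whisper can also fetch the same Hugging Face repo by name, which is what the commented-out line did. For readers unfamiliar with the API, a minimal transcription sketch, assuming only the `faster-whisper` package and a placeholder `audio.wav`:

```python
# Minimal faster-whisper sketch; "audio.wav" is a placeholder path.
import torch
from faster_whisper import WhisperModel

device = "cuda" if torch.cuda.is_available() else "cpu"
compute_type = "float16" if device == "cuda" else "int8"

model = WhisperModel("deepdml/faster-whisper-large-v3-turbo-ct2",
                     device=device, compute_type=compute_type)

# word_timestamps=True is what makes the word-level SRT and JSON outputs possible
segments, info = model.transcribe("audio.wav", word_timestamps=True)
print("Detected language:", info.language)
for segment in segments:
    for word in segment.words:
        print(f"[{word.start:.2f}s -> {word.end:.2f}s] {word.word}")
```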
@@ -251,11 +252,11 @@ def whisper_subtitle(uploaded_file, source_language):
     # 6. Generate all subtitle files
     generate_srt_from_sentences(sentence_timestamps, srt_path=clean_srt_path)
     word_level_srt(word_timestamps, srt_path=word_srt_path)
-    write_sentence_srt(
+    shorts_json = write_sentence_srt(
         word_timestamps, output_file=shorts_srt_path, max_lines=1,
-        max_duration_s=2.0, max_chars_per_line=17
+        max_duration_s=2.0, max_chars_per_line=17
     )
-    write_sentence_srt(
+    sentence_json = write_sentence_srt(
         word_timestamps, output_file=custom_srt_path, max_lines=2,
         max_duration_s=7.0, max_chars_per_line=38
     )
@@ -265,7 +266,7 @@ def whisper_subtitle(uploaded_file, source_language):
 
     return (
         clean_srt_path, custom_srt_path, word_srt_path, shorts_srt_path,
-        txt_path, transcript_text, detected_language
+        txt_path, transcript_text, sentence_json, shorts_json, detected_language
     )
 
 
@@ -342,12 +343,13 @@ def merge_punctuation_glitches(subtitles):
 
     return cleaned
 
+import json
 def write_sentence_srt(
     word_level_timestamps, output_file="subtitles_professional.srt", max_lines=2,
     max_duration_s=7.0, max_chars_per_line=38, hard_pause_threshold=0.5,
     merge_pause_threshold=0.4
 ):
-    """Creates professional-grade SRT files
+    """Creates professional-grade SRT files and a corresponding timestamp.json file."""
     if not word_level_timestamps:
         return
 
@@ -356,14 +358,20 @@
     i = 0
     while i < len(word_level_timestamps):
         start_time = word_level_timestamps[i]["start"]
-
+
+        # We'll now store the full word objects, not just the text
+        current_word_objects = []
+
         j = i
         while j < len(word_level_timestamps):
             entry = word_level_timestamps[j]
-
+
+            # Create potential text from the word objects
+            potential_words = [w["word"] for w in current_word_objects] + [entry["word"]]
+            potential_text = " ".join(potential_words)
 
             if len(split_line_by_char_limit(potential_text, max_chars_per_line)) > max_lines: break
-            if (entry["end"] - start_time) > max_duration_s and
+            if (entry["end"] - start_time) > max_duration_s and current_word_objects: break
 
             if j > i:
                 prev_entry = word_level_timestamps[j-1]
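The loop above leans on `split_line_by_char_limit`, which is defined elsewhere in subtitle.py and doesn't appear in this diff. As a reference point, a hypothetical greedy word-wrap with the same contract (text in, list of lines of at most `max_chars_per_line` characters out):

```python
# Hypothetical stand-in for subtitle.py's split_line_by_char_limit helper;
# the real implementation may differ. Greedy word-wrap:
def split_line_by_char_limit(text: str, max_chars_per_line: int) -> list[str]:
    lines, current = [], ""
    for word in text.split():
        candidate = f"{current} {word}".strip()
        if len(candidate) <= max_chars_per_line:
            current = candidate
        else:
            if current:
                lines.append(current)
            current = word
    if current:
        lines.append(current)
    return lines

print(split_line_by_char_limit("the quick brown fox jumps", 10))
# -> ['the quick', 'brown fox', 'jumps']
```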
@@ -371,16 +379,24 @@
             if pause >= hard_pause_threshold: break
             if prev_entry["word"].endswith(('.','!','?')): break
 
-
+            # Append the full word object
+            current_word_objects.append(entry)
             j += 1
 
-        if not
-
+        if not current_word_objects:
+            current_word_objects.append(word_level_timestamps[i])
             j = i + 1
 
-        text = " ".join(
+        text = " ".join([w["word"] for w in current_word_objects])
         end_time = word_level_timestamps[j - 1]["end"]
-
+
+        # Include the list of word objects in our draft subtitle
+        draft_subtitles.append({
+            "start": start_time,
+            "end": end_time,
+            "text": text,
+            "words": current_word_objects
+        })
         i = j
 
     # Phase 2: Post-process to merge single-word "orphan" subtitles
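To see what the Phase 1 loop produces, a toy walk-through with invented timings (not data from the repo):

```python
# With the default thresholds (hard_pause_threshold=0.5, max_duration_s=7.0),
# the 1.2 s gap after "world." (and its sentence-final period) forces a split.
words = [
    {"word": "Hello",  "start": 0.00, "end": 0.40},
    {"word": "world.", "start": 0.45, "end": 0.90},
    {"word": "Again",  "start": 2.10, "end": 2.50},
]
# Expected draft subtitles:
#   {"start": 0.0, "end": 0.9, "text": "Hello world.", "words": [...2 entries...]}
#   {"start": 2.1, "end": 2.5, "text": "Again",        "words": [...1 entry...]}
```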
@@ -397,20 +413,61 @@
             if len(split_line_by_char_limit(merged_text, max_chars_per_line)) <= max_lines:
                 prev_sub["text"] = merged_text
                 prev_sub["end"] = current_sub["end"]
+
+                # Merge the word-level data as well
+                prev_sub["words"].extend(current_sub["words"])
                 continue
 
         final_subtitles.append(current_sub)
 
     final_subtitles = merge_punctuation_glitches(final_subtitles)
-
-    #
+    print(final_subtitles)
+    # ==============================================================================
+    # NEW CODE BLOCK: Generate JSON data and write files
+    # ==============================================================================
+
+    # This dictionary will hold the data for our JSON file
+    timestamps_data = {}
+
+    # Phase 3: Write the final SRT file (and prepare JSON data)
     with open(output_file, "w", encoding="utf-8") as f:
         for idx, sub in enumerate(final_subtitles, start=1):
+            # --- SRT Writing (Unchanged) ---
             text = sub["text"].replace(" ,", ",").replace(" .", ".")
             formatted_lines = split_line_by_char_limit(text, max_chars_per_line)
+            start_time_str = convert_time_to_srt_format(sub['start'])
+            end_time_str = convert_time_to_srt_format(sub['end'])
+
             f.write(f"{idx}\n")
-            f.write(f"{
+            f.write(f"{start_time_str} --> {end_time_str}\n")
             f.write("\n".join(formatted_lines) + "\n\n")
+
+            # --- JSON Data Population (New) ---
+            # Create the list of word dictionaries for the current subtitle
+            word_data = []
+            for word_obj in sub["words"]:
+                word_data.append({
+                    "word": word_obj["word"],
+                    "start": convert_time_to_srt_format(word_obj["start"]),
+                    "end": convert_time_to_srt_format(word_obj["end"])
+                })
+
+            # Add the complete entry to our main dictionary
+            timestamps_data[str(idx)] = {
+                "text": "\n".join(formatted_lines),
+                "start": start_time_str,
+                "end": end_time_str,
+                "words": word_data
+            }
+
+    # Write the collected data to the JSON file
+    json_output_file = output_file.replace(".srt", ".json")
+    with open(json_output_file, "w", encoding="utf-8") as f_json:
+        json.dump(timestamps_data, f_json, indent=4, ensure_ascii=False)
+
+    print(f"Successfully generated SRT file: {output_file}")
+    print(f"Successfully generated JSON file: {json_output_file}")
+    return json_output_file
 
 def write_subtitles_to_file(subtitles, filename="subtitles.srt"):
     """Writes a dictionary of subtitles to a standard SRT file."""
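`convert_time_to_srt_format` is another helper defined elsewhere in the file; both the SRT cues and the new JSON values use the SRT `HH:MM:SS,mmm` form. A minimal sketch of the usual conversion, in case the helper's behavior isn't obvious:

```python
# Sketch of a seconds -> "HH:MM:SS,mmm" converter; the helper in subtitle.py
# may differ in detail, but valid SRT requires this shape.
def convert_time_to_srt_format(seconds: float) -> str:
    ms = int(round(seconds * 1000))
    h, ms = divmod(ms, 3_600_000)
    m, ms = divmod(ms, 60_000)
    s, ms = divmod(ms, 1_000)
    return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"

print(convert_time_to_srt_format(3.5))    # 00:00:03,500
print(convert_time_to_srt_format(75.25))  # 00:01:15,250
```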
@@ -486,14 +543,15 @@ def subtitle_maker(media_file, source_lang, target_lang):
     Returns:
         A tuple containing paths to all generated files and the transcript text.
     """
+
     try:
         (
             default_srt, custom_srt, word_srt, shorts_srt,
-            txt_path, transcript, detected_lang
+            txt_path, transcript, sentence_json, word_json, detected_lang
         ) = whisper_subtitle(media_file, source_lang)
     except Exception as e:
         print(f"❌ An error occurred during transcription: {e}")
-        return (None, None, None, None, None, None, f"Error: {e}")
+        return (None, None, None, None, None, None, None, None, f"Error: {e}")
 
     translated_srt_path = None
     if detected_lang and detected_lang != target_lang:
@@ -508,7 +566,7 @@
 
     return (
         default_srt, translated_srt_path, custom_srt, word_srt,
-        shorts_srt, txt_path, transcript
+        shorts_srt, txt_path, sentence_json, word_json, transcript
     )
 
 
@@ -525,7 +583,7 @@ os.makedirs(TEMP_FOLDER, exist_ok=True)
 # source_lang = "English"
 # target_lang = "English"
 
-#
+# default_srt, translated_srt_path, custom_srt, word_srt, shorts_srt, txt_path, sentence_json, word_json, transcript = subtitle_maker(
 #     media_file, source_lang, target_lang
 # )
 # If source_lang and target_lang are the same, translation will be skipped.
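An uncommented version of that call, with illustrative paths and languages (note that on failure every path slot is None and the final tuple element carries the error string):

```python
# Illustrative usage; "video.mp4" is a placeholder media file.
media_file = "video.mp4"
source_lang = "English"
target_lang = "English"

(default_srt, translated_srt_path, custom_srt, word_srt,
 shorts_srt, txt_path, sentence_json, word_json, transcript) = subtitle_maker(
    media_file, source_lang, target_lang
)

if default_srt is None:
    # On error, the last slot holds "Error: ..."
    print(transcript)
else:
    print("Main SRT:", default_srt)
    print("Timestamp JSON:", sentence_json)
```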
@@ -538,6 +596,7 @@ os.makedirs(TEMP_FOLDER, exist_ok=True)
 # word_srt -> Word-level timestamps (useful for creating YouTube Shorts/Reels)
 # shorts_srt -> Optimized subtitles for vertical videos (displays 3–4 words at a time, maximum 17 characters per segment)
 # txt_path -> Full transcript as plain text (useful for video summarization or for asking questions about the video or audio data with other LLM tools)
+# sentence_json, word_json -> Timestamp JSON files, used later to generate .ass files
 # transcript -> Transcript text directly returned by the function, if you just need the transcript
 
 # All functionality is contained in a single file, making it portable
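Since the JSON files exist to drive later .ass generation, a short sketch of reading one back (the path is illustrative; the structure matches what `write_sentence_srt` writes above):

```python
# Read the timestamp JSON emitted by write_sentence_srt; keys are cue numbers.
import json

with open("subtitles_professional.json", encoding="utf-8") as f:
    cues = json.load(f)

for idx, cue in cues.items():
    print(f"{idx}: {cue['start']} --> {cue['end']}")
    print(cue["text"])
    for w in cue["words"]:
        print(f"   {w['word']}  {w['start']} -> {w['end']}")
```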