NeuralFalcon committed
Commit bb4b2b8 · verified · 1 Parent(s): ee47056

Update subtitle.py

Files changed (1): subtitle.py (+86, -27)
subtitle.py CHANGED
@@ -1,3 +1,4 @@
+# Code written by me, organized with the help of AI.
 """
 A comprehensive toolkit for generating and translating subtitles from media files.
 
@@ -207,13 +208,13 @@ def whisper_subtitle(uploaded_file, source_language):
     # 1. Configure device and model
     device = "cuda" if torch.cuda.is_available() else "cpu"
     compute_type = "float16" if torch.cuda.is_available() else "int8"
-    # model_dir = download_model(
-    #     "deepdml/faster-whisper-large-v3-turbo-ct2",
-    #     download_folder="./",
-    #     redownload=False
-    # )
-    # model = WhisperModel(model_dir, device=device, compute_type=compute_type)
-    model = WhisperModel("deepdml/faster-whisper-large-v3-turbo-ct2", device=device, compute_type=compute_type)
+    model_dir = download_model(
+        "deepdml/faster-whisper-large-v3-turbo-ct2",
+        download_folder="./",
+        redownload=False
+    )
+    model = WhisperModel(model_dir, device=device, compute_type=compute_type)
+    # model = WhisperModel("deepdml/faster-whisper-large-v3-turbo-ct2", device=device, compute_type=compute_type)
 
 
     # 2. Process audio file
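
This hunk swaps a direct hub-ID load for an explicit download_model step, so the CTranslate2 weights land in ./ once and later runs can reuse them. For context, here is a minimal stand-alone sketch of how faster-whisper's WhisperModel is typically driven with word timestamps enabled; the audio path is a hypothetical example, not taken from this commit:

    import torch
    from faster_whisper import WhisperModel

    # Stand-alone sketch; the real script resolves a local model directory
    # via its own download_model() helper first.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    compute_type = "float16" if device == "cuda" else "int8"
    model = WhisperModel("deepdml/faster-whisper-large-v3-turbo-ct2",
                         device=device, compute_type=compute_type)

    # "audio.wav" is a hypothetical input file.
    segments, info = model.transcribe("audio.wav", word_timestamps=True)
    print("Detected language:", info.language)
    for segment in segments:
        for word in segment.words:  # each word exposes .word, .start, .end
            print(f"{word.start:.2f}-{word.end:.2f} {word.word}")
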
@@ -251,11 +252,11 @@ def whisper_subtitle(uploaded_file, source_language):
     # 6. Generate all subtitle files
     generate_srt_from_sentences(sentence_timestamps, srt_path=clean_srt_path)
     word_level_srt(word_timestamps, srt_path=word_srt_path)
-    write_sentence_srt(
+    shorts_json = write_sentence_srt(
         word_timestamps, output_file=shorts_srt_path, max_lines=1,
-        max_duration_s=2.0, max_chars_per_line=10
+        max_duration_s=2.0, max_chars_per_line=17
     )
-    write_sentence_srt(
+    sentence_json = write_sentence_srt(
         word_timestamps, output_file=custom_srt_path, max_lines=2,
         max_duration_s=7.0, max_chars_per_line=38
     )
@@ -265,7 +266,7 @@ def whisper_subtitle(uploaded_file, source_language):
 
     return (
         clean_srt_path, custom_srt_path, word_srt_path, shorts_srt_path,
-        txt_path, transcript_text, detected_language
+        txt_path, transcript_text, sentence_json, shorts_json, detected_language
     )
 
 
@@ -342,12 +343,13 @@ def merge_punctuation_glitches(subtitles):
 
     return cleaned
 
+import json
 def write_sentence_srt(
     word_level_timestamps, output_file="subtitles_professional.srt", max_lines=2,
     max_duration_s=7.0, max_chars_per_line=38, hard_pause_threshold=0.5,
     merge_pause_threshold=0.4
 ):
-    """Creates professional-grade SRT files with smart line breaking and merging."""
+    """Creates professional-grade SRT files and a corresponding timestamp.json file."""
    if not word_level_timestamps:
        return
 
@@ -356,14 +358,20 @@ def write_sentence_srt(
     i = 0
     while i < len(word_level_timestamps):
         start_time = word_level_timestamps[i]["start"]
-        current_words = []
+
+        # We'll now store the full word objects, not just the text
+        current_word_objects = []
+
         j = i
         while j < len(word_level_timestamps):
             entry = word_level_timestamps[j]
-            potential_text = " ".join(current_words + [entry["word"]])
+
+            # Create potential text from the word objects
+            potential_words = [w["word"] for w in current_word_objects] + [entry["word"]]
+            potential_text = " ".join(potential_words)
 
             if len(split_line_by_char_limit(potential_text, max_chars_per_line)) > max_lines: break
-            if (entry["end"] - start_time) > max_duration_s and current_words: break
+            if (entry["end"] - start_time) > max_duration_s and current_word_objects: break
 
             if j > i:
                 prev_entry = word_level_timestamps[j-1]
@@ -371,16 +379,24 @@ def write_sentence_srt(
                 if pause >= hard_pause_threshold: break
                 if prev_entry["word"].endswith(('.','!','?')): break
 
-            current_words.append(entry["word"])
+            # Append the full word object
+            current_word_objects.append(entry)
             j += 1
 
-        if not current_words:
-            current_words.append(word_level_timestamps[i]["word"])
+        if not current_word_objects:
+            current_word_objects.append(word_level_timestamps[i])
             j = i + 1
 
-        text = " ".join(current_words)
+        text = " ".join([w["word"] for w in current_word_objects])
         end_time = word_level_timestamps[j - 1]["end"]
-        draft_subtitles.append({ "start": start_time, "end": end_time, "text": text })
+
+        # Include the list of word objects in our draft subtitle
+        draft_subtitles.append({
+            "start": start_time,
+            "end": end_time,
+            "text": text,
+            "words": current_word_objects
+        })
         i = j
 
     # Phase 2: Post-process to merge single-word "orphan" subtitles
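
The net effect of the two hunks above is a richer intermediate record: each draft subtitle now keeps the full word dicts it was built from, not just the joined text. A small illustrative sketch of the shape (the sample words and times are hypothetical, not from this commit):

    # Hypothetical word-level input, in the shape whisper_subtitle produces.
    word_level_timestamps = [
        {"word": "Hello", "start": 0.00, "end": 0.40},
        {"word": "world.", "start": 0.45, "end": 0.90},
    ]

    # What one draft subtitle from the grouping loop now looks like:
    draft_subtitle = {
        "start": 0.00,
        "end": 0.90,
        "text": "Hello world.",
        "words": word_level_timestamps,  # full objects kept for the JSON phase
    }
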
@@ -397,20 +413,61 @@ def write_sentence_srt(
         if len(split_line_by_char_limit(merged_text, max_chars_per_line)) <= max_lines:
             prev_sub["text"] = merged_text
             prev_sub["end"] = current_sub["end"]
+
+            # Merge the word-level data as well
+            prev_sub["words"].extend(current_sub["words"])
             continue
 
         final_subtitles.append(current_sub)
 
     final_subtitles = merge_punctuation_glitches(final_subtitles)
-
-    # Phase 3: Write the final SRT file
+    print(final_subtitles)
+    # ==============================================================================
+    # NEW CODE BLOCK: Generate JSON data and write files
+    # ==============================================================================
+
+    # This dictionary will hold the data for our JSON file
+    timestamps_data = {}
+
+    # Phase 3: Write the final SRT file (and prepare JSON data)
     with open(output_file, "w", encoding="utf-8") as f:
         for idx, sub in enumerate(final_subtitles, start=1):
+            # --- SRT Writing (Unchanged) ---
             text = sub["text"].replace(" ,", ",").replace(" .", ".")
             formatted_lines = split_line_by_char_limit(text, max_chars_per_line)
+            start_time_str = convert_time_to_srt_format(sub['start'])
+            end_time_str = convert_time_to_srt_format(sub['end'])
+
             f.write(f"{idx}\n")
-            f.write(f"{convert_time_to_srt_format(sub['start'])} --> {convert_time_to_srt_format(sub['end'])}\n")
+            f.write(f"{start_time_str} --> {end_time_str}\n")
             f.write("\n".join(formatted_lines) + "\n\n")
+
+            # --- JSON Data Population (New) ---
+            # Create the list of word dictionaries for the current subtitle
+            word_data = []
+            for word_obj in sub["words"]:
+                word_data.append({
+                    "word": word_obj["word"],
+                    "start": convert_time_to_srt_format(word_obj["start"]),
+                    "end": convert_time_to_srt_format(word_obj["end"])
+                })
+
+            # Add the complete entry to our main dictionary
+            timestamps_data[str(idx)] = {
+                "text": "\n".join(formatted_lines),
+                "start": start_time_str,
+                "end": end_time_str,
+                "words": word_data
+            }
+
+    # Write the collected data to the JSON file
+    json_output_file = output_file.replace(".srt", ".json")
+    with open(json_output_file, "w", encoding="utf-8") as f_json:
+        json.dump(timestamps_data, f_json, indent=4, ensure_ascii=False)
+
+    print(f"Successfully generated SRT file: {output_file}")
+    print(f"Successfully generated JSON file: {json_output_file}")
+    return json_output_file
 
 def write_subtitles_to_file(subtitles, filename="subtitles.srt"):
     """Writes a dictionary of subtitles to a standard SRT file."""
@@ -486,14 +543,15 @@ def subtitle_maker(media_file, source_lang, target_lang):
     Returns:
         A tuple containing paths to all generated files and the transcript text.
     """
+
     try:
         (
             default_srt, custom_srt, word_srt, shorts_srt,
-            txt_path, transcript, detected_lang
+            txt_path, transcript, sentence_json, word_json, detected_lang
         ) = whisper_subtitle(media_file, source_lang)
     except Exception as e:
         print(f"❌ An error occurred during transcription: {e}")
-        return (None, None, None, None, None, None, f"Error: {e}")
+        return (None, None, None, None, None, None, None, None, f"Error: {e}")
 
     translated_srt_path = None
     if detected_lang and detected_lang != target_lang:
@@ -508,7 +566,7 @@ def subtitle_maker(media_file, source_lang, target_lang):
 
     return (
         default_srt, translated_srt_path, custom_srt, word_srt,
-        shorts_srt, txt_path, transcript
+        shorts_srt, txt_path, sentence_json, word_json, transcript
     )
 
 
@@ -525,7 +583,7 @@ os.makedirs(TEMP_FOLDER, exist_ok=True)
 # source_lang = "English"
 # target_lang = "English"
 
-# default_srt, translated_srt, custom_srt, word_srt, shorts_srt, txt_path, transcript = subtitle_maker(
+# default_srt, translated_srt_path, custom_srt, word_srt, shorts_srt, txt_path, sentence_json, word_json, transcript = subtitle_maker(
 #     media_file, source_lang, target_lang
 # )
 # If source_lang and target_lang are the same, translation will be skipped.
@@ -538,6 +596,7 @@ os.makedirs(TEMP_FOLDER, exist_ok=True)
 # word_srt -> Word-level timestamps (useful for creating YouTube Shorts/Reels)
 # shorts_srt -> Optimized subtitles for vertical videos (displays 3–4 words at a time, maximum 17 characters per segment)
 # txt_path -> Full transcript as plain text (useful for video summarization or for asking questions about the video or audio data with other LLM tools)
+# sentence_json, word_json -> Timestamp JSON files, kept to generate .ass files later
 # transcript -> Transcript text directly returned by the function, if you just need the transcript
 
 # All functionality is contained in a single file, making it portable
 
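Putting the pieces together, a caller now unpacks nine values instead of seven. A hedged usage sketch mirroring the commented example above; the media path is hypothetical, and note that word_json is the JSON emitted alongside the shorts-style cues:

    # Hypothetical input file.
    media_file = "video.mp4"

    (default_srt, translated_srt_path, custom_srt, word_srt,
     shorts_srt, txt_path, sentence_json, word_json, transcript) = subtitle_maker(
        media_file, "English", "English"
    )

    print(transcript)       # plain transcript text
    print(sentence_json)    # JSON with sentence-level cues + word timings
    print(word_json)        # JSON emitted alongside the shorts-style SRT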