|
|
|
import json |
|
from pathlib import Path |
|
|
|
def load_transcript_json(transcript_file: str):
    """
    Read a Whisper JSON transcript from disk and return its decoded content

    The file is opened as UTF-8 text and handed to the standard JSON
    decoder; whatever object the document encodes is returned unchanged.

    # Parameters:
    transcript_file (str): Path to the Whisper JSON file

    # Returns:
    The deserialized JSON document
    """
    with open(transcript_file, "r", encoding="utf-8") as handle:
        return json.load(handle)
|
|
|
|
|
def load_transcript_srt(subtitle_file: str) -> dict:
    """
    Parse a SRT file into a Whisper-style JSON object

    Each subtitle becomes one segment holding its text and its start/end
    times in seconds. No word-level entries are produced, so every
    segment's "words" list is empty. The top-level "text" is the subtitle
    contents concatenated in file order with no separator added.

    # Parameters:
    subtitle_file (str): Path to the SRT file

    # Returns:
    dict: {"text": str, "segments": [{"text", "start", "end", "words"}, ...]}

    # Raises:
    ImportError: if the third-party `srt` package is not installed
    """
    # Local import: `srt` is only needed by this loader.
    import srt

    # "utf-8-sig" decodes plain UTF-8 unchanged but drops a leading BOM,
    # which would otherwise sit in front of the first subtitle index.
    with open(subtitle_file, "r", encoding="utf-8-sig") as f:
        srt_text = f.read()

    # srt.parse() is a lazy generator; parsing the already-read string and
    # materializing the result keeps it independent of the file handle's
    # lifetime and lets the subtitles be traversed twice below.
    subs = list(srt.parse(srt_text))

    segments = [
        {
            "text": sub.content,
            "start": sub.start.total_seconds(),
            "end": sub.end.total_seconds(),
            "words": [],
        }
        for sub in subs
    ]

    return {
        # Joined without a separator to keep the original output format.
        "text": "".join(sub.content for sub in subs),
        "segments": segments,
    }
|
|
|
def load_transcript(file: str):
    """
    Load a transcript, choosing the loader from the file's extension

    The extension is compared case-insensitively: ".json" goes to
    load_transcript_json and ".srt" goes to load_transcript_srt.

    # Parameters:
    file (str): Path to the transcript file

    # Returns:
    Whatever the selected loader returns

    # Raises:
    ValueError: if the extension is neither ".json" nor ".srt"
    """
    suffix = Path(file).suffix.lower()

    if suffix == ".json":
        return load_transcript_json(file)

    if suffix == ".srt":
        return load_transcript_srt(file)

    raise ValueError(f"Unsupported file type: {suffix}")