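# Dependencies (assumed PyPI package names for the imports below):
#   pip install faster-whisper moviepy gradio requests
# Note: `from moviepy import VideoFileClip` assumes MoviePy 2.x; older releases
# used `from moviepy.editor import VideoFileClip`.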
from faster_whisper import WhisperModel
import math
import gradio as gr
from moviepy import VideoFileClip
import requests


def extract_audio(input_video_name):
    # Name of the output audio file
    mp3_file = "audio.mp3"
    # Load the video clip
    video_clip = VideoFileClip(input_video_name)
    # Extract the audio track from the video clip
    audio_clip = video_clip.audio
    duration = audio_clip.duration
    print(f"Audio duration: {duration}")
    # Write the audio to a separate file
    audio_clip.write_audiofile(mp3_file)
    # Close the video and audio clips
    audio_clip.close()
    video_clip.close()
    print("Audio extraction successful!")
    return mp3_file, duration


def download_video(url):
    response = requests.get(url, stream=True)
    response.raise_for_status()
    video_file = "video.mp4"
    with open(video_file, 'wb') as file:
        for chunk in response.iter_content(chunk_size=8192):
            if chunk:
                file.write(chunk)
    print("Video downloaded successfully!")
    return video_file


def word_level_transcribe(audio, max_segment_duration=2.0):  # max_segment_duration is currently unused
    model = WhisperModel("tiny", device="cpu")
    segments, info = model.transcribe(
        audio,
        vad_filter=True,
        vad_parameters=dict(min_silence_duration_ms=1500),
        word_timestamps=True,
        log_progress=True,
    )
    segments = list(segments)  # The transcription actually runs here.
    wordlevel_info = []
    for segment in segments:
        for word in segment.words:
            print("[%.2fs -> %.2fs] %s" % (word.start, word.end, word.word))
            wordlevel_info.append({'word': word.word, 'start': word.start, 'end': word.end})
    return wordlevel_info


def create_subtitles(wordlevel_info):
    punctuation_marks = {'.', '!', '?', ',', ';', ':', '—', '-', '。', '!', '?'}  # Add/remove punctuation as needed
    subtitles = []
    line = []
    for word_data in wordlevel_info:
        line.append(word_data)
        current_word = word_data['word']
        # Check if the current word ends with punctuation or the line reached 5 words
        ends_with_punct = current_word and (current_word[-1] in punctuation_marks)
        if ends_with_punct or len(line) == 5:
            # Create a new subtitle segment
            subtitle = {
                "word": " ".join(item["word"] for item in line),
                "start": line[0]["start"],
                "end": line[-1]["end"],
                "textcontents": line.copy()
            }
            subtitles.append(subtitle)
            line = []
    # Add remaining words if any
    if line:
        subtitle = {
            "word": " ".join(item["word"] for item in line),
            "start": line[0]["start"],
            "end": line[-1]["end"],
            "textcontents": line.copy()
        }
        subtitles.append(subtitle)
    # Remove gaps between segments by extending the previous segment's end time
    # to the start of the current segment
    for i in range(1, len(subtitles)):
        prev_subtitle = subtitles[i - 1]
        current_subtitle = subtitles[i]
        prev_subtitle["end"] = current_subtitle["start"]
    return subtitles
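
# Illustrative shape of one entry produced by create_subtitles (values are hypothetical):
#   {"word": "This is a test.", "start": 0.0, "end": 1.4,
#    "textcontents": [{"word": "This", "start": 0.0, "end": 0.3}, ...]}
# A line is flushed as soon as a word ends with punctuation or it reaches 5 words.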


def format_time(seconds):
    # Convert seconds to the SRT timestamp format HH:MM:SS,mmm
    hours = math.floor(seconds / 3600)
    seconds %= 3600
    minutes = math.floor(seconds / 60)
    seconds %= 60
    milliseconds = round((seconds - math.floor(seconds)) * 1000)
    seconds = math.floor(seconds)
    formatted_time = f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"
    return formatted_time
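
# For example, format_time(3725.5) returns "01:02:05,500".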


def generate_subtitle_file(language, segments, input_video_name):
    subtitle_file = f"sub-{input_video_name}.{language}.srt"
    text = ""
    for index, segment in enumerate(segments):
        segment_start = format_time(segment['start'])
        segment_end = format_time(segment['end'])
        text += f"{index + 1}\n"
        text += f"{segment_start} --> {segment_end}\n"
        text += f"{segment['word']}\n"
        text += "\n"
    with open(subtitle_file, "w", encoding='utf8') as f:
        f.write(text)
    return subtitle_file
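
# Each cue in the resulting .srt file looks like this (timestamps are illustrative):
#   1
#   00:00:00,000 --> 00:00:01,800
#   Hello world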


def transcribe(url):
    # Full pipeline: download the video, extract its audio, transcribe it,
    # group the words into subtitle lines, and write an SRT file.
    video = download_video(url)
    mp3_file, duration = extract_audio(video)
    print("Transcribing...")
    wordlevel_info = word_level_transcribe(mp3_file)
    subtitles = create_subtitles(wordlevel_info)
    subtitle_file = generate_subtitle_file('fa', subtitles, 'video_subtitled')
    return subtitle_file, video, mp3_file


with gr.Blocks() as demo:
    gr.Markdown("Paste a video URL below and then click **Create** to see the progress and final output.")
    with gr.Column():
        # audio_in = gr.Audio(type="filepath")
        url = gr.Text(label="Video URL")
        srt_file = gr.File(label="Subtitle file (SRT)")
        btn = gr.Button("Create")
        video_file_output = gr.Video(label="Result Video")
        mp3_file = gr.Audio(type="filepath", label="Extracted Audio")
    btn.click(
        fn=transcribe,
        inputs=url,
        outputs=[srt_file, video_file_output, mp3_file],
    )

demo.launch(debug=True)
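
# --- Optional: run the pipeline without the Gradio UI (a minimal sketch) ---
# The URL below is a placeholder; point it at a direct link to an .mp4 file.
# Since demo.launch(debug=True) blocks until the server is stopped, you would
# run this instead of launching the UI, not after it.
# srt_path, video_path, audio_path = transcribe("https://example.com/sample.mp4")
# print(srt_path, video_path, audio_path)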