import gradio as gr
import yt_dlp
from dotenv import load_dotenv
import os
import google.generativeai as genai
import re
import torch
from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_read
import time
import spaces

load_dotenv()
default_gemini_api_key = os.getenv('gemini_api_key')

device = 0 if torch.cuda.is_available() else "cpu"
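# Note: the transformers pipeline accepts either an integer CUDA device index
# (0 = first GPU) or the string "cpu", so this single value covers both cases.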
def load_pipeline(model_name):
    return pipeline(
        task="automatic-speech-recognition",
        model=model_name,
        chunk_length_s=30,
        device=device,
    )
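# chunk_length_s=30 enables chunked long-form transcription, so audio longer than
# Whisper's 30-second window is split and stitched automatically. Illustrative
# usage (the file name here is hypothetical):
#   asr = load_pipeline("openai/whisper-tiny")
#   text = asr("sample.wav", return_timestamps=True)["text"]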
def configure_genai(api_key, model_variant):
    genai.configure(api_key=api_key)
    return genai.GenerativeModel(model_variant)
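# Note: genai.configure sets the API key globally for the google.generativeai
# client; the returned GenerativeModel is what generate_content is called on later.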
def extract_youtube_id(youtube_url):
    # Extract the YouTube video ID from various URL formats
    youtube_id_match = re.search(r'(?:v=|\/)([0-9A-Za-z_-]{11}).*', youtube_url)
    if youtube_id_match:
        return youtube_id_match.group(1)
    return None
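# The pattern captures the 11-character video ID after either "v=" or a slash,
# so both https://www.youtube.com/watch?v=VIDEO_ID and https://youtu.be/VIDEO_ID work.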
def download_youtube_audio(youtube_url, output_filename):
    ydl_opts = {
        'format': 'bestaudio/best',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
        'outtmpl': output_filename,
    }
    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([youtube_url])
        print(f"Downloaded audio from YouTube URL: {youtube_url}")
        return output_filename
    except Exception as e:
        print(f"Error downloading YouTube audio: {str(e)}")
        raise gr.Error(f"Failed to download YouTube audio: {str(e)}")
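# The FFmpegExtractAudio postprocessor converts the download and writes it as
# "<output_filename>.mp3", which is why the caller appends the ".mp3" extension
# to the returned name.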
def summarize_transcription(transcription, model, gemini_prompt):
    try:
        prompt = f"{gemini_prompt}:\n\n{transcription}"
        response = model.generate_content(prompt)
        return response.text
    except Exception as e:
        print(f"Error summarizing transcription: {str(e)}")
        return f"Error summarizing transcription: {str(e)}"
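# Summarization errors are returned as text rather than raised, so a failed
# summary still lets the transcription reach the UI.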
def process_audio(audio_file, language, whisper_model):
    print("Starting transcription...")
    start_time = time.time()

    # On GPU use the model selected in the UI; on CPU fall back to whisper-tiny
    # to keep transcription times reasonable.
    if device == 0:
        pipe = load_pipeline(whisper_model)
    else:
        pipe = load_pipeline("openai/whisper-tiny")

    # Decode the audio bytes with ffmpeg and resample to the model's sampling rate.
    with open(audio_file, "rb") as f:
        inputs = f.read()
    inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
    inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}

    if language:
        print(f"Using language: {language}")
        transcription = pipe(inputs, batch_size=8, generate_kwargs={"task": "transcribe", "language": language}, return_timestamps=True)["text"]
    else:
        print("No language defined, letting Whisper detect the language")
        transcription = pipe(inputs, batch_size=8, generate_kwargs={"task": "transcribe"}, return_timestamps=True)["text"]

    end_time = time.time()
    processing_time = round(end_time - start_time, 2)
    return transcription, processing_time
def transcribe(youtube_url, audio_file, whisper_model, gemini_api_key, gemini_prompt, gemini_model_variant, language, progress=gr.Progress()):
    try:
        progress(0, desc="Initializing")
        if not gemini_api_key:
            gemini_api_key = default_gemini_api_key
        model = configure_genai(gemini_api_key, gemini_model_variant)

        if youtube_url:
            progress(0.1, desc="Extracting YouTube ID")
            youtube_id = extract_youtube_id(youtube_url)
            if youtube_id:
                output_filename = f"{youtube_id}"
            else:
                output_filename = "unknown"
            progress(0.2, desc="Downloading YouTube audio")
            audio_file = download_youtube_audio(youtube_url, output_filename)
            audio_file = f"{audio_file}.mp3"
            print(f"Audio file downloaded: {audio_file}")
        else:
            progress(0.2, desc="Reading audio file")
            audio_file = f"{audio_file.name}"
            print(f"Audio file read: {audio_file}")

        progress(0.4, desc="Starting transcription")
        transcription, processing_time = process_audio(audio_file, language, whisper_model)

        progress(0.6, desc="Cleaning up")
        # Delete the downloaded audio file after transcription (for the YouTube
        # path, audio_file already includes the .mp3 extension)
        if youtube_url and os.path.exists(audio_file):
            os.remove(audio_file)
            print(f"Deleted audio file: {audio_file}")

        progress(0.7, desc="Summarizing transcription")
        # Summarize the transcription
        summary = summarize_transcription(transcription, model, gemini_prompt)

        progress(0.8, desc="Preparing output")
        # Prepare the transcription and summary messages
        transcription_message = f"{transcription}" if transcription else ""
        summary_message = f"{summary}" if summary else ""

        progress(0.9, desc="Saving output to file")
        print("Saving transcription and summary to file...")
        # Save transcription and summary to separate text files
        transcription_file = "transcription_output.txt"
        summary_file = "summary_output.txt"
        with open(transcription_file, "w", encoding="utf-8") as f:
            f.write(transcription_message)
        with open(summary_file, "w", encoding="utf-8") as f:
            f.write(summary_message)

        progress(1, desc="Complete")
        print("Transcription and summarization complete.")
        return transcription_message, summary_message, transcription_file, summary_file, processing_time
    except gr.Error as e:
        # Re-raise Gradio errors unchanged
        raise e
    except Exception as e:
        print(f"Error during transcription or summarization: {str(e)}")
        raise gr.Error(f"Transcription or summarization failed: {str(e)}")
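# The returned tuple maps positionally onto the five output components wired up in
# transcribe_button.click below: transcription text, summary text, the two download
# files, and the processing time.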
def toggle_input(choice):
    if choice == "YouTube URL":
        return gr.update(visible=True), gr.update(visible=False, value=None)
    else:
        return gr.update(visible=False, value=None), gr.update(visible=True)

def toggle_language(choice):
    if choice:
        return gr.update(visible=True, value="id")
    else:
        return gr.update(visible=False, value="")
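# Both toggles return gr.update(...) objects, which Gradio applies to the components
# listed in the corresponding `outputs` of the .change() handlers below.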
with gr.Blocks(theme='NoCrypt/miku') as demo:
    gr.Label('YouTube Summarizer WebUI created with ❤️ by Ryusui', show_label=False)
    with gr.Accordion("Input"):
        with gr.Column():
            input_type = gr.Radio(["YouTube URL", "Audio File"], label="Input Type", value="Audio File", info="Please use an audio file if you run into issues with the YouTube URL. YouTube is currently blocking Hugging Face IP addresses.", interactive=False)
            with gr.Row():
                youtube_url = gr.Textbox(label="YouTube URL", visible=False, info="Enter the full URL of the YouTube video you want to transcribe and summarize. Example: https://www.youtube.com/watch?v=VIDEO_ID")
                audio_file = gr.File(label="Upload Audio File", visible=True, file_types=['.wav', '.flac', '.mp3'])
            whisper_model = gr.Dropdown(["openai/whisper-tiny", "openai/whisper-base", "openai/whisper-small", "openai/whisper-medium", "openai/whisper-large-v3"], label="Whisper Model", value="openai/whisper-large-v3", info="whisper-tiny is the fastest model but has the lowest quality; whisper-large-v3 has the best quality but is the slowest.")
            gemini_model_variant = gr.Dropdown(["gemini-1.5-flash", "gemini-1.5-pro"], label="Gemini Model Variant", value="gemini-1.5-pro", info="gemini-1.5-flash is the fastest model but has lower quality; gemini-1.5-pro has the best quality but is slower.")
            define_language = gr.Checkbox(label="Define Language", value=False, info="Check this box if you want to set the transcription language explicitly")
            language = gr.Dropdown(["id", "en", "es", "fr", "de", "it", "pt", "ru", "ja", "ko", "zh"], label="Language", value=None, info="Select the language for transcription", visible=False)
            gemini_api_key = gr.Textbox(label="Gemini API Key (Optional)", placeholder="Enter your Gemini API key or leave blank to use the default", info="If you run into errors during transcription, try using your own API key")
            # Default prompt is Indonesian for "Create a summary of this transcript"
            gemini_prompt = gr.Textbox(label="Gemini Prompt", value="Buatkan resume dari transkrip ini")
            transcribe_button = gr.Button("Transcribe and Summarize")
    with gr.Accordion("Output"):
        with gr.Column():
            transcription_output = gr.Textbox(label="Transcription Output")
            summary_output = gr.Textbox(label="Summary Output")
            transcription_file = gr.File(label="Download Transcription")
            summary_file = gr.File(label="Download Summary")
            processing_time = gr.Textbox(label="Transcription Processing Time (seconds)")
    input_type.change(fn=toggle_input, inputs=input_type, outputs=[youtube_url, audio_file])
    define_language.change(fn=toggle_language, inputs=define_language, outputs=[language])
    transcribe_button.click(
        fn=transcribe,
        inputs=[
            youtube_url,
            audio_file,
            whisper_model,
            gemini_api_key,
            gemini_prompt,
            gemini_model_variant,
            language,
        ],
        outputs=[transcription_output, summary_output, transcription_file, summary_file, processing_time]
    )

print("Launching Gradio interface...")
demo.launch()
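# Optional: when running locally, demo.launch(share=True) creates a temporary public
# Gradio link; on Hugging Face Spaces the plain launch() above is sufficient.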