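"""Gradio demo: transcribe speech with OpenAI Whisper Tiny, then feed the transcription
to LLaMA-Omni2-0.5B for text generation, falling back to GPT-2 if LLaMA-Omni2 cannot be loaded."""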
import gradio as gr
import torch
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
import os
import warnings
import importlib
import sys
import subprocess
# Check if we can import LLaMA-Omni2's modules
try_native_modules = True
native_llama_omni_available = False
native_modules_error = None

if try_native_modules:
    try:
        # Try importing LLaMA-Omni2-specific modules in a subprocess to avoid crashing if the import fails
        print("Checking for LLaMA-Omni2 native modules...")
        module_check_result = subprocess.run(
            [sys.executable, "-c", "import llama_omni2; print('LLaMA-Omni2 modules found!')"],
            capture_output=True,
            text=True
        )
        if "LLaMA-Omni2 modules found!" in module_check_result.stdout:
            print("LLaMA-Omni2 native modules are available!")
            native_llama_omni_available = True
        else:
            print(f"LLaMA-Omni2 native modules not found: {module_check_result.stderr}")
            native_modules_error = module_check_result.stderr
    except Exception as e:
        print(f"Error checking for LLaMA-Omni2 native modules: {e}")
        native_modules_error = str(e)
# --- Model Configuration ---
whisper_model_id = "openai/whisper-tiny"
llama_omni_model_id = "ICTNLP/LLaMA-Omni2-0.5B"  # Primary model we'll try to load
fallback_model_id = "gpt2"  # Fallback if LLaMA-Omni2 fails to load

# --- Device Configuration ---
if torch.cuda.is_available():
    device_for_pipelines = 0       # Use the first GPU for Hugging Face pipelines
    torch_device = "cuda:0"        # PyTorch device string
    dtype_for_pipelines = torch.float16
else:
    device_for_pipelines = -1      # Use CPU for Hugging Face pipelines
    torch_device = "cpu"
    dtype_for_pipelines = torch.float32

print(f"Using device: {torch_device} for model loading.")
print(f"Pipelines will use device_id: {device_for_pipelines} and dtype: {dtype_for_pipelines}")
# --- Load Speech-to-Text (ASR) Pipeline ---
asr_pipeline_instance = None
try:
    print(f"Loading ASR model: {whisper_model_id}...")
    asr_pipeline_instance = pipeline(
        "automatic-speech-recognition",
        model=whisper_model_id,
        torch_dtype=dtype_for_pipelines,
        device=device_for_pipelines
    )
    print(f"ASR model ({whisper_model_id}) loaded successfully.")
except Exception as e:
    print(f"Error loading ASR model ({whisper_model_id}): {e}")
    asr_pipeline_instance = None
# --- Load Text Generation Model ---
text_gen_pipeline_instance = None
text_generation_model_id = None   # Will be set to the model that successfully loads
llama_omni_native_module = None   # Will hold the native LLaMA-Omni2 module if loaded

# Try the native LLaMA-Omni2 module first if it is available
if native_llama_omni_available:
    try:
        print("Attempting to load LLaMA-Omni2 using native modules...")
        # Import the required modules
        import llama_omni2
        from llama_omni2.model import Model as LLamaOmniModel

        # Load the model
        llama_omni_native_module = LLamaOmniModel.from_pretrained(llama_omni_model_id)
        text_generation_model_id = llama_omni_model_id
        print(f"LLaMA-Omni2 native module loaded successfully: {type(llama_omni_native_module)}")
    except Exception as e:
        print(f"Error loading native LLaMA-Omni2 module: {e}")
        llama_omni_native_module = None
# If the native module failed, try loading via transformers
if llama_omni_native_module is None and text_generation_model_id is None:
    try:
        print(f"Attempting to load LLaMA-Omni2 using transformers: {llama_omni_model_id}...")
        # LLaMA models often require specific loading configurations
        tokenizer = AutoTokenizer.from_pretrained(llama_omni_model_id, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(
            llama_omni_model_id,
            torch_dtype=dtype_for_pipelines,
            trust_remote_code=True,
            device_map="auto" if torch.cuda.is_available() else None
        )
        text_gen_pipeline_instance = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            torch_dtype=dtype_for_pipelines,
            # When device_map="auto" already placed the model, the pipeline must not also be given a device
            device=device_for_pipelines if not torch.cuda.is_available() else None
        )
        text_generation_model_id = llama_omni_model_id
        print(f"LLaMA-Omni2 model ({llama_omni_model_id}) loaded successfully via transformers.")
    except Exception as e:
        warnings.warn(f"Error loading LLaMA-Omni2 model: {e}\nFalling back to {fallback_model_id}")
        print(f"Error loading LLaMA-Omni2 model via transformers: {e}")
        print(f"Falling back to {fallback_model_id}")
# Fall back to GPT-2 if LLaMA-Omni2 fails to load both ways
if text_generation_model_id is None:
    try:
        print(f"Loading fallback text generation model: {fallback_model_id}...")
        text_gen_pipeline_instance = pipeline(
            "text-generation",
            model=fallback_model_id,
            torch_dtype=dtype_for_pipelines,
            device=device_for_pipelines
        )
        text_generation_model_id = fallback_model_id
        print(f"Fallback model ({fallback_model_id}) loaded successfully.")
    except Exception as e:
        print(f"Error loading fallback model ({fallback_model_id}): {e}")
        text_gen_pipeline_instance = None
# --- Core Functions ---
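# The helpers below return plain strings for the Gradio textboxes. transcribe_audio_input
# returns a (display_text, prompt_text) pair; on failure the first element carries the
# error message and the second is left empty so no generation is attempted on it.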
def transcribe_audio_input(audio_filepath):
    if not asr_pipeline_instance:
        return "ASR model not available. Please check startup logs.", ""
    if audio_filepath is None:
        return "No audio file provided for transcription.", ""
    try:
        print(f"Transcribing: {audio_filepath}")
        result = asr_pipeline_instance(audio_filepath, chunk_length_s=30)
        transcribed_text = result["text"]
        print(f"Transcription: '{transcribed_text}'")
        return transcribed_text, transcribed_text
    except Exception as e:
        print(f"Transcription error: {e}")
        return f"Error during transcription: {str(e)}", ""
def generate_text_response(prompt_text):
    # If we have a native LLaMA-Omni2 module, use it
    if llama_omni_native_module is not None:
        if not prompt_text or not prompt_text.strip():
            return "Prompt is empty. Please provide text for generation."
        try:
            print(f"Generating response with native LLaMA-Omni2 for prompt: '{prompt_text[:100]}...'")
            # Using the native module's interface for text generation
            response = llama_omni_native_module.generate(prompt_text, max_length=150)
            print(f"Generated response: '{response}'")
            return response
        except Exception as e:
            print(f"Error using native LLaMA-Omni2 generation: {e}")
            return f"Error during native LLaMA-Omni2 text generation: {str(e)}"

    # Otherwise use the transformers pipeline
    if not text_gen_pipeline_instance:
        return "Text generation model not available. Check logs."
    if not prompt_text or not prompt_text.strip():
        return "Prompt is empty. Please provide text for generation."
    try:
        print(f"Generating response for prompt (first 100 chars): '{prompt_text[:100]}...'")
        # Different generation parameters based on model
        if text_generation_model_id == llama_omni_model_id:
            # Parameters optimized for LLaMA-Omni2
            generated_outputs = text_gen_pipeline_instance(
                prompt_text,
                max_new_tokens=150,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
                num_return_sequences=1
            )
        else:
            # Parameters for the fallback model
            generated_outputs = text_gen_pipeline_instance(
                prompt_text,
                max_new_tokens=100,
                num_return_sequences=1
            )
        response_text = generated_outputs[0]["generated_text"]
        print(f"Generated response: '{response_text}'")
        return response_text
    except Exception as e:
        print(f"Text generation error: {e}")
        return f"Error during text generation: {str(e)}"
def combined_pipeline_process(audio_filepath):
    if audio_filepath is None:
        return "No audio input.", "No audio input."
    transcribed_text, _ = transcribe_audio_input(audio_filepath)
    if not asr_pipeline_instance or "Error during transcription" in transcribed_text or not transcribed_text.strip():
        error_msg_for_generation = "Cannot generate response: Transcription failed or was empty."
        if not asr_pipeline_instance:
            error_msg_for_generation = "Cannot generate response: ASR model not loaded."
        return transcribed_text, error_msg_for_generation
    if not text_gen_pipeline_instance and llama_omni_native_module is None:
        return transcribed_text, "Cannot generate response: No text generation model available."
    final_response = generate_text_response(transcribed_text)
    return transcribed_text, final_response
# Determine model status for the UI
if llama_omni_native_module is not None:
    llama_model_status = "Native LLaMA-Omni2 module loaded successfully"
    using_model = "LLaMA-Omni2-0.5B (native modules)"
elif text_generation_model_id == llama_omni_model_id:
    llama_model_status = "LLaMA-Omni2 loaded via transformers"
    using_model = "LLaMA-Omni2-0.5B (via transformers)"
elif text_generation_model_id == fallback_model_id:
    llama_model_status = "Failed to load - Using GPT-2 as fallback"
    using_model = "GPT-2 (fallback model)"
else:
    llama_model_status = "Failed to load any text generation model"
    using_model = "No model available"
# --- Gradio Interface Definition ---
with gr.Blocks(theme=gr.themes.Soft(), title="Whisper + LLaMA-Omni2 Demo") as app_interface:
    gr.Markdown(
        f"""
        # Speech-to-Text and Text Generation Demo
        This application uses **OpenAI Whisper Tiny** for speech recognition and attempts to use **LLaMA-Omni2-0.5B** for text generation.
        If LLaMA-Omni2 cannot be loaded, it falls back to GPT-2.

        **Currently using:** {using_model}

        Upload an audio file to transcribe it. The transcribed text will then be used as a prompt for the text generation model.
        """
    )

    with gr.Tab("Full Pipeline: Audio -> Transcription -> Generation"):
        gr.Markdown("### Step 1: Upload Audio -> Step 2: Transcribe -> Step 3: Generate Text")
        input_audio_pipeline = gr.Audio(type="filepath", label="Upload Your Audio File (.wav, .mp3)")
        submit_button_full = gr.Button("Run Full Process", variant="primary")
        output_transcription_pipeline = gr.Textbox(label="Transcribed Text (from Whisper)", lines=5)
        model_label = f"Generated Text (from {using_model})"
        output_generation_pipeline = gr.Textbox(label=model_label, lines=7)
        submit_button_full.click(
            fn=combined_pipeline_process,
            inputs=[input_audio_pipeline],
            outputs=[output_transcription_pipeline, output_generation_pipeline]
        )
    with gr.Tab("Test Speech-to-Text (Whisper Tiny)"):
        gr.Markdown("### Transcribe audio to text using Whisper Tiny.")
        input_audio_asr = gr.Audio(type="filepath", label="Upload Audio for ASR")
        submit_button_asr = gr.Button("Transcribe Audio", variant="secondary")
        output_transcription_asr = gr.Textbox(label="Transcription Result", lines=10)

        def asr_only_ui(audio_file):
            if audio_file is None:
                return "Please upload an audio file."
            transcription, _ = transcribe_audio_input(audio_file)
            return transcription

        submit_button_asr.click(
            fn=asr_only_ui,
            inputs=[input_audio_asr],
            outputs=[output_transcription_asr]
        )
    with gr.Tab("Test Text Generation"):
        gr.Markdown(f"### Generate text from a prompt using {using_model}.")
        input_text_prompt_gen = gr.Textbox(label="Your Text Prompt", placeholder="Enter text here...", lines=5)
        submit_button_gen = gr.Button("Generate Text", variant="secondary")
        output_generation_gen = gr.Textbox(label="Generated Text Result", lines=10)
        submit_button_gen.click(
            fn=generate_text_response,
            inputs=[input_text_prompt_gen],
            outputs=[output_generation_gen]
        )
    gr.Markdown("---")
    gr.Markdown("### Model Loading Status (at application start):")
    asr_load_status = "Successfully Loaded" if asr_pipeline_instance else "Failed to Load (check console logs)"
    gr.Markdown(f"* **Whisper Model ({whisper_model_id}):** `{asr_load_status}`")
    gr.Markdown(f"* **LLaMA-Omni2 Model ({llama_omni_model_id}):** `{llama_model_status}`")
    if native_llama_omni_available:
        gr.Markdown("* **LLaMA-Omni2 Native Modules:** `Available`")
    else:
        native_error = f": {native_modules_error}" if native_modules_error else ""
        gr.Markdown(f"* **LLaMA-Omni2 Native Modules:** `Not Available{native_error}`")

    if using_model.startswith("GPT-2"):
        gr.Markdown(
            """
            **Note about LLaMA-Omni2-0.5B:** This model has complex dependencies and requires a specific environment.
            The system attempted to load it but fell back to GPT-2. For full functionality with LLaMA-Omni2, you should:
            1. Clone the [LLaMA-Omni2 repository](https://github.com/ictnlp/LLaMA-Omni2)
            2. Install the required dependencies, including CosyVoice 2
            3. Download the Whisper-large-v3 model and the flow-matching model and vocoder of CosyVoice 2
            4. Set up the controller, model worker, and web server as described in the repository

            Note that LLaMA-Omni2 is designed to generate both text and speech responses simultaneously.
            For the full experience with speech synthesis, you need the complete setup.
            """
        )
# --- Launch the Gradio App ---
if __name__ == "__main__":
    print("Launching Gradio demo...")
    try:
        app_interface.launch(share=True)
    except Exception as e:
        print(f"Error launching with share=True: {e}")
        print("Trying to launch without sharing...")
        app_interface.launch()
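# Minimal sketch of how to run this demo locally (assumes the file is saved as app.py,
# the Hugging Face Spaces convention; exact dependency versions are not pinned here,
# and ffmpeg must be installed for the ASR pipeline to decode audio files):
#   pip install gradio torch transformers
#   python app.py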