import gradio as gr import torch import os import sys import subprocess import spaces from pathlib import Path # Clone and setup the repository @spaces.GPU def setup_environment(): if not os.path.exists('LLaMA-Omni'): subprocess.run(['git', 'clone', 'https://github.com/ictnlp/LLaMA-Omni']) # Add to path sys.path.append(os.path.join(os.path.dirname(__file__), 'LLaMA-Omni')) # Download models os.makedirs('models/speech_encoder', exist_ok=True) os.makedirs('vocoder', exist_ok=True) # Download Whisper if not os.path.exists('models/speech_encoder/large-v3.pt'): import whisper whisper.load_model("large-v3", download_root="models/speech_encoder/") # Download vocoder if not os.path.exists('vocoder/g_00500000'): subprocess.run([ 'wget', '-q', 'https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/vocoder/code_hifigan/mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj/g_00500000', '-P', 'vocoder/' ]) subprocess.run([ 'wget', '-q', 'https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/vocoder/code_hifigan/mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj/config.json', '-P', 'vocoder/' ]) # Global variables for model model = None speech_generator = None @spaces.GPU def load_models(): global model, speech_generator if model is None: setup_environment() from omni_speech.model import OmniSpeechModel from omni_speech.speech_generator import SpeechGeneratorCausalFull # Load model model_path = "ICTNLP/Llama-3.1-8B-Omni" model = OmniSpeechModel.from_pretrained(model_path, torch_dtype=torch.float16) model = model.cuda() # Initialize speech generator speech_generator = SpeechGeneratorCausalFull( model=model, vocoder='vocoder/g_00500000', vocoder_cfg='vocoder/config.json' ) @spaces.GPU(duration=60) def process_audio(audio_path, text_input=None): """Process audio input and generate text and speech response.""" # Load models if needed load_models() from omni_speech.conversation import conv_templates from omni_speech.utils import build_transform_audios # Load and preprocess audio transform = build_transform_audios() audio_tensor = transform(audio_path) # Prepare conversation conv = conv_templates["llama_3"].copy() if text_input: conv.append_message(conv.roles[0], text_input) else: conv.append_message(conv.roles[0], "