marcosremar2 committed
Commit: d478b16
Parent(s): b9d0632
Files changed (3):
  1. __pycache__/app.cpython-313.pyc (+0 -0)
  2. app.py (+195 -51)
  3. requirements.txt (+2 -0)
__pycache__/app.cpython-313.pyc ADDED
Binary file (14 kB).
 
app.py CHANGED
@@ -1,26 +1,48 @@
import gradio as gr
import torch
- from transformers import pipeline
- import os # os is imported but not used. Consider removing if not needed.
+ from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
+ import os
+ import warnings
+ import importlib
+ import sys
+ import subprocess
+
+ # Check if we can import LLaMA-Omni2's modules
+ try_native_modules = True
+ native_llama_omni_available = False
+ native_modules_error = None
+
+ if try_native_modules:
+     try:
+         # Try importing LLaMA-Omni2 specific modules using subprocess to avoid crashing if imports fail
+         print("Checking for LLaMA-Omni2 native modules...")
+         module_check_result = subprocess.run(
+             [sys.executable, "-c", "import llama_omni2; print('LLaMA-Omni2 modules found!')"],
+             capture_output=True,
+             text=True
+         )
+         if "LLaMA-Omni2 modules found!" in module_check_result.stdout:
+             print("LLaMA-Omni2 native modules are available!")
+             native_llama_omni_available = True
+         else:
+             print(f"LLaMA-Omni2 native modules not found: {module_check_result.stderr}")
+             native_modules_error = module_check_result.stderr
+     except Exception as e:
+         print(f"Error checking for LLaMA-Omni2 native modules: {e}")
+         native_modules_error = str(e)

# --- Model Configuration ---
whisper_model_id = "openai/whisper-tiny"
- # Using gpt2 as a placeholder due to LLaMA-Omni2-0.5B's complex setup needs.
- # LLaMA-Omni2-0.5B (ICTNLP/LLaMA-Omni2-0.5B) is a speech-language model
- # requiring specific dependencies (e.g., CosyVoice) and often its own serving infrastructure.
- # It's not typically loaded via a simple transformers.pipeline for text generation alone.
- text_generation_model_id = "gpt2"
+ llama_omni_model_id = "ICTNLP/LLaMA-Omni2-0.5B" # Primary model we'll try to load
+ fallback_model_id = "gpt2" # Fallback if LLaMA-Omni2 fails to load

# --- Device Configuration ---
if torch.cuda.is_available():
    device_for_pipelines = 0 # Use the first GPU for Hugging Face pipelines
    torch_device = "cuda:0" # PyTorch device string
-     # For models that support it and where precision is not critical, float16 can save memory/speed up.
-     # However, Whisper models are often more robust with float32 for pipeline usage unless memory is very constrained.
-     # GPT-2 also generally runs fine on float32 and doesn't strictly need float16 for basic use.
-     dtype_for_pipelines = torch.float16 # or torch.float32 depending on model/GPU
+     dtype_for_pipelines = torch.float16
else:
-     device_for_pipelines = -1 # Use CPU for Hugging Face pipelines
+     device_for_pipelines = -1 # Use CPU for Hugging Face pipelines
    torch_device = "cpu"
    dtype_for_pipelines = torch.float32

@@ -34,28 +56,78 @@ try:
    asr_pipeline_instance = pipeline(
        "automatic-speech-recognition",
        model=whisper_model_id,
-         torch_dtype=dtype_for_pipelines, # Using specified dtype
+         torch_dtype=dtype_for_pipelines,
        device=device_for_pipelines
    )
    print(f"ASR model ({whisper_model_id}) loaded successfully.")
except Exception as e:
    print(f"Error loading ASR model ({whisper_model_id}): {e}")
-     asr_pipeline_instance = None # Explicitly set to None on failure
+     asr_pipeline_instance = None

- # --- Load Text Generation Pipeline ---
+ # --- Load Text Generation Model ---
text_gen_pipeline_instance = None
- try:
-     print(f"Loading text generation model: {text_generation_model_id}...")
-     text_gen_pipeline_instance = pipeline(
-         "text-generation",
-         model=text_generation_model_id,
-         torch_dtype=dtype_for_pipelines, # Using specified dtype
-         device=device_for_pipelines
-     )
-     print(f"Text generation model ({text_generation_model_id}) loaded successfully.")
- except Exception as e:
-     print(f"Error loading text generation model ({text_generation_model_id}): {e}")
-     text_gen_pipeline_instance = None # Explicitly set to None on failure
+ text_generation_model_id = None # Will be set to the model that successfully loads
+ llama_omni_native_module = None # Will hold the native LLaMA-Omni2 module if loaded
+
+ # Try native LLaMA-Omni2 module first if available
+ if native_llama_omni_available:
+     try:
+         print("Attempting to load LLaMA-Omni2 using native modules...")
+         # Import the required modules
+         import llama_omni2
+         from llama_omni2.model import Model as LLamaOmniModel
+
+         # Load the model
+         llama_omni_native_module = LLamaOmniModel.from_pretrained(llama_omni_model_id)
+         text_generation_model_id = llama_omni_model_id
+         print(f"LLaMA-Omni2 native module loaded successfully: {type(llama_omni_native_module)}")
+     except Exception as e:
+         print(f"Error loading native LLaMA-Omni2 module: {e}")
+         llama_omni_native_module = None
+
+ # If native module failed, try loading using transformers
+ if llama_omni_native_module is None and text_generation_model_id is None:
+     try:
+         print(f"Attempting to load LLaMA-Omni2 using transformers: {llama_omni_model_id}...")
+         # LLaMA models often require specific loading configurations
+         tokenizer = AutoTokenizer.from_pretrained(llama_omni_model_id, trust_remote_code=True)
+         model = AutoModelForCausalLM.from_pretrained(
+             llama_omni_model_id,
+             torch_dtype=dtype_for_pipelines,
+             trust_remote_code=True,
+             device_map="auto" if torch.cuda.is_available() else None
+         )
+
+         text_gen_pipeline_instance = pipeline(
+             "text-generation",
+             model=model,
+             tokenizer=tokenizer,
+             torch_dtype=dtype_for_pipelines,
+             device=device_for_pipelines if not torch.cuda.is_available() else None
+         )
+         text_generation_model_id = llama_omni_model_id
+         print(f"LLaMA-Omni2 model ({llama_omni_model_id}) loaded successfully via transformers.")
+
+     except Exception as e:
+         warnings.warn(f"Error loading LLaMA-Omni2 model: {e}\nFalling back to {fallback_model_id}")
+         print(f"Error loading LLaMA-Omni2 model via transformers: {e}")
+         print(f"Falling back to {fallback_model_id}")
+
+ # Fall back to GPT-2 if LLaMA-Omni2 fails to load both ways
+ if text_generation_model_id is None:
+     try:
+         print(f"Loading fallback text generation model: {fallback_model_id}...")
+         text_gen_pipeline_instance = pipeline(
+             "text-generation",
+             model=fallback_model_id,
+             torch_dtype=dtype_for_pipelines,
+             device=device_for_pipelines
+         )
+         text_generation_model_id = fallback_model_id
+         print(f"Fallback model ({fallback_model_id}) loaded successfully.")
+     except Exception as e:
+         print(f"Error loading fallback model ({fallback_model_id}): {e}")
+         text_gen_pipeline_instance = None

# --- Core Functions ---
def transcribe_audio_input(audio_filepath):
@@ -65,24 +137,56 @@ def transcribe_audio_input(audio_filepath):
        return "No audio file provided for transcription.", ""
    try:
        print(f"Transcribing: {audio_filepath}")
-         # Add chunk_length_s for handling longer audio files robustly
        result = asr_pipeline_instance(audio_filepath, chunk_length_s=30)
        transcribed_text = result["text"]
        print(f"Transcription: '{transcribed_text}'")
-         return transcribed_text, transcribed_text # Return for UI and next step
+         return transcribed_text, transcribed_text
    except Exception as e:
        print(f"Transcription error: {e}")
        return f"Error during transcription: {str(e)}", ""

def generate_text_response(prompt_text):
+     # If we have a native LLaMA-Omni2 module, use it
+     if llama_omni_native_module is not None:
+         if not prompt_text or not prompt_text.strip():
+             return "Prompt is empty. Please provide text for generation."
+         try:
+             print(f"Generating response with native LLaMA-Omni2 for prompt: '{prompt_text[:100]}...'")
+             # Using the native module's interface for text generation
+             response = llama_omni_native_module.generate(prompt_text, max_length=150)
+             print(f"Generated response: '{response}'")
+             return response
+         except Exception as e:
+             print(f"Error using native LLaMA-Omni2 generation: {e}")
+             return f"Error during native LLaMA-Omni2 text generation: {str(e)}"
+
+     # Otherwise use the transformers pipeline
    if not text_gen_pipeline_instance:
-         return f"Text generation model ({text_generation_model_id}) not available. Check logs."
+         return f"Text generation model not available. Check logs."
    if not prompt_text or not prompt_text.strip():
        return "Prompt is empty. Please provide text for generation."
    try:
        print(f"Generating response for prompt (first 100 chars): '{prompt_text[:100]}...'")
-         # max_new_tokens is generally preferred over max_length for more control
-         generated_outputs = text_gen_pipeline_instance(prompt_text, max_new_tokens=100, num_return_sequences=1)
+
+         # Different generation parameters based on model
+         if text_generation_model_id == llama_omni_model_id:
+             # Parameters optimized for LLaMA-Omni2
+             generated_outputs = text_gen_pipeline_instance(
+                 prompt_text,
+                 max_new_tokens=150,
+                 do_sample=True,
+                 temperature=0.7,
+                 top_p=0.9,
+                 num_return_sequences=1
+             )
+         else:
+             # Parameters for fallback model
+             generated_outputs = text_gen_pipeline_instance(
+                 prompt_text,
+                 max_new_tokens=100,
+                 num_return_sequences=1
+             )
+
        response_text = generated_outputs[0]["generated_text"]
        print(f"Generated response: '{response_text}'")
        return response_text
@@ -102,22 +206,38 @@ def combined_pipeline_process(audio_filepath):
        error_msg_for_generation = "Cannot generate response: ASR model not loaded."
        return transcribed_text, error_msg_for_generation

-     if not text_gen_pipeline_instance:
-         return transcribed_text, f"Cannot generate response: Text generation model ({text_generation_model_id}) not loaded."
+     if not text_gen_pipeline_instance and llama_omni_native_module is None:
+         return transcribed_text, f"Cannot generate response: No text generation model available."

    final_response = generate_text_response(transcribed_text)
    return transcribed_text, final_response

+ # Determine model status for UI
+ if llama_omni_native_module is not None:
+     llama_model_status = "Native LLaMA-Omni2 module loaded successfully"
+     using_model = "LLaMA-Omni2-0.5B (native modules)"
+ elif text_generation_model_id == llama_omni_model_id:
+     llama_model_status = "LLaMA-Omni2 loaded via transformers"
+     using_model = "LLaMA-Omni2-0.5B (via transformers)"
+ elif text_generation_model_id == fallback_model_id:
+     llama_model_status = "Failed to load - Using GPT-2 as fallback"
+     using_model = "GPT-2 (fallback model)"
+ else:
+     llama_model_status = "Failed to load any text generation model"
+     using_model = "No model available"
+
# --- Gradio Interface Definition ---
- with gr.Blocks(theme=gr.themes.Soft(), title="Audio to Text Generation") as app_interface:
+ with gr.Blocks(theme=gr.themes.Soft(), title="Whisper + LLaMA-Omni2 Demo") as app_interface:
    gr.Markdown(
-         """
+         f"""
        # Speech-to-Text and Text Generation Demo

-         This application uses **OpenAI Whisper Tiny** for speech recognition and **GPT-2** (as a stand-in for more complex models like LLaMA-Omni2) for text generation.
-         You can upload an audio file, have it transcribed, and then use that transcription as a prompt to generate further text.
+         This application uses **OpenAI Whisper Tiny** for speech recognition and attempts to use **LLaMA-Omni2-0.5B** for text generation.
+         If LLaMA-Omni2 cannot be loaded, it falls back to GPT-2.
+
+         **Currently using:** {using_model}

-         **Note on LLaMA-Omni2-0.5B:** The `ICTNLP/LLaMA-Omni2-0.5B` model is a sophisticated speech-language model designed for real-time spoken chat, generating both text and speech. It requires a specific setup environment (including its own speech synthesis like CosyVoice and potentially a dedicated serving mechanism). It's not plug-and-play with a simple `transformers.pipeline` in the same way standard ASR or text-only LLMs are. Therefore, GPT-2 is used here to demonstrate the Gradio app structure.
+         Upload an audio file to transcribe it. The transcribed text will then be used as a prompt for the text generation model.
        """
    )

@@ -126,7 +246,8 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Audio to Text Generation") as app_
        input_audio_pipeline = gr.Audio(type="filepath", label="Upload Your Audio File (.wav, .mp3)")
        submit_button_full = gr.Button("Run Full Process", variant="primary")
        output_transcription_pipeline = gr.Textbox(label="Transcribed Text (from Whisper)", lines=5)
-         output_generation_pipeline = gr.Textbox(label=f"Generated Text (from {text_generation_model_id})", lines=7)
+         model_label = f"Generated Text (from {using_model})"
+         output_generation_pipeline = gr.Textbox(label=model_label, lines=7)

        submit_button_full.click(
            fn=combined_pipeline_process,
@@ -142,7 +263,6 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Audio to Text Generation") as app_

        def asr_only_ui(audio_file):
            if audio_file is None: return "Please upload an audio file."
-             # The transcribe_audio_input returns two values; we only need the first for this UI.
            transcription, _ = transcribe_audio_input(audio_file)
            return transcription

@@ -152,8 +272,9 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Audio to Text Generation") as app_
            outputs=[output_transcription_asr]
        )

-     with gr.Tab(f"Test Text Generation ({text_generation_model_id})"):
-         gr.Markdown(f"### Generate text from a prompt using {text_generation_model_id}.")
+     with gr.Tab(f"Test Text Generation"):
+         model_name_gen = using_model
+         gr.Markdown(f"### Generate text from a prompt using {model_name_gen}.")
        input_text_prompt_gen = gr.Textbox(label="Your Text Prompt", placeholder="Enter text here...", lines=5)
        submit_button_gen = gr.Button("Generate Text", variant="secondary")
        output_generation_gen = gr.Textbox(label="Generated Text Result", lines=10)
@@ -167,15 +288,38 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Audio to Text Generation") as app_
    gr.Markdown("--- ")
    gr.Markdown("### Model Loading Status (at application start):")
    asr_load_status = "Successfully Loaded" if asr_pipeline_instance else "Failed to Load (check console logs)"
-     text_gen_load_status = "Successfully Loaded" if text_gen_pipeline_instance else "Failed to Load (check console logs)"
+
    gr.Markdown(f"* **Whisper Model ({whisper_model_id}):** `{asr_load_status}`")
-     gr.Markdown(f"* **Text Generation Model ({text_generation_model_id}):** `{text_gen_load_status}`")
+     gr.Markdown(f"* **LLaMA-Omni2 Model ({llama_omni_model_id}):** `{llama_model_status}`")
+
+     if native_llama_omni_available:
+         gr.Markdown("* **LLaMA-Omni2 Native Modules:** `Available`")
+     else:
+         native_error = f": {native_modules_error}" if native_modules_error else ""
+         gr.Markdown(f"* **LLaMA-Omni2 Native Modules:** `Not Available{native_error}`")
+
+     if using_model.startswith("GPT-2"):
+         gr.Markdown(
+             """
+             **Note about LLaMA-Omni2-0.5B:** This model has complex dependencies and requires a specific setup environment.
+             The system attempted to load it but fell back to GPT-2. For full functionality with LLaMA-Omni2, you should:
+
+             1. Clone the [LLaMA-Omni2 repository](https://github.com/ictnlp/LLaMA-Omni2)
+             2. Install the required dependencies including CosyVoice 2
+             3. Download the Whisper-large-v3 model and flow-matching model and vocoder of CosyVoice 2
+             4. Set up the controller, model worker, and web server as described in the repository
+
+             Note that LLaMA-Omni2 is designed for generating both text and speech responses simultaneously.
+             For the full experience with speech synthesis, you need the complete setup.
+             """
+         )

# --- Launch the Gradio App ---
if __name__ == "__main__":
-     print("Attempting to launch Gradio application...")
-     # share=True is good for Hugging Face Spaces. For local, it's optional.
-     # For persistent public link when running locally (requires internet & can have security implications):
-     # app_interface.launch(share=True)
-     app_interface.launch()
-     print("Gradio application launched. Check your browser or console for the URL.")
+     print("Launching Gradio demo...")
+     try:
+         app_interface.launch(share=True)
+     except Exception as e:
+         print(f"Error launching with share=True: {e}")
+         print("Trying to launch without sharing...")
+         app_interface.launch()
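
The loading logic added above boils down to a three-step fallback: try the native llama_omni2 package, then the ICTNLP/LLaMA-Omni2-0.5B checkpoint through transformers, and finally gpt2. A condensed sketch of that ordering, assuming only that transformers and torch are installed; the llama_omni2 package and its Model.from_pretrained interface are the commit's own assumption and may not be importable in a given environment:

import torch
from transformers import pipeline

def load_text_generator(primary_id="ICTNLP/LLaMA-Omni2-0.5B", fallback_id="gpt2"):
    # 1) Native package, if it happens to be installed (assumed API, per the commit).
    try:
        from llama_omni2.model import Model as LLamaOmniModel  # optional dependency
        return "native", LLamaOmniModel.from_pretrained(primary_id)
    except Exception as exc:
        print(f"Native llama_omni2 load failed: {exc}")
    # 2) Hub checkpoint through transformers, allowing custom modeling code.
    try:
        gen = pipeline(
            "text-generation",
            model=primary_id,
            trust_remote_code=True,
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        )
        return "transformers", gen
    except Exception as exc:
        print(f"transformers load failed: {exc}")
    # 3) Small, known-good fallback model.
    return "fallback", pipeline("text-generation", model=fallback_id)

The UI strings (llama_model_status, using_model) are derived from which of these three branches succeeded, which is why the status block at the bottom of the app can report the active backend.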
requirements.txt CHANGED
@@ -17,5 +17,7 @@ pydub
ffmpeg-python
huggingface_hub # For downloading models from HF Hub
soundfile # To handle audio files if not using gr.Audio input directly for some reason
+ safetensors
+ ai2-olmo # In case LLaMA-Omni2 uses olmo under the hood for the LLM part

# fairseq and flash-attn are removed, expected to be handled by LLaMA-Omni2's setup via `pip install -e .` in Dockerfile
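
The two new entries are speculative (ai2-olmo is added only "in case" it is needed), so it can be useful to check at startup which of them actually resolves before relying on it. A minimal sketch using only the standard library; the distribution-to-import-name mapping is an assumption for illustration (ai2-olmo is assumed here to import as olmo, which may not match the installed package):

import importlib.util

# Distribution name -> import name (assumed mapping, for illustration only).
OPTIONAL_DEPS = {
    "safetensors": "safetensors",
    "ai2-olmo": "olmo",
}

for dist, module_name in OPTIONAL_DEPS.items():
    found = importlib.util.find_spec(module_name) is not None
    print(f"{dist}: {'available' if found else 'missing'}")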