marcosremar2 committed
Commit b9d0632
1 Parent(s): 87f6bd6
Files changed (2)
  1. app.py +163 -232
  2. requirements.txt +1 -0
app.py CHANGED
@@ -1,250 +1,181 @@
- import os
- import sys
  import gradio as gr
- import whisper
- from huggingface_hub import snapshot_download
  import torch
- import subprocess
- import transformers
-
- # --- Aggressively update/install transformers and huggingface_hub BEFORE importing them ---
- print('Attempting to upgrade pip, transformers, and huggingface_hub...')
  try:
- print('Upgrading pip...')
- subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-U', 'pip'])
- print('Upgrading transformers and huggingface_hub...')
- subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-U', 'transformers', 'huggingface_hub'])
- print('Attempting to install transformers from main branch for latest features...')
- subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'git+https://github.com/huggingface/transformers.git'])
- print('Pip, Transformers, and huggingface_hub update/install process completed.')
- except subprocess.CalledProcessError as e:
- print(f'ERROR: Failed to upgrade/install packages: {e}')
- print('Continuing with potentially older versions. This might lead to model loading issues.')
  except Exception as e:
- print(f'An unexpected error occurred during package upgrades: {e}')
-
- # --- Now, import from transformers ---
  try:
- from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
- print('Successfully imported AutoModelForCausalLM, AutoTokenizer, AutoConfig from transformers.')
- except ImportError as e:
- print(f'CRITICAL ERROR: Failed to import from transformers after attempting upgrades: {e}')
- print('The application might not work correctly. Please check the environment and dependencies.')
- # As a last resort, define dummy classes if import fails, so the rest of the script doesn't crash immediately
- class AutoModelForCausalLM: pass
- class AutoTokenizer: pass
- class AutoConfig: pass
  except Exception as e:
- print(f'An unexpected error occurred during transformers import: {e}')
-
- # --- Configuration ---
- WHISPER_MODEL_SIZE = 'small' # Using smallest model for faster processing in testing
- SPEECH_ENCODER_PATH = 'models/speech_encoder'
- MODEL_NAME = 'LLaMA-Omni2-0.5B'
- MODEL_PATH = f'models/{MODEL_NAME}'
- HF_REPO = f'ICTNLP/{MODEL_NAME}'
-
- # --- Print diagnostics ---
- print('===== Application Startup =====')
- print('Python:', sys.version)
- print('Torch version:', torch.__version__)
- print(f'CUDA available: {torch.cuda.is_available()}')
- if torch.cuda.is_available():
- print(f'CUDA device: {torch.cuda.get_device_name(0)}')
- print(f'CUDA memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB')
-
- # --- Main models ---
- whisper_model = None
- llama_model = None
- tokenizer = None
-
- def load_whisper_model():
- '''Load Whisper model for speech recognition'''
- global whisper_model
- print(f'Loading Whisper {WHISPER_MODEL_SIZE} model...')
-
- # Create directory if it doesn't exist
- os.makedirs(SPEECH_ENCODER_PATH, exist_ok=True)
-
- # Load the model (will download if not present)
- whisper_model = whisper.load_model(WHISPER_MODEL_SIZE, download_root=SPEECH_ENCODER_PATH)
- print(f'Whisper {WHISPER_MODEL_SIZE} model loaded successfully!')
- return whisper_model
-
- def load_llama_model():
- '''Load LLaMA-Omni2 model'''
- global llama_model, tokenizer
- print(f'Attempting to load LLaMA-Omni2 model: {HF_REPO}')
-
- # Ensure local model directory exists for downloads
- os.makedirs(MODEL_PATH, exist_ok=True)
-
- # Download model files if they aren't already present locally
- # Check for a common file like config.json to decide if download is needed
- if not os.path.exists(os.path.join(MODEL_PATH, 'config.json')):
- print(f'Local model files not found. Downloading from Hugging Face Hub: {HF_REPO} to {MODEL_PATH}')
- try:
- snapshot_download(
- repo_id=HF_REPO,
- local_dir=MODEL_PATH,
- local_dir_use_symlinks=False,
- resume_download=True,
- )
- print('Model download complete.')
- except Exception as e:
- print(f'ERROR during model download: {e}')
- pass # Allow to proceed to loading attempt, which will then fail more descriptively
-
  try:
- device = 'cuda' if torch.cuda.is_available() else 'cpu'
- torch_dtype = torch.float16 if device == 'cuda' else torch.float32
- print(f'Target device: {device}, dtype: {torch_dtype}')
-
- print(f'Attempt 1: Loading tokenizer and model directly from Hub identifier: {HF_REPO} with trust_remote_code=True')
- try:
- tokenizer = AutoTokenizer.from_pretrained(
- HF_REPO,
- trust_remote_code=True
- )
- print('Tokenizer loaded successfully from Hub identifier.')
-
- config = AutoConfig.from_pretrained(
- HF_REPO,
- trust_remote_code=True
- )
- print('Config loaded successfully from Hub identifier.')
-
- llama_model = AutoModelForCausalLM.from_pretrained(
- HF_REPO,
- config=config, # Pass the loaded config
- torch_dtype=torch_dtype,
- device_map=device, # device_map handles moving parts of the model to CPU if OOM on GPU
- trust_remote_code=True
- )
- print(f'LLaMA-Omni2 model loaded successfully directly from Hub: {HF_REPO}')
- return llama_model
- except Exception as e1:
- print(f'Error in Attempt 1 (direct Hub load for {HF_REPO}): {e1}')
- print('This often means the model requires a specific transformers version or has complex remote code.')
-
- print(f'Attempt 2: Loading tokenizer and model from local path: {MODEL_PATH} with trust_remote_code=True (fallback)')
- try:
- tokenizer = AutoTokenizer.from_pretrained(
- MODEL_PATH, # Fallback to local path
- trust_remote_code=True
- )
- print('Tokenizer loaded successfully from local path.')
-
- config = AutoConfig.from_pretrained(
- MODEL_PATH,
- trust_remote_code=True
- )
- print('Config loaded successfully from local path.')
-
- llama_model = AutoModelForCausalLM.from_pretrained(
- MODEL_PATH, # Fallback to local path
- config=config,
- torch_dtype=torch_dtype,
- device_map=device,
- trust_remote_code=True
- )
- print(f'LLaMA-Omni2 model loaded successfully from local path: {MODEL_PATH}')
- return llama_model
- except Exception as e2:
- print(f'Error in Attempt 2 (local path load for {MODEL_PATH}): {e2}')
-
- print('All attempts to load the LLaMA-Omni2 model failed.')
- raise RuntimeError('Failed to load LLaMA-Omni2 model after multiple attempts.')
-
- except Exception as e_outer:
- print(f'CRITICAL ERROR loading LLaMA-Omni2 model: {e_outer}')
- print('Falling back: Text generation will not be available.')
- llama_model = None # Ensure llama_model is None if loading fails
- tokenizer = None # Ensure tokenizer is None
- return None
-
- def transcribe_audio(audio_path):
- '''Transcribe audio using Whisper'''
- global whisper_model
-
- if whisper_model is None:
- whisper_model = load_whisper_model()
-
- try:
- result = whisper_model.transcribe(audio_path)
- return result['text']
  except Exception as e:
- return f'Error transcribing audio: {e}'
-
- def generate_text(input_text):
- '''Generate text using LLaMA-Omni2'''
- global llama_model, tokenizer
-
- if llama_model is None or tokenizer is None:
- load_llama_model()
-
  try:
- # If model loading failed, just return a placeholder response
- if llama_model is None:
- return f'Model could not be loaded. Input was: {input_text}'
-
- device = next(llama_model.parameters()).device
- inputs = tokenizer(input_text, return_tensors='pt').to(device)
-
- outputs = llama_model.generate(
- inputs.input_ids,
- max_length=100,
- num_return_sequences=1,
- do_sample=True,
- temperature=0.7,
- )
-
- return tokenizer.decode(outputs[0], skip_special_tokens=True)
  except Exception as e:
- return f'Error generating text: {e}'
-
- def speech_to_text_to_speech(audio_path):
- '''Pipeline: Speech -> Text -> Response'''
- # First transcribe the audio
- transcription = transcribe_audio(audio_path)
-
- # Then generate a response
- response = generate_text(transcription)
-
- return transcription, response
-
- # --- Gradio Interface for Hugging Face Spaces ---
- def create_demo():
- with gr.Blocks(title='LLaMA-Omni2 Demo on Hugging Face Spaces') as demo:
- gr.Markdown('# LLaMA-Omni2 Demo')
- gr.Markdown('This demo uses the smallest Whisper model and LLaMA-Omni2-0.5B for testing purposes.')
-
- with gr.Tab('Text Generation'):
- with gr.Row():
- text_input = gr.Textbox(label='Input Text', placeholder='Enter text here...')
- text_output = gr.Textbox(label='Generated Response')
-
- text_button = gr.Button('Generate Response')
- text_button.click(generate_text, inputs=text_input, outputs=text_output)
-
- with gr.Tab('Speech-to-Text'):
- audio_input = gr.Audio(type='filepath', label='Upload or Record Audio')
- transcription_output = gr.Textbox(label='Transcription')
- response_output = gr.Textbox(label='Generated Response')
-
- transcribe_button = gr.Button('Transcribe and Respond')
- transcribe_button.click(speech_to_text_to_speech,
- inputs=audio_input,
- outputs=[transcription_output, response_output])
-
- gr.Markdown('### Note: The first run will download models if needed, which may take some time.')
-
- return demo
-
- # --- Main entry point ---
- if __name__ == '__main__':
- print('Starting LLaMA-Omni2 Interface for Hugging Face Spaces...')
-
- # Create and launch the Gradio interface
- demo = create_demo()
- demo.launch(server_name='0.0.0.0', server_port=7860, share=True) # share=True for Hugging Face Spaces
  import gradio as gr
  import torch
+ from transformers import pipeline
+ import os # os is imported but not used. Consider removing if not needed.
+
+ # --- Model Configuration ---
+ whisper_model_id = "openai/whisper-tiny"
+ # Using gpt2 as a placeholder due to LLaMA-Omni2-0.5B's complex setup needs.
+ # LLaMA-Omni2-0.5B (ICTNLP/LLaMA-Omni2-0.5B) is a speech-language model
+ # requiring specific dependencies (e.g., CosyVoice) and often its own serving infrastructure.
+ # It's not typically loaded via a simple transformers.pipeline for text generation alone.
+ text_generation_model_id = "gpt2"
+
+ # --- Device Configuration ---
+ if torch.cuda.is_available():
+ device_for_pipelines = 0 # Use the first GPU for Hugging Face pipelines
+ torch_device = "cuda:0" # PyTorch device string
+ # For models that support it and where precision is not critical, float16 can save memory/speed up.
+ # However, Whisper models are often more robust with float32 for pipeline usage unless memory is very constrained.
+ # GPT-2 also generally runs fine on float32 and doesn't strictly need float16 for basic use.
+ dtype_for_pipelines = torch.float16 # or torch.float32 depending on model/GPU
+ else:
+ device_for_pipelines = -1 # Use CPU for Hugging Face pipelines
+ torch_device = "cpu"
+ dtype_for_pipelines = torch.float32
+
+ print(f"Using device: {torch_device} for model loading.")
+ print(f"Pipelines will use device_id: {device_for_pipelines} and dtype: {dtype_for_pipelines}")
+
+ # --- Load Speech-to-Text (ASR) Pipeline ---
+ asr_pipeline_instance = None
  try:
+ print(f"Loading ASR model: {whisper_model_id}...")
+ asr_pipeline_instance = pipeline(
+ "automatic-speech-recognition",
+ model=whisper_model_id,
+ torch_dtype=dtype_for_pipelines, # Using specified dtype
+ device=device_for_pipelines
+ )
+ print(f"ASR model ({whisper_model_id}) loaded successfully.")
  except Exception as e:
+ print(f"Error loading ASR model ({whisper_model_id}): {e}")
+ asr_pipeline_instance = None # Explicitly set to None on failure
+
+ # --- Load Text Generation Pipeline ---
+ text_gen_pipeline_instance = None
  try:
+ print(f"Loading text generation model: {text_generation_model_id}...")
+ text_gen_pipeline_instance = pipeline(
+ "text-generation",
+ model=text_generation_model_id,
+ torch_dtype=dtype_for_pipelines, # Using specified dtype
+ device=device_for_pipelines
+ )
+ print(f"Text generation model ({text_generation_model_id}) loaded successfully.")
  except Exception as e:
+ print(f"Error loading text generation model ({text_generation_model_id}): {e}")
+ text_gen_pipeline_instance = None # Explicitly set to None on failure
+
+ # --- Core Functions ---
+ def transcribe_audio_input(audio_filepath):
+ if not asr_pipeline_instance:
+ return "ASR model not available. Please check startup logs.", ""
+ if audio_filepath is None:
+ return "No audio file provided for transcription.", ""
  try:
+ print(f"Transcribing: {audio_filepath}")
+ # Add chunk_length_s for handling longer audio files robustly
+ result = asr_pipeline_instance(audio_filepath, chunk_length_s=30)
+ transcribed_text = result["text"]
+ print(f"Transcription: '{transcribed_text}'")
+ return transcribed_text, transcribed_text # Return for UI and next step
  except Exception as e:
+ print(f"Transcription error: {e}")
+ return f"Error during transcription: {str(e)}", ""
+
+ def generate_text_response(prompt_text):
+ if not text_gen_pipeline_instance:
+ return f"Text generation model ({text_generation_model_id}) not available. Check logs."
+ if not prompt_text or not prompt_text.strip():
+ return "Prompt is empty. Please provide text for generation."
  try:
+ print(f"Generating response for prompt (first 100 chars): '{prompt_text[:100]}...'")
+ # max_new_tokens is generally preferred over max_length for more control
+ generated_outputs = text_gen_pipeline_instance(prompt_text, max_new_tokens=100, num_return_sequences=1)
+ response_text = generated_outputs[0]["generated_text"]
+ print(f"Generated response: '{response_text}'")
+ return response_text
  except Exception as e:
+ print(f"Text generation error: {e}")
+ return f"Error during text generation: {str(e)}"
+
+ def combined_pipeline_process(audio_filepath):
+ if audio_filepath is None:
+ return "No audio input.", "No audio input."
+
+ transcribed_text, _ = transcribe_audio_input(audio_filepath)
+
+ if not asr_pipeline_instance or "Error during transcription" in transcribed_text or not transcribed_text.strip():
+ error_msg_for_generation = "Cannot generate response: Transcription failed or was empty."
+ if not asr_pipeline_instance:
+ error_msg_for_generation = "Cannot generate response: ASR model not loaded."
+ return transcribed_text, error_msg_for_generation
+
+ if not text_gen_pipeline_instance:
+ return transcribed_text, f"Cannot generate response: Text generation model ({text_generation_model_id}) not loaded."
+
+ final_response = generate_text_response(transcribed_text)
+ return transcribed_text, final_response
+
+ # --- Gradio Interface Definition ---
+ with gr.Blocks(theme=gr.themes.Soft(), title="Audio to Text Generation") as app_interface:
+ gr.Markdown(
+ """
+ # Speech-to-Text and Text Generation Demo
+
+ This application uses **OpenAI Whisper Tiny** for speech recognition and **GPT-2** (as a stand-in for more complex models like LLaMA-Omni2) for text generation.
+ You can upload an audio file, have it transcribed, and then use that transcription as a prompt to generate further text.
+
+ **Note on LLaMA-Omni2-0.5B:** The `ICTNLP/LLaMA-Omni2-0.5B` model is a sophisticated speech-language model designed for real-time spoken chat, generating both text and speech. It requires a specific setup environment (including its own speech synthesis like CosyVoice and potentially a dedicated serving mechanism). It's not plug-and-play with a simple `transformers.pipeline` in the same way standard ASR or text-only LLMs are. Therefore, GPT-2 is used here to demonstrate the Gradio app structure.
+ """
+ )
+
+ with gr.Tab("Full Pipeline: Audio -> Transcription -> Generation"):
+ gr.Markdown("### Step 1: Upload Audio -> Step 2: Transcribe -> Step 3: Generate Text")
+ input_audio_pipeline = gr.Audio(type="filepath", label="Upload Your Audio File (.wav, .mp3)")
+ submit_button_full = gr.Button("Run Full Process", variant="primary")
+ output_transcription_pipeline = gr.Textbox(label="Transcribed Text (from Whisper)", lines=5)
+ output_generation_pipeline = gr.Textbox(label=f"Generated Text (from {text_generation_model_id})", lines=7)
+
+ submit_button_full.click(
+ fn=combined_pipeline_process,
+ inputs=[input_audio_pipeline],
+ outputs=[output_transcription_pipeline, output_generation_pipeline]
+ )
+
+ with gr.Tab("Test Speech-to-Text (Whisper Tiny)"):
+ gr.Markdown("### Transcribe audio to text using Whisper Tiny.")
+ input_audio_asr = gr.Audio(type="filepath", label="Upload Audio for ASR")
+ submit_button_asr = gr.Button("Transcribe Audio", variant="secondary")
+ output_transcription_asr = gr.Textbox(label="Transcription Result", lines=10)
+
+ def asr_only_ui(audio_file):
+ if audio_file is None: return "Please upload an audio file."
+ # The transcribe_audio_input returns two values; we only need the first for this UI.
+ transcription, _ = transcribe_audio_input(audio_file)
+ return transcription
+
+ submit_button_asr.click(
+ fn=asr_only_ui,
+ inputs=[input_audio_asr],
+ outputs=[output_transcription_asr]
+ )
+
+ with gr.Tab(f"Test Text Generation ({text_generation_model_id})"):
+ gr.Markdown(f"### Generate text from a prompt using {text_generation_model_id}.")
+ input_text_prompt_gen = gr.Textbox(label="Your Text Prompt", placeholder="Enter text here...", lines=5)
+ submit_button_gen = gr.Button("Generate Text", variant="secondary")
+ output_generation_gen = gr.Textbox(label="Generated Text Result", lines=10)
+
+ submit_button_gen.click(
+ fn=generate_text_response,
+ inputs=[input_text_prompt_gen],
+ outputs=[output_generation_gen]
+ )
+
+ gr.Markdown("--- ")
+ gr.Markdown("### Model Loading Status (at application start):")
+ asr_load_status = "Successfully Loaded" if asr_pipeline_instance else "Failed to Load (check console logs)"
+ text_gen_load_status = "Successfully Loaded" if text_gen_pipeline_instance else "Failed to Load (check console logs)"
+ gr.Markdown(f"* **Whisper Model ({whisper_model_id}):** `{asr_load_status}`")
+ gr.Markdown(f"* **Text Generation Model ({text_generation_model_id}):** `{text_gen_load_status}`")
+
+ # --- Launch the Gradio App ---
+ if __name__ == "__main__":
+ print("Attempting to launch Gradio application...")
+ # share=True is good for Hugging Face Spaces. For local, it's optional.
+ # For persistent public link when running locally (requires internet & can have security implications):
+ # app_interface.launch(share=True)
+ app_interface.launch()
+ print("Gradio application launched. Check your browser or console for the URL.")
requirements.txt CHANGED
@@ -16,5 +16,6 @@ shortuuid
  pydub
  ffmpeg-python
  huggingface_hub # For downloading models from HF Hub
+ soundfile # To handle audio files if not using gr.Audio input directly for some reason

  # fairseq and flash-attn are removed, expected to be handled by LLaMA-Omni2's setup via `pip install -e .` in Dockerfile
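
The new `soundfile` entry only comes into play if audio is decoded outside `gr.Audio`, as its inline comment notes. A minimal sketch of that path, assuming a hypothetical local `sample.wav` and the same Whisper model id as app.py:

```python
# Sketch: feeding soundfile-decoded audio to a Whisper pipeline instead of a file path.
import soundfile as sf
from transformers import pipeline

asr = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")

data, samplerate = sf.read("sample.wav")  # numpy waveform plus sample rate
if data.ndim > 1:                         # downmix stereo to mono before transcription
    data = data.mean(axis=1)

print(asr({"raw": data, "sampling_rate": samplerate})["text"])
```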