jblast94 commited on
Commit
e638507
·
verified ·
1 Parent(s): 67a2f34

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +47 -47
app.py CHANGED
@@ -1,30 +1,27 @@
1
  import gradio as gr
2
  import os
3
- import requests # Used for making API calls to your Chatterbox endpoint
4
- from transformers import AutoModel, AutoTokenizer
5
 
6
  # --- Model Loading ---
7
- # This loads the Gemma model you specified. Note that GGUF files are typically
8
- # optimized for CPU inference with libraries like llama-cpp-python, but we'll
9
- # use the `transformers` library as requested. If you encounter errors, you
10
- # may need to switch to a different method for GGUF models.
11
- # The `torch_dtype="auto"` is a good practice to automatically select the best data type.
12
  model_name = "mradermacher/gemma-3n-E2B-GGUF"
 
13
 
14
- # Let's try to load the model and tokenizer
15
  try:
16
- # Use a specific class that supports the model's architecture.
17
- # The `AutoModel` you provided is a general-purpose class.
18
- # We will use `AutoTokenizer` and a simple `AutoModel` here, as requested.
19
- # We add `trust_remote_code=True` in case the model requires it for loading.
20
- model = AutoModel.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True)
21
- tokenizer = AutoTokenizer.from_pretrained(model_name)
22
- print("Model and tokenizer loaded successfully!")
23
  except Exception as e:
24
- print(f"Error loading model: {e}")
25
- print("Please check the model name or if the model type is compatible with `AutoModel`.")
26
- model = None
27
- tokenizer = None
28
 
29
  # --- Constants & Configuration ---
30
  # To secure your Chatterbox endpoint URL, you should add it to your
@@ -38,9 +35,9 @@ def process_audio_and_generate(audio_file_path):
38
  This function handles the full workflow:
39
  1. Takes the path to a recorded audio file.
40
  2. Sends the audio to your Chatterbox TTS endpoint for transcription.
41
- 3. Passes the transcribed text to the Gemma model.
42
  4. Generates a text response.
43
-
44
  Args:
45
  audio_file_path (str): The file path of the recorded audio.
46
 
@@ -51,46 +48,48 @@ def process_audio_and_generate(audio_file_path):
51
  return "Please provide an audio recording.", "No audio input received."
52
 
53
  # --- Step 1: Speech-to-Text (using your Chatterbox endpoint) ---
54
- # This is where you will make the API call to your Chatterbox TTS endpoint.
55
- # You'll need to read the audio file and send it as a POST request.
56
  try:
57
  with open(audio_file_path, "rb") as audio_file:
58
- # We'll assume the API expects the audio file in the request body.
59
- headers = {"Content-Type": "audio/x-wav"} # Adjust the MIME type as needed
60
- response = requests.post(CHATTERBOX_ENDPOINT, data=audio_file, headers=headers)
 
61
  transcription = response.json().get("transcription", "Transcription failed.")
62
- except Exception as e:
63
  transcription = f"Error calling Chatterbox API: {e}"
 
 
 
 
 
64
  return transcription, "Transcription failed."
65
 
66
- # --- Step 2: Generate Response with Gemma ---
67
- # This is a placeholder for how you would pass the transcription to the model.
68
- # The actual implementation will depend on the model's specific API.
69
- # We'll use a simple text generation approach.
70
- if model and tokenizer:
71
  try:
72
- # Tokenize the input text
73
- inputs = tokenizer(transcription, return_tensors="pt")
74
-
75
- # Generate a response. You may need to adjust generation parameters.
76
- # Using `max_new_tokens` to limit the response length for efficiency.
77
- outputs = model.generate(**inputs, max_new_tokens=100)
78
 
79
- # Decode the generated tokens to a string
80
- response_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
 
 
 
 
 
81
 
82
- # The model will likely repeat the input, so we'll clean it up.
83
- clean_response = response_text.replace(transcription, "", 1).strip()
84
 
85
  except Exception as e:
86
- clean_response = f"Error generating response: {e}"
87
- else:
88
- clean_response = "Gemma model not loaded. Please check the logs."
89
 
90
- return transcription, clean_response
91
 
92
  # --- Gradio Interface Setup ---
93
- # This creates the user interface with a microphone input and two text outputs.
94
  iface = gr.Interface(
95
  fn=process_audio_and_generate,
96
  inputs=gr.Audio(sources=["microphone"], type="filepath"),
@@ -105,3 +104,4 @@ iface = gr.Interface(
105
  # Launch the Gradio app
106
  if __name__ == "__main__":
107
  iface.launch()
 
 
import gradio as gr
import os
import requests
from llama_cpp import Llama  # GGUF inference engine (llama-cpp-python)

# --- Model Loading ---
# The selected model is distributed in the GGUF format, which the standard
# Hugging Face `AutoModel` class cannot load; llama-cpp-python is the
# dedicated inference engine for it.
model_name = "mradermacher/gemma-3n-E2B-GGUF"

# `llm` is None when loading fails; downstream code checks `if llm:` before use.
try:
    # BUG FIX: the original called `gr.mount_model(model_name)`, but Gradio has
    # no such function — the script crashed with AttributeError before the UI
    # ever started. llama-cpp-python can download a GGUF file from the
    # Hugging Face Hub itself via `Llama.from_pretrained` (it uses
    # huggingface_hub under the hood), so no separate download step is needed.
    llm = Llama.from_pretrained(
        repo_id=model_name,
        # The repo ships several quantizations; pick one by glob pattern.
        # Q4_K_M is a common speed/quality middle ground — adjust as needed.
        filename="*Q4_K_M.gguf",
        n_gpu_layers=1,   # offload one layer to GPU if available
        verbose=False,    # keep the logs clean
    )
    print("Llama model initialized successfully!")
except Exception as e:
    # Best-effort startup: log the failure and fall back to llm=None so the
    # Gradio app still launches and can report the problem to the user.
    print(f"Error initializing Llama model: {e}")
    print("Please check if the model is compatible with llama-cpp-python.")
    llm = None
26
  # --- Constants & Configuration ---
27
  # To secure your Chatterbox endpoint URL, you should add it to your
 
35
  This function handles the full workflow:
36
  1. Takes the path to a recorded audio file.
37
  2. Sends the audio to your Chatterbox TTS endpoint for transcription.
38
+ 3. Passes the transcribed text to the GGUF model.
39
  4. Generates a text response.
40
+
41
  Args:
42
  audio_file_path (str): The file path of the recorded audio.
43
 
 
48
  return "Please provide an audio recording.", "No audio input received."
49
 
50
  # --- Step 1: Speech-to-Text (using your Chatterbox endpoint) ---
51
+ transcription = "Transcription failed." # Default value in case of error
 
52
  try:
53
  with open(audio_file_path, "rb") as audio_file:
54
+ # Assumes the API expects a multipart form data request with the file.
55
+ files = {'file': audio_file}
56
+ response = requests.post(CHATTERBOX_ENDPOINT, files=files)
57
+ response.raise_for_status() # Raise an exception for bad status codes
58
  transcription = response.json().get("transcription", "Transcription failed.")
59
+ except requests.exceptions.RequestException as e:
60
  transcription = f"Error calling Chatterbox API: {e}"
61
+ print(transcription)
62
+ return transcription, "Transcription service is not available."
63
+ except Exception as e:
64
+ transcription = f"Error during transcription: {e}"
65
+ print(transcription)
66
  return transcription, "Transcription failed."
67
 
68
+ # --- Step 2: Generate Response with Gemma (GGUF model) ---
69
+ response_text = "Gemma model is not available." # Default value
70
+ if llm:
 
 
71
  try:
72
+ # We'll use the model's `create_completion` method to generate text.
73
+ # We wrap the transcription in a prompt template that the model expects.
74
+ prompt = f"### User:\n{transcription}\n### Assistant:\n"
 
 
 
75
 
76
+ # Generate the response from the model
77
+ completion = llm.create_completion(
78
+ prompt,
79
+ max_tokens=150, # Limits the length of the response
80
+ stop=["### User:"], # Stops generation when it sees the next user turn
81
+ echo=False, # Don't repeat the input prompt in the output
82
+ )
83
 
84
+ response_text = completion['choices'][0]['text']
 
85
 
86
  except Exception as e:
87
+ response_text = f"Error generating response from model: {e}"
88
+ print(response_text)
 
89
 
90
+ return transcription, response_text.strip()
91
 
92
  # --- Gradio Interface Setup ---
 
93
  iface = gr.Interface(
94
  fn=process_audio_and_generate,
95
  inputs=gr.Audio(sources=["microphone"], type="filepath"),
 
104
  # Launch the Gradio app
105
  if __name__ == "__main__":
106
  iface.launch()
107
+