agent-course-gaia

Sleeping

App Files Files Community

kirbah committed 8 days ago

Commit

65dc764

1 Parent(s): c8e9894

Add MP3 audio transcription tool

Browse files

Files changed (3) hide show

audio_tools.py +86 -0
basic_agent.py +12 -4
requirements.txt +2 -1

audio_tools.py ADDED Viewed

	@@ -0,0 +1,86 @@

+import os
+from smolagents import tool  # Assuming smolagents.tool is the correct decorator
+@tool
+def transcribe_mp3_audio_file(mp3_file_path: str) -> str:
+    """
+    Transcribes an MP3 audio file using OpenAI's Whisper 'base' model.
+    Args:
+        mp3_file_path (str): The absolute local path to the MP3 audio file.
+                             This path should be obtained from the 'File Information' section
+                             if the file was downloaded by the agent.
+    Returns:
+        str: The transcribed text from the audio file, or an error message if transcription fails.
+    """
+    try:
+        import whisper  # Attempt to import whisper only when the tool is called
+    except ImportError:
+        return ("Error: The 'openai-whisper' library is required but not installed. "
+                "Please install it using 'pip install openai-whisper' and ensure ffmpeg is also installed.")
+    if not os.path.exists(mp3_file_path):
+        return f"Error: Audio file not found at the specified path: '{mp3_file_path}'. Please verify the path."
+    if not mp3_file_path.lower().endswith(".mp3"):
+        return f"Error: The provided file path '{mp3_file_path}' does not appear to be an MP3 file. This tool currently only supports .mp3 files."
+    try:
+        print(
+            f"AudioTool: Loading Whisper 'base' model to transcribe '{mp3_file_path}'...")
+        # You can choose different model sizes: "tiny", "base", "small", "medium", "large"
+        # "base" is a good balance of speed and accuracy for many use cases.
+        # Larger models are more accurate but slower and require more resources.
+        model = whisper.load_model("base")
+        print(f"AudioTool: Transcribing audio from '{mp3_file_path}'...")
+        # fp16=False can improve compatibility on CPU
+        result = model.transcribe(mp3_file_path, fp16=False)
+        transcribed_text = result.get("text", "")
+        if transcribed_text:
+            print(
+                f"AudioTool: Transcription successful for '{mp3_file_path}'.")
+            return transcribed_text
+        else:
+            # This case might occur if the audio is silent or whisper couldn't detect speech
+            return f"Notice: Transcription resulted in empty text for '{mp3_file_path}'. The audio might be silent or contain no clear speech."
+    except FileNotFoundError:  # Should be caught by os.path.exists, but as a fallback for whisper's internal handling
+        return f"Error: Whisper could not find the audio file at path: '{mp3_file_path}' (even if it was initially detected)."
+    except Exception as e:
+        # Check if the error is due to ffmpeg not being found
+        if "ffmpeg" in str(e).lower() and ("not found" in str(e).lower() or "not installed" in str(e).lower()):
+            return ("Error during transcription: ffmpeg not found. "
+                    "OpenAI Whisper requires ffmpeg to be installed and in your system's PATH. "
+                    f"Details: {type(e).__name__} - {str(e)}")
+        return f"Error during audio transcription for '{mp3_file_path}': {type(e).__name__} - {str(e)}"
+if __name__ == '__main__':
+    # This is a placeholder for testing.
+    # To test, you would need an actual MP3 file.
+    # For example:
+    # test_mp3_path = "path/to/your/test_audio.mp3"
+    # if os.path.exists(test_mp3_path):
+    #     print(f"--- Testing with MP3 file: {test_mp3_path} ---")
+    #     transcript = transcribe_mp3_audio_file(test_mp3_path)
+    #     print("Transcription Result:")
+    #     print(transcript)
+    # else:
+    #     print(f"Test MP3 file not found at: {test_mp3_path}. Cannot run local test.")
+    print("Audio transcription tool defined. To test, provide a path to an MP3 file in the example block.")
+    # Test with a non-existent file
+    print("\n--- Testing with non-existent file ---")
+    print(transcribe_mp3_audio_file("non_existent_file.mp3"))
+    # Test with a non-mp3 file
+    print("\n--- Testing with non-MP3 file extension ---")
+    # Create a dummy file for this test
+    dummy_file = "dummy.txt"
+    with open(dummy_file, "w") as f:
+        f.write("this is not an mp3")
+    print(transcribe_mp3_audio_file(dummy_file))
+    os.remove(dummy_file)

basic_agent.py CHANGED Viewed

@@ -3,6 +3,7 @@ import os
 from file_handler import get_task_file_path, DEFAULT_FILES_DIR
 from youtube_tool import get_youtube_video_transcript
 from file_processing_tools import get_csv_data_summary, get_excel_data_summary
 from smolagents import (
     ToolCallingAgent,
@@ -77,6 +78,7 @@ class BasicAgent:
             get_youtube_video_transcript,
             get_csv_data_summary,
             get_excel_data_summary,
             DuckDuckGoSearchTool(),
             VisitWebpageTool(),
             WikipediaSearchTool(),
@@ -121,10 +123,16 @@ Key Instructions:
 4.  **Handling Tool Errors**:
     *   If a tool call itself returns an error: analyze the error message, try to correct the input to the tool (e.g., fix code, verify URL). Do not immediately retry the exact same call. Consider if a different tool or approach is more suitable.
 5.  **Formulate Your Response**:
-    *   Provide only the final, concise answer. Do not include reasoning, apologies, or conversational filler.
-    *   If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise.
-    *   If, after thorough investigation (including careful analysis of direct tool outputs and appropriate error handling), you cannot determine a definitive answer, respond with 'I don't know'.
 --- Start of Question & File Information ---
 {question_to_llm}
 --- End of Question & File Information ---

 from file_handler import get_task_file_path, DEFAULT_FILES_DIR
 from youtube_tool import get_youtube_video_transcript
 from file_processing_tools import get_csv_data_summary, get_excel_data_summary
+from audio_tools import transcribe_mp3_audio_file
 from smolagents import (
     ToolCallingAgent,
             get_youtube_video_transcript,
             get_csv_data_summary,
             get_excel_data_summary,
+            transcribe_mp3_audio_file,
             DuckDuckGoSearchTool(),
             VisitWebpageTool(),
             WikipediaSearchTool(),
 4.  **Handling Tool Errors**:
     *   If a tool call itself returns an error: analyze the error message, try to correct the input to the tool (e.g., fix code, verify URL). Do not immediately retry the exact same call. Consider if a different tool or approach is more suitable.
 5.  **Formulate Your Response**:
+    *   Provide only the final, concise answer to the question.
+    *   Do not include your reasoning steps, apologies, self-correction narratives, or any conversational filler in the final answer.
+    *   **Number Formatting**:
+        *   For a single large number, do not use commas as thousands separators (e.g., write `1234567` not `1,234,567`).
+        *   Do not include units such as `$` or percent signs `%` unless the question specifically asks for them.
+    *   **List Formatting**:
+        *   If the answer is a list of items (e.g., numbers, names, page numbers) and the question implies a comma-separated format or that's the most natural way to present it:
+            *   Separate items with a comma followed by a single space (e.g., `apple, pear, orange` or `132, 197, 245`).
+            *   If the question asks for the list to be sorted, ensure it is.
+    *   **"I don't know"**: If, after thorough investigation (including careful analysis of direct tool outputs and appropriate error handling), you cannot determine a definitive answer, respond with the exact phrase 'I don't know'.
 --- Start of Question & File Information ---
 {question_to_llm}
 --- End of Question & File Information ---

requirements.txt CHANGED Viewed

@@ -7,4 +7,5 @@ wikipedia-api
 youtube-transcript-api
 pandas
 openpyxl
-markdownify

 youtube-transcript-api
 pandas
 openpyxl
+markdownify
+openai-whisper