Spaces:
Sleeping
Sleeping
Add MP3 audio transcription tool
Browse files
- audio_tools.py +86 -0
- basic_agent.py +12 -4
- requirements.txt +2 -1
audio_tools.py
ADDED
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from smolagents import tool # Assuming smolagents.tool is the correct decorator
|
3 |
+
|
4 |
+
|
5 |
+
@tool
def transcribe_mp3_audio_file(mp3_file_path: str) -> str:
    """
    Transcribes an MP3 audio file using OpenAI's Whisper 'base' model.

    Args:
        mp3_file_path (str): The absolute local path to the MP3 audio file.
                             This path should be obtained from the 'File Information' section
                             if the file was downloaded by the agent.

    Returns:
        str: The transcribed text from the audio file, or an error message if transcription fails.
    """
    # NOTE(review): the docstring above is addressed to the calling agent and is
    # presumably surfaced as the tool description by the `tool` decorator, so
    # implementation notes are kept in comments instead of the docstring.

    # Validate the argument before touching Whisper: these checks are cheap and
    # must not be hidden behind a "library not installed" message.
    if not isinstance(mp3_file_path, str):
        return (f"Error: The audio file path must be a string, "
                f"but received {type(mp3_file_path).__name__}.")

    # isfile rather than exists: a directory at this path cannot be transcribed.
    if not os.path.isfile(mp3_file_path):
        return f"Error: Audio file not found at the specified path: '{mp3_file_path}'. Please verify the path."

    if not mp3_file_path.lower().endswith(".mp3"):
        return f"Error: The provided file path '{mp3_file_path}' does not appear to be an MP3 file. This tool currently only supports .mp3 files."

    try:
        import whisper  # Imported lazily so the module loads without the optional dependency
    except ImportError:
        return ("Error: The 'openai-whisper' library is required but not installed. "
                "Please install it using 'pip install openai-whisper' and ensure ffmpeg is also installed.")

    try:
        print(
            f"AudioTool: Loading Whisper 'base' model to transcribe '{mp3_file_path}'...")
        # Available model sizes: "tiny", "base", "small", "medium", "large".
        # "base" trades some accuracy for speed; larger models are more accurate
        # but slower and need more resources.
        model = whisper.load_model("base")

        print(f"AudioTool: Transcribing audio from '{mp3_file_path}'...")
        # fp16=False can improve compatibility on CPU
        result = model.transcribe(mp3_file_path, fp16=False)

        # Strip so that whitespace-only output is treated as "no speech" and the
        # caller does not receive stray leading/trailing whitespace.
        transcribed_text = (result.get("text") or "").strip()
        if not transcribed_text:
            # This case might occur if the audio is silent or whisper couldn't detect speech
            return f"Notice: Transcription resulted in empty text for '{mp3_file_path}'. The audio might be silent or contain no clear speech."

        print(
            f"AudioTool: Transcription successful for '{mp3_file_path}'.")
        return transcribed_text

    except Exception as e:
        details = f"{type(e).__name__} - {str(e)}"
        error_text = str(e).lower()
        ffmpeg_message = ("Error during transcription: ffmpeg not found. "
                          "OpenAI Whisper requires ffmpeg to be installed and in your system's PATH. "
                          f"Details: {details}")

        if isinstance(e, FileNotFoundError):
            # The audio file was present during validation; if it is gone now it
            # was removed while we were working.
            if not os.path.isfile(mp3_file_path):
                return f"Error: Whisper could not find the audio file at path: '{mp3_file_path}' (even if it was initially detected)."
            # Otherwise the missing file is something other than the audio.
            # Whisper decodes audio by launching the external `ffmpeg` program,
            # and a missing executable surfaces as FileNotFoundError, so that is
            # the most likely cause (the raw details are included for confirmation).
            return ffmpeg_message

        # Non-FileNotFoundError failures that still point at a missing ffmpeg.
        if "ffmpeg" in error_text and ("not found" in error_text or "not installed" in error_text):
            return ffmpeg_message

        return f"Error during audio transcription for '{mp3_file_path}': {details}"
|
60 |
+
|
61 |
+
|
62 |
+
if __name__ == '__main__':
    # Manual smoke test for the transcription tool. It only exercises the
    # input-validation paths, so no real audio file is required.
    import tempfile  # Local import: needed only by this smoke test

    # To test an actual transcription you need a real MP3 file, for example:
    # test_mp3_path = "path/to/your/test_audio.mp3"
    # if os.path.exists(test_mp3_path):
    #     print(f"--- Testing with MP3 file: {test_mp3_path} ---")
    #     transcript = transcribe_mp3_audio_file(test_mp3_path)
    #     print("Transcription Result:")
    #     print(transcript)
    # else:
    #     print(f"Test MP3 file not found at: {test_mp3_path}. Cannot run local test.")

    print("Audio transcription tool defined. To test, provide a path to an MP3 file in the example block.")

    # Test with a non-existent file
    print("\n--- Testing with non-existent file ---")
    print(transcribe_mp3_audio_file("non_existent_file.mp3"))

    # Test with a non-mp3 file
    print("\n--- Testing with non-MP3 file extension ---")
    # Create the dummy file inside a temporary directory so that an existing
    # "dummy.txt" in the working directory is never overwritten or deleted, and
    # so cleanup happens even if the call below raises.
    with tempfile.TemporaryDirectory() as scratch_dir:
        dummy_file = os.path.join(scratch_dir, "dummy.txt")
        with open(dummy_file, "w") as f:
            f.write("this is not an mp3")
        print(transcribe_mp3_audio_file(dummy_file))
|
basic_agent.py
CHANGED
@@ -3,6 +3,7 @@ import os
|
|
3 |
from file_handler import get_task_file_path, DEFAULT_FILES_DIR
|
4 |
from youtube_tool import get_youtube_video_transcript
|
5 |
from file_processing_tools import get_csv_data_summary, get_excel_data_summary
|
|
|
6 |
|
7 |
from smolagents import (
|
8 |
ToolCallingAgent,
|
@@ -77,6 +78,7 @@ class BasicAgent:
|
|
77 |
get_youtube_video_transcript,
|
78 |
get_csv_data_summary,
|
79 |
get_excel_data_summary,
|
|
|
80 |
DuckDuckGoSearchTool(),
|
81 |
VisitWebpageTool(),
|
82 |
WikipediaSearchTool(),
|
@@ -121,10 +123,16 @@ Key Instructions:
|
|
121 |
4. **Handling Tool Errors**:
|
122 |
* If a tool call itself returns an error: analyze the error message, try to correct the input to the tool (e.g., fix code, verify URL). Do not immediately retry the exact same call. Consider if a different tool or approach is more suitable.
|
123 |
5. **Formulate Your Response**:
|
124 |
-
* Provide only the final, concise answer
|
125 |
-
*
|
126 |
-
*
|
127 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
128 |
--- Start of Question & File Information ---
|
129 |
{question_to_llm}
|
130 |
--- End of Question & File Information ---
|
|
|
3 |
from file_handler import get_task_file_path, DEFAULT_FILES_DIR
|
4 |
from youtube_tool import get_youtube_video_transcript
|
5 |
from file_processing_tools import get_csv_data_summary, get_excel_data_summary
|
6 |
+
from audio_tools import transcribe_mp3_audio_file
|
7 |
|
8 |
from smolagents import (
|
9 |
ToolCallingAgent,
|
|
|
78 |
get_youtube_video_transcript,
|
79 |
get_csv_data_summary,
|
80 |
get_excel_data_summary,
|
81 |
+
transcribe_mp3_audio_file,
|
82 |
DuckDuckGoSearchTool(),
|
83 |
VisitWebpageTool(),
|
84 |
WikipediaSearchTool(),
|
|
|
123 |
4. **Handling Tool Errors**:
|
124 |
* If a tool call itself returns an error: analyze the error message, try to correct the input to the tool (e.g., fix code, verify URL). Do not immediately retry the exact same call. Consider if a different tool or approach is more suitable.
|
125 |
5. **Formulate Your Response**:
|
126 |
+
* Provide only the final, concise answer to the question.
|
127 |
+
* Do not include your reasoning steps, apologies, self-correction narratives, or any conversational filler in the final answer.
|
128 |
+
* **Number Formatting**:
|
129 |
+
* For a single large number, do not use commas as thousands separators (e.g., write `1234567` not `1,234,567`).
|
130 |
+
* Do not include units such as `$` or percent signs `%` unless the question specifically asks for them.
|
131 |
+
* **List Formatting**:
|
132 |
+
* If the answer is a list of items (e.g., numbers, names, page numbers) and the question implies a comma-separated format or that's the most natural way to present it:
|
133 |
+
* Separate items with a comma followed by a single space (e.g., `apple, pear, orange` or `132, 197, 245`).
|
134 |
+
* If the question asks for the list to be sorted, ensure it is.
|
135 |
+
* **"I don't know"**: If, after thorough investigation (including careful analysis of direct tool outputs and appropriate error handling), you cannot determine a definitive answer, respond with the exact phrase 'I don't know'.
|
136 |
--- Start of Question & File Information ---
|
137 |
{question_to_llm}
|
138 |
--- End of Question & File Information ---
|
requirements.txt
CHANGED
@@ -7,4 +7,5 @@ wikipedia-api
|
|
7 |
youtube-transcript-api
|
8 |
pandas
|
9 |
openpyxl
|
10 |
-
markdownify
|
|
|
|
7 |
youtube-transcript-api
|
8 |
pandas
|
9 |
openpyxl
|
10 |
+
markdownify
|
11 |
+
openai-whisper
|