kirbah committed on
Commit
65dc764
·
1 Parent(s): c8e9894

Add MP3 audio transcription tool

Browse files
Files changed (3) hide show
  1. audio_tools.py +86 -0
  2. basic_agent.py +12 -4
  3. requirements.txt +2 -1
audio_tools.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from smolagents import tool # Assuming smolagents.tool is the correct decorator
3
+
4
+
5
@tool
def transcribe_mp3_audio_file(mp3_file_path: str) -> str:
    """
    Transcribes an MP3 audio file using OpenAI's Whisper 'base' model.

    Args:
        mp3_file_path (str): The absolute local path to the MP3 audio file.
                             This path should be obtained from the 'File Information' section
                             if the file was downloaded by the agent.

    Returns:
        str: The transcribed text from the audio file, or an error message if transcription fails.
    """
    # Imports are kept inside the function so the tool stays self-contained and
    # so a missing optional dependency is reported as a readable message
    # instead of breaking module import.
    import shutil

    try:
        import whisper  # Attempt to import whisper only when the tool is called
    except ImportError:
        return ("Error: The 'openai-whisper' library is required but not installed. "
                "Please install it using 'pip install openai-whisper' and ensure ffmpeg is also installed.")

    # isfile (not exists) so that a directory whose name happens to end in
    # ".mp3" is rejected here rather than failing later inside the decoder.
    if not os.path.isfile(mp3_file_path):
        return f"Error: Audio file not found at the specified path: '{mp3_file_path}'. Please verify the path."

    if not mp3_file_path.lower().endswith(".mp3"):
        return f"Error: The provided file path '{mp3_file_path}' does not appear to be an MP3 file. This tool currently only supports .mp3 files."

    ffmpeg_missing_message = (
        "Error during transcription: ffmpeg not found. "
        "OpenAI Whisper requires ffmpeg to be installed and in your system's PATH. "
    )

    try:
        print(
            f"AudioTool: Loading Whisper 'base' model to transcribe '{mp3_file_path}'...")
        # You can choose different model sizes: "tiny", "base", "small", "medium", "large"
        # "base" is a good balance of speed and accuracy for many use cases.
        # Larger models are more accurate but slower and require more resources.
        model = whisper.load_model("base")

        print(f"AudioTool: Transcribing audio from '{mp3_file_path}'...")
        # fp16=False can improve compatibility on CPU
        result = model.transcribe(mp3_file_path, fp16=False)

        # Strip surrounding whitespace so that whitespace-only output is
        # treated as "no speech" instead of being reported as a success.
        transcribed_text = (result.get("text") or "").strip()
        if not transcribed_text:
            # This case might occur if the audio is silent or whisper couldn't detect speech
            return f"Notice: Transcription resulted in empty text for '{mp3_file_path}'. The audio might be silent or contain no clear speech."

        print(
            f"AudioTool: Transcription successful for '{mp3_file_path}'.")
        return transcribed_text

    except FileNotFoundError as e:
        # Whisper decodes audio by launching the `ffmpeg` executable. If that
        # binary is absent, the process launch itself raises FileNotFoundError,
        # so this exception does not necessarily mean the audio file vanished.
        missing_name = os.path.basename(str(e.filename)).lower() if e.filename else ""
        if missing_name in ("ffmpeg", "ffmpeg.exe") or shutil.which("ffmpeg") is None:
            return f"{ffmpeg_missing_message}Details: {type(e).__name__} - {str(e)}"
        # Fallback: the file was present at the isfile() check but is gone now.
        return f"Error: Whisper could not find the audio file at path: '{mp3_file_path}' (even if it was initially detected)."
    except Exception as e:
        # Tool boundary: report every other failure as text so the calling
        # agent can read it and adjust, rather than crashing the run.
        details = str(e)
        lowered = details.lower()
        # Check if the error is due to ffmpeg not being found
        if "ffmpeg" in lowered and ("not found" in lowered or "not installed" in lowered):
            return f"{ffmpeg_missing_message}Details: {type(e).__name__} - {details}"
        return f"Error during audio transcription for '{mp3_file_path}': {type(e).__name__} - {details}"
60
+
61
+
62
if __name__ == '__main__':
    # Manual smoke test. It only exercises the two validation paths that need
    # neither Whisper nor a real recording.
    #
    # To test a real transcription you would need an actual MP3 file.
    # For example:
    #   test_mp3_path = "path/to/your/test_audio.mp3"
    #   print(transcribe_mp3_audio_file(test_mp3_path))
    import tempfile

    print("Audio transcription tool defined. To test, provide a path to an MP3 file in the example block.")

    # Test with a non-existent file
    print("\n--- Testing with non-existent file ---")
    print(transcribe_mp3_audio_file("non_existent_file.mp3"))

    # Test with a non-mp3 file
    print("\n--- Testing with non-MP3 file extension ---")
    # Create the dummy file inside a private temporary directory so the test
    # can never overwrite or delete a real "dummy.txt" in the working
    # directory, and so cleanup still happens if the call above raises.
    with tempfile.TemporaryDirectory() as scratch_dir:
        dummy_file = os.path.join(scratch_dir, "dummy.txt")
        with open(dummy_file, "w", encoding="utf-8") as f:
            f.write("this is not an mp3")
        print(transcribe_mp3_audio_file(dummy_file))
basic_agent.py CHANGED
@@ -3,6 +3,7 @@ import os
3
  from file_handler import get_task_file_path, DEFAULT_FILES_DIR
4
  from youtube_tool import get_youtube_video_transcript
5
  from file_processing_tools import get_csv_data_summary, get_excel_data_summary
 
6
 
7
  from smolagents import (
8
  ToolCallingAgent,
@@ -77,6 +78,7 @@ class BasicAgent:
77
  get_youtube_video_transcript,
78
  get_csv_data_summary,
79
  get_excel_data_summary,
 
80
  DuckDuckGoSearchTool(),
81
  VisitWebpageTool(),
82
  WikipediaSearchTool(),
@@ -121,10 +123,16 @@ Key Instructions:
121
  4. **Handling Tool Errors**:
122
  * If a tool call itself returns an error: analyze the error message, try to correct the input to the tool (e.g., fix code, verify URL). Do not immediately retry the exact same call. Consider if a different tool or approach is more suitable.
123
  5. **Formulate Your Response**:
124
- * Provide only the final, concise answer. Do not include reasoning, apologies, or conversational filler.
125
- * If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise.
126
- * If, after thorough investigation (including careful analysis of direct tool outputs and appropriate error handling), you cannot determine a definitive answer, respond with 'I don't know'.
127
-
 
 
 
 
 
 
128
  --- Start of Question & File Information ---
129
  {question_to_llm}
130
  --- End of Question & File Information ---
 
3
  from file_handler import get_task_file_path, DEFAULT_FILES_DIR
4
  from youtube_tool import get_youtube_video_transcript
5
  from file_processing_tools import get_csv_data_summary, get_excel_data_summary
6
+ from audio_tools import transcribe_mp3_audio_file
7
 
8
  from smolagents import (
9
  ToolCallingAgent,
 
78
  get_youtube_video_transcript,
79
  get_csv_data_summary,
80
  get_excel_data_summary,
81
+ transcribe_mp3_audio_file,
82
  DuckDuckGoSearchTool(),
83
  VisitWebpageTool(),
84
  WikipediaSearchTool(),
 
123
  4. **Handling Tool Errors**:
124
  * If a tool call itself returns an error: analyze the error message, try to correct the input to the tool (e.g., fix code, verify URL). Do not immediately retry the exact same call. Consider if a different tool or approach is more suitable.
125
  5. **Formulate Your Response**:
126
+ * Provide only the final, concise answer to the question.
127
+ * Do not include your reasoning steps, apologies, self-correction narratives, or any conversational filler in the final answer.
128
+ * **Number Formatting**:
129
+ * For a single large number, do not use commas as thousands separators (e.g., write `1234567` not `1,234,567`).
130
+ * Do not include units such as `$` or percent signs `%` unless the question specifically asks for them.
131
+ * **List Formatting**:
132
+ * If the answer is a list of items (e.g., numbers, names, page numbers) and the question implies a comma-separated format or that's the most natural way to present it:
133
+ * Separate items with a comma followed by a single space (e.g., `apple, pear, orange` or `132, 197, 245`).
134
+ * If the question asks for the list to be sorted, ensure it is.
135
+ * **"I don't know"**: If, after thorough investigation (including careful analysis of direct tool outputs and appropriate error handling), you cannot determine a definitive answer, respond with the exact phrase 'I don't know'.
136
  --- Start of Question & File Information ---
137
  {question_to_llm}
138
  --- End of Question & File Information ---
requirements.txt CHANGED
@@ -7,4 +7,5 @@ wikipedia-api
7
  youtube-transcript-api
8
  pandas
9
  openpyxl
10
- markdownify
 
 
7
  youtube-transcript-api
8
  pandas
9
  openpyxl
10
+ markdownify
11
+ openai-whisper