Update app.py
app.py CHANGED

```diff
@@ -100,11 +100,13 @@ class ConversationBot:
         audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
         audio_load = whisper.load_audio(file.name)
         soundfile.write(audio_filename, audio_load, samplerate = 16000)
-        description = self.a2t.inference(audio_filename)
-        Human_prompt = "\nHuman: provide an audio named {}. The description is: {}. This information helps you to understand this audio, but you should use tools to finish following tasks, " \
-                       "rather than directly imagine from my description. If you understand, say \"Received\". \n".format(audio_filename, description)
+        # description = self.a2t.inference(audio_filename)
+        # Human_prompt = "\nHuman: provide an audio named {}. The description is: {}. This information helps you to understand this audio, but you should use tools to finish following tasks, " \
+        #                "rather than directly imagine from my description. If you understand, say \"Received\". \n".format(audio_filename, description)
+        # AI_prompt = "Received. "
+        # self.agent.memory.buffer = self.agent.memory.buffer + Human_prompt + 'AI: ' + AI_prompt
         AI_prompt = "Received. "
-        self.agent.memory.buffer = self.agent.memory.buffer + Human_prompt + 'AI: ' + AI_prompt
+        self.agent.memory.buffer = self.agent.memory.buffer + 'AI: ' + AI_prompt
         print("======>Current memory:\n %s" % self.agent.memory)
         #state = state + [(f"<audio src=audio_filename controls=controls></audio>*{audio_filename}*", AI_prompt)]
         state = state + [(f"*{audio_filename}*", AI_prompt)]
@@ -124,11 +126,13 @@ class ConversationBot:
         img = img.convert('RGB')
         img.save(image_filename, "PNG")
         print(f"Resize image form {width}x{height} to {width_new}x{height_new}")
-        description = self.i2t.inference(image_filename)
-        Human_prompt = "\nHuman: provide a figure named {}. The description is: {}. This information helps you to understand this image, but you should use tools to finish following tasks, " \
-                       "rather than directly imagine from my description. If you understand, say \"Received\". \n".format(image_filename, description)
+        # description = self.i2t.inference(image_filename)
+        # Human_prompt = "\nHuman: provide a figure named {}. The description is: {}. This information helps you to understand this image, but you should use tools to finish following tasks, " \
+        #                "rather than directly imagine from my description. If you understand, say \"Received\". \n".format(image_filename, description)
+        # AI_prompt = "Received. "
+        # self.agent.memory.buffer = self.agent.memory.buffer + Human_prompt + 'AI: ' + AI_prompt
         AI_prompt = "Received. "
-        self.agent.memory.buffer = self.agent.memory.buffer + Human_prompt + 'AI: ' + AI_prompt
+        self.agent.memory.buffer = self.agent.memory.buffer + 'AI: ' + AI_prompt
         print("======>Current memory:\n %s" % self.agent.memory)
         state = state + [(f"*{image_filename}*", AI_prompt)]
         print("Outputs:", state)
@@ -159,10 +163,10 @@ class ConversationBot:
         self.t2a = T2A(device="cpu")
         self.tts = TTS(device="cpu")
         # self.t2s = T2S(device="cuda:0")
-        # self.i2a = I2A(device="cuda:0")
+        self.i2a = I2A(device="cuda:0")
         self.a2t = A2T(device="cpu")
         # self.asr = ASR(device="cuda:0")
-        # self.inpaint = Inpaint(device="cuda:0")
+        self.inpaint = Inpaint(device="cuda:0")
         #self.tts_ood = TTS_OOD(device="cuda:0")
         self.tools = [
             # Tool(name="Generate Image From User Input Text", func=self.t2i.inference,
@@ -188,15 +192,15 @@ class ConversationBot:
             Tool(name="Synthesize Speech Given the User Input Text", func=self.tts.inference,
                  description="useful for when you want to convert a user input text into speech audio it saved it to a file."
                              "The input to this tool should be a string, representing the text used to be converted to speech."),
-            # Tool(name="Generate Audio From The Image", func=self.i2a.inference,
-            # description="useful for when you want to generate an audio based on an image."
-            # "The input to this tool should be a string, representing the image_path. "),
+            Tool(name="Generate Audio From The Image", func=self.i2a.inference,
+                 description="useful for when you want to generate an audio based on an image."
+                             "The input to this tool should be a string, representing the image_path. "),
             Tool(name="Generate Text From The Audio", func=self.a2t.inference,
                  description="useful for when you want to describe an audio in text, receives audio_path as input."
+                             "The input to this tool should be a string, representing the audio_path.")
+            Tool(name="Audio Inpainting", func=self.inpaint.show_mel_fn,
+                 description="useful for when you want to inpaint a mel spectrum of an audio and predict this audio, this tool will generate a mel spectrum and you can inpaint it, receives audio_path as input, "
                              "The input to this tool should be a string, representing the audio_path.")]
-            # Tool(name="Audio Inpainting", func=self.inpaint.show_mel_fn,
-            # description="useful for when you want to inpaint a mel spectrum of an audio and predict this audio, this tool will generate a mel spectrum and you can inpaint it, receives audio_path as input, "
-            # "The input to this tool should be a string, representing the audio_path."),
             # Tool(name="Transcribe speech", func=self.asr.inference,
             # description="useful for when you want to know the text corresponding to a human speech, receives audio_path as input."
             # "The input to this tool should be a string, representing the audio_path.")]
@@ -218,7 +222,7 @@ if __name__ == '__main__':
     with gr.Blocks(css="#chatbot .overflow-y-auto{height:500px}") as demo:
         with gr.Row():
             openai_api_key_textbox = gr.Textbox(
-                placeholder="Paste your OpenAI API key here to start
+                placeholder="Paste your OpenAI API key here to start AudioGPT(sk-...) and press Enter ↵️",
                 show_label=False,
                 lines=1,
                 type="password",
@@ -228,7 +232,7 @@ if __name__ == '__main__':
         chatbot = gr.Chatbot(elem_id="chatbot", label="AudioGPT")
         state = gr.State([])
         with gr.Row(visible = False) as input_raws:
-            with gr.Column(scale=0.
+            with gr.Column(scale=0.9):
                 txt = gr.Textbox(show_label=False, placeholder="Enter text and press enter, or upload an image").style(container=False)
             with gr.Column(scale=0.15, min_width=0):
                 clear = gr.Button("Clear️")
```
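Context for the unchanged lines at the top of the first hunk: `whisper.load_audio` decodes its input through ffmpeg and resamples it to a 16 kHz float32 NumPy array, which is why the app can immediately re-save it with `soundfile.write(..., samplerate=16000)` without pitch or speed artifacts. A standalone sketch of that step (the input path is a placeholder):

```python
import os
import uuid

import numpy as np
import soundfile
import whisper  # the openai-whisper package

os.makedirs("audio", exist_ok=True)
audio_filename = os.path.join("audio", str(uuid.uuid4())[0:8] + ".wav")

# whisper.load_audio decodes via ffmpeg and resamples to
# whisper.audio.SAMPLE_RATE (16000), returning float32 samples in [-1, 1].
audio_load = whisper.load_audio("input.wav")  # placeholder path
assert audio_load.dtype == np.float32

# The array is already at 16 kHz, so writing it back out at
# samplerate=16000 preserves the original timing.
soundfile.write(audio_filename, audio_load, samplerate=16000)
```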
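The `Tool(...)` entries are LangChain tools. AudioGPT follows the Visual ChatGPT setup, where the tool list is handed to a conversational agent and the conversation history lives in `agent.memory.buffer`, the string the first two hunks append `'AI: Received. '` to. A minimal sketch of that wiring, assuming the early-2023 LangChain releases this kind of app pins (where `ConversationBufferMemory.buffer` is a plain, assignable string); `fake_a2t` is a hypothetical stand-in for `self.a2t.inference`:

```python
from langchain.agents import initialize_agent, Tool
from langchain.llms import OpenAI
from langchain.memory import ConversationBufferMemory

def fake_a2t(audio_path: str) -> str:
    # Hypothetical stand-in for A2T.inference: return a text caption
    # for the audio file at audio_path.
    return f"a short clip of birdsong ({audio_path})"

tools = [
    Tool(name="Generate Text From The Audio", func=fake_a2t,
         description="useful for when you want to describe an audio in text, "
                     "receives audio_path as input. The input to this tool "
                     "should be a string, representing the audio_path."),
]

llm = OpenAI(temperature=0)  # reads OPENAI_API_KEY from the environment
memory = ConversationBufferMemory(memory_key="chat_history", output_key="output")
agent = initialize_agent(tools, llm, agent="conversational-react-description",
                         memory=memory, verbose=True)

# In these early releases the transcript is one plain string, so the diff's
# `self.agent.memory.buffer = self.agent.memory.buffer + 'AI: ' + AI_prompt`
# is a direct string concatenation:
agent.memory.buffer = agent.memory.buffer + "AI: " + "Received. "
```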
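The last two hunks touch a common Gradio 3.x pattern: the chat input row is built with `visible=False` and revealed only after an API key is submitted. A minimal sketch of that pattern; `set_openai_api_key` is a hypothetical handler (the real app would also construct the agent there):

```python
import gradio as gr  # Gradio 3.x

def set_openai_api_key(api_key: str):
    # Hypothetical handler: the real app also builds the LangChain agent here.
    # Returning gr.update(...) changes properties of the bound output component.
    return gr.update(visible=True)

with gr.Blocks(css="#chatbot .overflow-y-auto{height:500px}") as demo:
    openai_api_key_textbox = gr.Textbox(
        placeholder="Paste your OpenAI API key here to start AudioGPT(sk-...) and press Enter ↵️",
        show_label=False, lines=1, type="password")
    chatbot = gr.Chatbot(elem_id="chatbot", label="AudioGPT")
    state = gr.State([])
    # The input row starts hidden, exactly as in the diff.
    with gr.Row(visible=False) as input_raws:
        with gr.Column(scale=0.9):
            txt = gr.Textbox(show_label=False,
                             placeholder="Enter text and press enter, or upload an image"
                             ).style(container=False)
        with gr.Column(scale=0.15, min_width=0):
            clear = gr.Button("Clear")
    # Submitting the key flips the hidden row to visible.
    openai_api_key_textbox.submit(set_openai_api_key,
                                  [openai_api_key_textbox], [input_raws])

demo.launch()
```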