alex16052G committed
Commit df113fc (verified)
1 Parent(s): 4535df5

Update chat_ai.py

Files changed (1):
  1. chat_ai.py +24 -15
chat_ai.py CHANGED

@@ -1,10 +1,9 @@
-# chat_ai.py
-
 # ruff: noqa: E402
 # Above allows ruff to ignore E402: module level import not at top of file
 
 import re
 import tempfile
+import sys
 
 import click
 import gradio as gr
@@ -14,6 +13,7 @@ import torchaudio
 from cached_path import cached_path
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from num2words import num2words
+import torch
 
 try:
     import spaces
@@ -37,6 +37,13 @@ from f5_tts.infer.utils_infer import (
     save_spectrogram,
 )
 
+# Import Whisper for transcription
+try:
+    import whisper
+except ImportError:
+    print("The 'whisper' package is not installed. Make sure to install it with 'pip install openai-whisper'")
+    sys.exit(1)
+
 # Load the vocoder
 vocoder = load_vocoder()
 
@@ -50,6 +57,9 @@ F5TTS_ema_model = load_model(
 chat_model_state = None
 chat_tokenizer_state = None
 
+# Load the Whisper transcription model
+transcription_model = whisper.load_model("base")  # You can choose other sizes such as 'small', 'medium', 'large'
+
 @gpu_decorator
 def generate_response(messages, model, tokenizer):
     """Generate a response using the chat model"""
@@ -135,7 +145,7 @@ def load_chat_model():
     if chat_model_state is None:
         model_name = "Qwen/Qwen2.5-3B-Instruct"
         chat_model_state = AutoModelForCausalLM.from_pretrained(
-            model_name, torch_dtype="auto", device_map="auto"
+            model_name, torch_dtype=torch.float16, device_map="auto"
         )
         chat_tokenizer_state = AutoTokenizer.from_pretrained(model_name)
     return chat_model_state, chat_tokenizer_state
@@ -154,7 +164,6 @@ with gr.Blocks() as app_chat:
 
     if not USING_SPACES:
         load_chat_model_btn = gr.Button("Load Chat Model", variant="primary")
-
     chat_interface_container = gr.Column(visible=False)
 
     @gpu_decorator
@@ -220,19 +229,19 @@ with gr.Blocks() as app_chat:
     )
 
     @gpu_decorator
-    def process_input(audio_path, text, history, conv_state):
+    def process_input(audio_path, text, history, conv_state, system_prompt):
         """Process the user's audio or text input and generate a response."""
         if not audio_path and not text.strip():
             return history, conv_state, ""
 
         if audio_path:
-            # Here you could integrate automatic transcription if you wish
-            # Currently, it assumes the text is provided if there is audio
-            # You can integrate Whisper or another transcription model if needed
-            # For example:
-            # text = transcribe_audio(audio_path)
-            # But for now, we use the provided text
-            pass
+            # Transcribe the audio using Whisper
+            result = transcription_model.transcribe(audio_path)
+            transcribed_text = result["text"].strip()
+            if transcribed_text:
+                text = transcribed_text
+            else:
+                text = text.strip()
 
         if not text.strip():
             return history, conv_state, ""
@@ -282,7 +291,7 @@ with gr.Blocks() as app_chat:
     # Handle audio input
     audio_input_chat.stop_recording(
         process_input,
-        inputs=[audio_input_chat, text_input_chat, chatbot_interface, conversation_state],
+        inputs=[audio_input_chat, text_input_chat, chatbot_interface, conversation_state, system_prompt_chat],
         outputs=[chatbot_interface, conversation_state, text_input_chat],
     ).then(
         generate_audio_response,
@@ -297,7 +306,7 @@ with gr.Blocks() as app_chat:
     # Handle text input
     text_input_chat.submit(
         process_input,
-        inputs=[audio_input_chat, text_input_chat, chatbot_interface, conversation_state],
+        inputs=[audio_input_chat, text_input_chat, chatbot_interface, conversation_state, system_prompt_chat],
        outputs=[chatbot_interface, conversation_state, text_input_chat],
     ).then(
         generate_audio_response,
@@ -312,7 +321,7 @@ with gr.Blocks() as app_chat:
     # Handle the send button
     send_btn_chat.click(
         process_input,
-        inputs=[audio_input_chat, text_input_chat, chatbot_interface, conversation_state],
+        inputs=[audio_input_chat, text_input_chat, chatbot_interface, conversation_state, system_prompt_chat],
         outputs=[chatbot_interface, conversation_state, text_input_chat],
     ).then(
         generate_audio_response,
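The heart of the change is the new audio branch in process_input: it now prefers the Whisper transcript and only falls back to the typed message when transcription comes back empty. A minimal standalone sketch of that flow, assuming openai-whisper is installed (the clip path and fallback text below are placeholders, not part of the commit):

import whisper

# "base" matches the commit; larger checkpoints ('small', 'medium', 'large')
# trade speed for accuracy
transcription_model = whisper.load_model("base")

def transcribe_or_fallback(audio_path: str, typed_text: str) -> str:
    """Prefer the Whisper transcript; fall back to the typed text."""
    result = transcription_model.transcribe(audio_path)
    transcribed = result["text"].strip()
    return transcribed if transcribed else typed_text.strip()

# Placeholder inputs for illustration only
print(transcribe_or_fallback("clip.wav", "hello there"))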
 
 
 
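The other functional change is in load_chat_model: torch_dtype moves from "auto" to torch.float16, which halves memory use against float32 but effectively assumes a CUDA device, since many float16 ops are unsupported or slow on CPU. A hedged variant that keeps the commit's intent while staying usable on CPU-only machines (the availability guard is an assumption added here, not part of the commit):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen2.5-3B-Instruct"
# Fall back to float32 where no CUDA device is present (assumption, not in the commit)
dtype = torch.float16 if torch.cuda.is_available() else torch.float32
# device_map="auto" requires the accelerate package
model = AutoModelForCausalLM.from_pretrained(
    model_name, torch_dtype=dtype, device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)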