alex16052G committed
Commit df113fc (verified)
1 Parent(s): 4535df5

Update chat_ai.py

Files changed (1):
  1. chat_ai.py +24 -15
chat_ai.py CHANGED

@@ -1,10 +1,9 @@
-# chat_ai.py
-
 # ruff: noqa: E402
 # Above allows ruff to ignore E402: module level import not at top of file
 
 import re
 import tempfile
+import sys
 
 import click
 import gradio as gr
@@ -14,6 +13,7 @@ import torchaudio
 from cached_path import cached_path
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from num2words import num2words
+import torch
 
 try:
     import spaces
@@ -37,6 +37,13 @@ from f5_tts.infer.utils_infer import (
     save_spectrogram,
 )
 
+# Import Whisper for transcription
+try:
+    import whisper
+except ImportError:
+    print("The 'whisper' package is not installed. Make sure to install it with 'pip install openai-whisper'")
+    sys.exit(1)
+
 # Load the vocoder
 vocoder = load_vocoder()
 
@@ -50,6 +57,9 @@ F5TTS_ema_model = load_model(
 chat_model_state = None
 chat_tokenizer_state = None
 
+# Load the Whisper transcription model
+transcription_model = whisper.load_model("base")  # You can choose other sizes such as 'small', 'medium', 'large'
+
 @gpu_decorator
 def generate_response(messages, model, tokenizer):
     """Generate a response using the chat model"""
@@ -135,7 +145,7 @@ def load_chat_model():
     if chat_model_state is None:
         model_name = "Qwen/Qwen2.5-3B-Instruct"
         chat_model_state = AutoModelForCausalLM.from_pretrained(
-            model_name, torch_dtype="auto", device_map="auto"
+            model_name, torch_dtype=torch.float16, device_map="auto"
         )
         chat_tokenizer_state = AutoTokenizer.from_pretrained(model_name)
     return chat_model_state, chat_tokenizer_state
@@ -154,7 +164,6 @@ with gr.Blocks() as app_chat:
 
     if not USING_SPACES:
         load_chat_model_btn = gr.Button("Load Chat Model", variant="primary")
-
     chat_interface_container = gr.Column(visible=False)
 
     @gpu_decorator
@@ -220,19 +229,19 @@ with gr.Blocks() as app_chat:
     )
 
     @gpu_decorator
-    def process_input(audio_path, text, history, conv_state):
+    def process_input(audio_path, text, history, conv_state, system_prompt):
         """Process the user's audio or text input and generate a response."""
         if not audio_path and not text.strip():
             return history, conv_state, ""
 
         if audio_path:
-            # Here you could integrate automatic transcription if you wish
-            # Currently, it assumes the text is provided if there is audio
-            # You can integrate Whisper or another transcription model if needed
-            # For example:
-            # text = transcribe_audio(audio_path)
-            # But for now, we use the provided text
-            pass
+            # Transcribe the audio using Whisper
+            result = transcription_model.transcribe(audio_path)
+            transcribed_text = result["text"].strip()
+            if transcribed_text:
+                text = transcribed_text
+            else:
+                text = text.strip()
 
         if not text.strip():
             return history, conv_state, ""
@@ -282,7 +291,7 @@ with gr.Blocks() as app_chat:
     # Handle audio input
     audio_input_chat.stop_recording(
         process_input,
-        inputs=[audio_input_chat, text_input_chat, chatbot_interface, conversation_state],
+        inputs=[audio_input_chat, text_input_chat, chatbot_interface, conversation_state, system_prompt_chat],
         outputs=[chatbot_interface, conversation_state, text_input_chat],
     ).then(
         generate_audio_response,
@@ -297,7 +306,7 @@ with gr.Blocks() as app_chat:
     # Handle text input
     text_input_chat.submit(
         process_input,
-        inputs=[audio_input_chat, text_input_chat, chatbot_interface, conversation_state],
+        inputs=[audio_input_chat, text_input_chat, chatbot_interface, conversation_state, system_prompt_chat],
        outputs=[chatbot_interface, conversation_state, text_input_chat],
     ).then(
         generate_audio_response,
@@ -312,7 +321,7 @@ with gr.Blocks() as app_chat:
     # Handle the send button
     send_btn_chat.click(
         process_input,
-        inputs=[audio_input_chat, text_input_chat, chatbot_interface, conversation_state],
+        inputs=[audio_input_chat, text_input_chat, chatbot_interface, conversation_state, system_prompt_chat],
         outputs=[chatbot_interface, conversation_state, text_input_chat],
     ).then(
         generate_audio_response,
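The heart of the change is the new audio branch in process_input: it now prefers the Whisper transcript and only falls back to the typed message when transcription comes back empty. A minimal standalone sketch of that flow, assuming openai-whisper is installed (the clip path and fallback text below are placeholders, not part of the commit):

import whisper

# "base" matches the commit; larger checkpoints ('small', 'medium', 'large')
# trade speed for accuracy
transcription_model = whisper.load_model("base")

def transcribe_or_fallback(audio_path: str, typed_text: str) -> str:
    """Prefer the Whisper transcript; fall back to the typed text."""
    result = transcription_model.transcribe(audio_path)
    transcribed = result["text"].strip()
    return transcribed if transcribed else typed_text.strip()

# Placeholder inputs for illustration only
print(transcribe_or_fallback("clip.wav", "hello there"))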
 
 
 
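The other functional change is in load_chat_model: torch_dtype moves from "auto" to torch.float16, which halves memory use against float32 but effectively assumes a CUDA device, since many float16 ops are unsupported or slow on CPU. A hedged variant that keeps the commit's intent while staying usable on CPU-only machines (the availability guard is an assumption added here, not part of the commit):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen2.5-3B-Instruct"
# Fall back to float32 where no CUDA device is present (assumption, not in the commit)
dtype = torch.float16 if torch.cuda.is_available() else torch.float32
# device_map="auto" requires the accelerate package
model = AutoModelForCausalLM.from_pretrained(
    model_name, torch_dtype=dtype, device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)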