# main.py
"""Conversational AI chatbot server.

Pipeline: speech-to-text (Whisper) -> LLM reply (Gemini) -> text-to-speech
(IndicParler-TTS).  Exposes three HTTP endpoints (``/api/stt``, ``/api/llm``,
``/api/tts``) and one WebSocket endpoint (``/ws/conversation``) that streams
the synthesized reply back as raw 16-bit mono PCM chunks.
"""
import asyncio
import base64
import io
import json
import logging
import os
import tempfile
import time  # For timeout checks
from threading import Event, Lock, Thread  # Event gives cooperative cancellation
from typing import Optional, Tuple

import google.generativeai as genai
import numpy as np
import soundfile as sf
import torch
import uvicorn
import whisper
from fastapi import FastAPI, File, UploadFile, WebSocket, WebSocketDisconnect
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import HTMLResponse, JSONResponse
from parler_tts import ParlerTTSForConditionalGeneration, ParlerTTSStreamer
from transformers import AutoTokenizer, GenerationConfig

# --- Configuration ---
WHISPER_MODEL_SIZE = os.getenv("WHISPER_MODEL_SIZE", "tiny")
TTS_MODEL_NAME = "ai4bharat/indic-parler-tts"
# SECURITY: the key must come from the environment only.  A literal key used
# as the fallback value ends up in version control and must be treated as
# leaked (rotate it).  With no key set, the LLM is reported as unavailable.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")
GEMINI_MODEL_NAME = "gemini-1.5-flash-latest"

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
attn_implementation = "flash_attention_2" if torch.cuda.is_available() else "eager"
if DEVICE == "cuda":
    torch_dtype_tts = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
else:
    torch_dtype_tts = torch.float32
torch_dtype_whisper = torch.float16 if DEVICE == "cuda" else torch.float32

TTS_DEFAULT_PARAMS = {
    "do_sample": True,
    "temperature": 1.0,
    "top_k": 50,
    "top_p": 0.95,
    "min_new_tokens": 5,  # Reduced for quicker start with streamer
    # "max_new_tokens": 256,  # Optional global cap
}
DEFAULT_TTS_DESCRIPTION = "A clear, female voice speaking in English."
DEFAULT_TTS_SAMPLING_RATE = 24000  # Fallback when the model config has no rate
DEFAULT_TTS_FRAME_RATE = 100  # Fallback (Hz) for the play_steps calculation

# Fixed replies returned by generate_gemini_response() on failure.  Callers
# compare against these constants instead of searching for substrings, so a
# genuine LLM answer that happens to contain "unavailable" is not misread.
LLM_UNAVAILABLE_MESSAGE = "Sorry, the language model is currently unavailable."
LLM_ERROR_MESSAGE = "Sorry, I encountered an error trying to respond."
LLM_EMPTY_MESSAGE = "I'm sorry, I couldn't generate a response for that."

# --- Logging ---
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# --- FastAPI App Initialization ---
app = FastAPI(title="Conversational AI Chatbot with Enhanced Stream Abortion")
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# --- Global Model Variables ---
whisper_model = None
gemini_model_instance = None
tts_model = None
tts_tokenizer = None

# Blocking inference used to run directly on the event loop, which implicitly
# serialised it.  Now that it runs in worker threads, these locks keep the
# same one-at-a-time behaviour for Whisper and for the HTTP TTS path.
_whisper_lock = Lock()
_tts_http_lock = Lock()


# --- Model Loading & API Configuration ---
@app.on_event("startup")
async def load_resources():
    """Load Whisper and the TTS model, then configure the Gemini client.

    Failures are logged rather than raised so the server still starts; the
    endpoints check the module-level globals and answer 503 when a resource
    is missing.  Gemini is configured independently of the local models, so
    a failed model download does not also disable the LLM.
    """
    global whisper_model, tts_model, tts_tokenizer, gemini_model_instance

    logger.info(
        f"Loading local models. Whisper on {DEVICE} with {torch_dtype_whisper}, "
        f"TTS on {DEVICE} with {torch_dtype_tts}"
    )
    try:
        logger.info(f"Loading Whisper model: {WHISPER_MODEL_SIZE}")
        whisper_model = whisper.load_model(WHISPER_MODEL_SIZE, device=DEVICE)
        logger.info("Whisper model loaded successfully.")

        logger.info(f"Loading IndicParler-TTS model: {TTS_MODEL_NAME}")
        tts_model = ParlerTTSForConditionalGeneration.from_pretrained(
            TTS_MODEL_NAME, attn_implementation=attn_implementation
        ).to(DEVICE, dtype=torch_dtype_tts)
        tts_tokenizer = AutoTokenizer.from_pretrained(TTS_MODEL_NAME)
        # Only pad_token_id is forwarded; eos_token_id is deliberately left
        # out because (per the original author's note) ParlerTTS may not use
        # it the way text models do.
        if tts_tokenizer and tts_tokenizer.pad_token_id is not None:
            TTS_DEFAULT_PARAMS["pad_token_id"] = tts_tokenizer.pad_token_id
        logger.info(f"IndicParler-TTS model loaded. Default generation params: {TTS_DEFAULT_PARAMS}")
    except Exception as e:
        logger.error(f"Error loading models: {e}", exc_info=True)

    if not GEMINI_API_KEY:
        logger.warning("GEMINI_API_KEY not found. LLM functionality will be limited.")
    else:
        try:
            genai.configure(api_key=GEMINI_API_KEY)
            gemini_model_instance = genai.GenerativeModel(GEMINI_MODEL_NAME)
            logger.info(f"Gemini API configured with model: {GEMINI_MODEL_NAME}")
        except Exception as e:
            logger.error(f"Failed to configure Gemini API: {e}", exc_info=True)
            gemini_model_instance = None

    # Report what actually loaded instead of claiming success unconditionally.
    logger.info(
        "Startup finished. whisper_loaded=%s, tts_loaded=%s, gemini_configured=%s",
        whisper_model is not None,
        tts_model is not None and tts_tokenizer is not None,
        gemini_model_instance is not None,
    )


# --- Helper Functions ---
def _tts_sampling_rate() -> int:
    """Return the TTS output sampling rate, or 24000 if the config lacks one."""
    audio_encoder = getattr(getattr(tts_model, "config", None), "audio_encoder", None)
    rate = getattr(audio_encoder, "sampling_rate", None)
    if rate is None:
        logger.warning("Could not find tts_model.config.audio_encoder.sampling_rate, defaulting to 24000")
        return DEFAULT_TTS_SAMPLING_RATE
    return rate


def _tts_generation_params() -> dict:
    """Return a fresh copy of the generation kwargs, with pad_token_id if known."""
    params = dict(TTS_DEFAULT_PARAMS)
    if tts_tokenizer is not None and tts_tokenizer.pad_token_id is not None:
        params["pad_token_id"] = tts_tokenizer.pad_token_id
    return params


def _build_tts_inputs(text: str, description: str) -> dict:
    """Tokenize the voice description and the prompt into generate() kwargs."""
    description_inputs = tts_tokenizer(description, return_tensors="pt")
    prompt_inputs = tts_tokenizer(text, return_tensors="pt")
    inputs = {
        "input_ids": description_inputs.input_ids.to(DEVICE),
        "prompt_input_ids": prompt_inputs.input_ids.to(DEVICE),
    }
    attention_mask = getattr(description_inputs, "attention_mask", None)
    if attention_mask is not None:
        inputs["attention_mask"] = attention_mask.to(DEVICE)
    return inputs


def _to_pcm16(audio) -> np.ndarray:
    """Convert float audio (torch tensor or NumPy array) to int16 PCM samples."""
    if isinstance(audio, torch.Tensor):
        audio = audio.detach().cpu().to(torch.float32).numpy()
    samples = np.asarray(audio, dtype=np.float32).squeeze()
    return np.clip(samples * 32767, -32768, 32767).astype(np.int16)


async def transcribe_audio_bytes(audio_bytes: bytes) -> str:
    """Transcribe an encoded audio payload with Whisper.

    The bytes are written to a temporary file because the model is given a
    path.  Transcription runs in the default executor so the event loop stays
    responsive.

    Returns:
        The stripped transcript, or "" if transcription failed.

    Raises:
        RuntimeError: if the Whisper model has not been loaded.
    """
    if not whisper_model:
        raise RuntimeError("Whisper model not loaded.")

    temp_audio_path = None
    try:
        with tempfile.NamedTemporaryFile(
            prefix="temp_audio_main_", suffix=".wav", delete=False
        ) as temp_file:
            temp_audio_path = temp_file.name
            temp_file.write(audio_bytes)

        use_fp16 = DEVICE == "cuda" and torch_dtype_whisper == torch.float16

        def _run_whisper():
            with _whisper_lock:
                return whisper_model.transcribe(temp_audio_path, fp16=use_fp16)

        loop = asyncio.get_running_loop()
        result = await loop.run_in_executor(None, _run_whisper)
        transcribed_text = result["text"].strip()
        logger.info(f"Transcription: {transcribed_text}")
        return transcribed_text
    except Exception as e:
        logger.error(f"Error during transcription: {e}", exc_info=True)
        return ""
    finally:
        if temp_audio_path and os.path.exists(temp_audio_path):
            try:
                os.remove(temp_audio_path)
            except Exception as e_del:
                logger.error(f"Error deleting temp audio file {temp_audio_path}: {e_del}")


def _extract_gemini_text(response) -> Optional[str]:
    """Return the reply text from a Gemini response, or None if there is none.

    Each accessor is tried in turn.  The quick accessors are wrapped because
    they can raise (not merely return empty) when the response carries no
    usable content, e.g. after a safety block.
    """
    try:
        quick_text = response.text
    except (AttributeError, ValueError):
        quick_text = None
    if quick_text:
        return quick_text.strip()

    try:
        parts = response.parts
    except (AttributeError, ValueError):
        parts = None
    if parts:
        return "".join(part.text for part in parts).strip()

    try:
        return response.candidates[0].content.parts[0].text.strip()
    except (AttributeError, IndexError, ValueError):
        return None


def _describe_gemini_block(response) -> str:
    """Build the log suffix explaining why a Gemini response came back empty."""
    feedback = getattr(response, "prompt_feedback", None)
    if feedback:
        return f" Safety Feedback: {feedback}"
    candidates = getattr(response, "candidates", None)
    if candidates:
        finish_reason = getattr(candidates[0], "finish_reason", None)
        # finish_reason may be an enum; compare by name so "STOP" matches.
        reason_name = getattr(finish_reason, "name", str(finish_reason))
        if finish_reason is not None and reason_name != "STOP":
            return f" Finish Reason: {finish_reason}"
    return ""


async def generate_gemini_response(text: str) -> str:
    """Ask Gemini for a reply to ``text``.

    Returns:
        The reply text.  On failure one of the fixed messages is returned:
        LLM_UNAVAILABLE_MESSAGE (no client), LLM_ERROR_MESSAGE (exception) or
        LLM_EMPTY_MESSAGE (empty/blocked response).  Never raises.
    """
    if not gemini_model_instance:
        logger.error("Gemini model instance not available.")
        return LLM_UNAVAILABLE_MESSAGE

    try:
        full_prompt = f"User: {text}\nAssistant:"
        loop = asyncio.get_running_loop()
        response = await loop.run_in_executor(
            None, gemini_model_instance.generate_content, full_prompt
        )

        response_text = _extract_gemini_text(response)
        if response_text is None:
            safety_feedback = _describe_gemini_block(response)
            logger.warning(f"Gemini response might be empty or blocked.{safety_feedback}")
            response_text = LLM_EMPTY_MESSAGE

        logger.info(f"Gemini LLM Response: {response_text}")
        return response_text
    except Exception as e:
        logger.error(f"Error during Gemini LLM generation: {e}", exc_info=True)
        return LLM_ERROR_MESSAGE


async def synthesize_speech_streaming(
    text: str,
    description: str = DEFAULT_TTS_DESCRIPTION,
    play_steps_in_s: float = 0.4,
    cancellation_event: Optional[Event] = None,
):
    """Stream synthesized speech for ``text`` as raw int16 PCM byte chunks.

    Generation runs in a background thread feeding a ParlerTTSStreamer; this
    async generator pulls chunks from the streamer via the default executor.

    Args:
        text: Text to speak.  Empty/blank text yields a single ``b""``.
        description: Voice description passed to the TTS model.
        play_steps_in_s: Approximate audio duration per emitted chunk.
        cancellation_event: When set, the generator stops yielding.  A fresh
            Event is created per call if omitted (a shared default instance
            would leak a cancellation from one call into all later ones).

    Yields:
        PCM bytes; a single ``b""`` signals "no audio" (not loaded, empty
        input, or an internal error).

    NOTE(review): setting the event stops *forwarding* audio; nothing here
    interrupts ``tts_model.generate`` itself, so the worker thread may keep
    running until generation completes.
    """
    if cancellation_event is None:
        cancellation_event = Event()

    if not tts_model or not tts_tokenizer:
        logger.error("TTS model or tokenizer not loaded.")
        if cancellation_event.is_set():
            logger.info("TTS cancelled before start.")
        yield b""
        return

    if not text or not text.strip():
        logger.warning("TTS input text is empty. Yielding empty audio.")
        if cancellation_event.is_set():
            logger.info("TTS cancelled before start (empty text).")
        yield b""
        return

    thread = None
    try:
        logger.info(f"Starting TTS streaming with ParlerTTSStreamer for: \"{text[:50]}...\"")

        sampling_rate = _tts_sampling_rate()
        audio_encoder = getattr(tts_model.config, "audio_encoder", None)
        frame_rate = getattr(audio_encoder, "frame_rate", None)
        if frame_rate is None:
            logger.warning(
                "frame_rate not found in tts_model.config.audio_encoder. "
                "Using default of 100 Hz for play_steps calculation."
            )
            frame_rate = DEFAULT_TTS_FRAME_RATE
        play_steps = max(1, int(frame_rate * play_steps_in_s))
        logger.info(
            f"Streamer params: sampling_rate={sampling_rate}, frame_rate={frame_rate}, "
            f"play_steps_in_s={play_steps_in_s}, play_steps={play_steps}"
        )

        streamer = ParlerTTSStreamer(tts_model, device=DEVICE, play_steps=play_steps)
        # Generation parameters are passed as plain kwargs, as in the
        # streamer usage this code was modelled on.
        thread_generation_kwargs = {
            **_build_tts_inputs(text, description),
            "streamer": streamer,
            **_tts_generation_params(),
        }

        def _generate_in_thread():
            try:
                logger.info("TTS generation thread started.")
                with torch.no_grad():
                    tts_model.generate(**thread_generation_kwargs)
                logger.info("TTS generation thread finished model.generate().")
            except Exception as e_thread:
                logger.error(f"Error in TTS generation thread: {e_thread}", exc_info=True)
            finally:
                # Guarantees the consumer sees end-of-stream even if
                # generate() raised before signalling it.
                streamer.end()
                logger.info("TTS generation thread called streamer.end().")

        thread = Thread(target=_generate_in_thread)
        thread.daemon = True
        thread.start()

        # StopIteration cannot travel through an asyncio Future, so the end
        # of the stream is reported with a sentinel object instead.
        stream_done = object()

        def _next_chunk():
            try:
                return next(streamer)
            except StopIteration:
                return stream_done

        loop = asyncio.get_running_loop()
        while True:
            if cancellation_event.is_set():
                logger.info("TTS streaming cancelled by event.")
                break
            try:
                # next(streamer) blocks, so it runs in the executor.
                audio_chunk = await loop.run_in_executor(None, _next_chunk)
                if audio_chunk is stream_done:
                    logger.info("Streamer finished (StopIteration).")
                    break
                if audio_chunk is None:
                    logger.info("Streamer yielded None explicitly, ending stream.")
                    break
                # Accept both tensors and NumPy arrays: the streamer appears
                # to emit NumPy arrays — TODO confirm against the installed
                # parler_tts version.
                if not isinstance(audio_chunk, (torch.Tensor, np.ndarray)):
                    await asyncio.sleep(0.01)
                    continue
                audio_chunk_int16 = _to_pcm16(audio_chunk)
                if audio_chunk_int16.size == 0:
                    await asyncio.sleep(0.01)  # Empty but not done yet
                    continue
            except Exception as e_stream_iter:
                logger.error(f"Error iterating streamer: {e_stream_iter}", exc_info=True)
                break
            yield audio_chunk_int16.tobytes()

        logger.info(f"Finished TTS streaming iteration for: \"{text[:50]}...\"")
    except Exception as e:
        logger.error(f"Error in synthesize_speech_streaming function: {e}", exc_info=True)
        yield b""
    finally:
        logger.info("Exiting synthesize_speech_streaming. Ensuring streamer is ended and thread is joined.")
        # streamer.end() is intentionally NOT called here: the worker thread
        # always calls it in its own finally block, and calling it from this
        # thread while generation is still running would race with it.
        if thread is not None and thread.is_alive():
            logger.info("Waiting for TTS generation thread to complete in finally block...")
            final_join_start_time = time.time()
            # Join off the event loop so other connections are not stalled.
            await asyncio.get_running_loop().run_in_executor(None, thread.join, 2.0)
            if thread.is_alive():
                logger.warning(
                    f"TTS generation thread still alive after "
                    f"{time.time() - final_join_start_time:.2f}s in finally block."
                )


def _synthesize_wav(text: str, description: str) -> Tuple[bytes, int]:
    """Synthesize ``text`` in one shot; return (WAV file bytes, sampling rate).

    Blocking: intended to be called from a worker thread.
    """
    generation_config = GenerationConfig(**_tts_generation_params())
    with _tts_http_lock, torch.no_grad():
        generation = tts_model.generate(
            **_build_tts_inputs(text, description),
            generation_config=generation_config,
        )
    sampling_rate = _tts_sampling_rate()
    wav_buffer = io.BytesIO()
    sf.write(
        wav_buffer,
        _to_pcm16(generation),
        samplerate=sampling_rate,
        format="WAV",
        subtype="PCM_16",
    )
    return wav_buffer.getvalue(), sampling_rate


# --- FastAPI HTTP Endpoints ---
@app.post("/api/stt", summary="Speech to Text")
async def speech_to_text_endpoint(file: UploadFile = File(...)):
    """Transcribe an uploaded audio file; responds ``{"transcription": str}``."""
    if not whisper_model:
        return JSONResponse(content={"error": "Whisper model not loaded"}, status_code=503)
    try:
        audio_bytes = await file.read()
        transcribed_text = await transcribe_audio_bytes(audio_bytes)
        return {"transcription": transcribed_text}
    except Exception as e:
        return JSONResponse(content={"error": str(e)}, status_code=500)


@app.post("/api/llm", summary="LLM Response Generation (Gemini)")
async def llm_endpoint(payload: dict):
    """Generate a Gemini reply for ``payload["text"]``; responds ``{"response": str}``."""
    if not gemini_model_instance:
        return JSONResponse(
            content={"error": "Gemini LLM not configured or API key missing"},
            status_code=503,
        )
    try:
        text = payload.get("text")
        if not text:
            return JSONResponse(content={"error": "No text provided"}, status_code=400)
        response = await generate_gemini_response(text)
        return {"response": response}
    except Exception as e:
        return JSONResponse(content={"error": str(e)}, status_code=500)


@app.post("/api/tts", summary="Text to Speech (Non-Streaming for HTTP)")
async def text_to_speech_endpoint(payload: dict):
    """Synthesize ``payload["text"]`` and return it as a base64-encoded WAV.

    Optional ``payload["description"]`` selects the voice description.
    Generation runs in the executor so the event loop is not blocked.
    """
    if not tts_model or not tts_tokenizer:
        return JSONResponse(content={"error": "TTS model/tokenizer not loaded"}, status_code=503)
    try:
        text = payload.get("text")
        description = payload.get("description", DEFAULT_TTS_DESCRIPTION)
        if not text:
            return JSONResponse(content={"error": "No text provided"}, status_code=400)

        loop = asyncio.get_running_loop()
        audio_bytes, current_sampling_rate = await loop.run_in_executor(
            None, _synthesize_wav, text, description
        )
        if not audio_bytes:
            return JSONResponse(content={"error": "TTS failed to generate audio"}, status_code=500)

        audio_base64 = base64.b64encode(audio_bytes).decode("utf-8")
        return {"audio_base64": audio_base64, "format": "wav", "sample_rate": current_sampling_rate}
    except Exception as e:
        logger.error(f"TTS endpoint error: {e}", exc_info=True)
        return JSONResponse(content={"error": str(e)}, status_code=500)


async def _stream_tts_reply(websocket: WebSocket, reply_text: str, cancel_event: Event) -> None:
    """Send one TTS reply over the socket: START header, PCM chunks, then END.

    Sets ``cancel_event`` and stops early if the client goes away; in that
    case TTS_STREAM_END is not sent.
    """
    audio_params_msg = "TTS_STREAM_START:" + json.dumps(
        {"sample_rate": _tts_sampling_rate(), "channels": 1, "bit_depth": 16}
    )
    await websocket.send_text(audio_params_msg)
    logger.info(f"Sent to client {websocket.client}: {audio_params_msg}")

    chunk_count = 0
    cancel_event.clear()  # Reset event for new TTS task
    tts_stream = synthesize_speech_streaming(
        reply_text, DEFAULT_TTS_DESCRIPTION, cancellation_event=cancel_event
    )
    try:
        async for audio_chunk_bytes in tts_stream:
            if not audio_chunk_bytes:
                logger.debug(
                    f"Received empty bytes from streaming generator for {websocket.client}, "
                    "might be end or error in generator."
                )
                continue
            try:
                if websocket.client_state.name != "CONNECTED":
                    logger.warning(
                        f"Client {websocket.client} disconnected during TTS stream. Aborting TTS."
                    )
                    cancel_event.set()
                    break
                await websocket.send_bytes(audio_chunk_bytes)
                chunk_count += 1
            except Exception as send_err:
                logger.warning(
                    f"Error sending audio chunk to {websocket.client}: {send_err}. "
                    "Client likely disconnected."
                )
                cancel_event.set()
                break
    finally:
        # Close explicitly so the generator's cleanup runs now rather than
        # whenever the abandoned generator is garbage-collected.
        await tts_stream.aclose()

    if not cancel_event.is_set():  # Only send END if not cancelled
        logger.info(f"Sent {chunk_count} TTS audio chunks to client {websocket.client}.")
        await websocket.send_text("TTS_STREAM_END")
        logger.info(f"Sent TTS_STREAM_END to client {websocket.client}.")
    else:
        logger.info(
            f"TTS stream for {websocket.client} was cancelled. "
            f"Sent {chunk_count} chunks before cancellation."
        )


# --- WebSocket Endpoint for Real-time Conversation ---
@app.websocket("/ws/conversation")
async def conversation_websocket(websocket: WebSocket):
    """Run the audio -> transcript -> LLM -> streamed speech loop for one client.

    Text frames sent to the client are prefixed with USER_TRANSCRIPT:,
    ASSISTANT_RESPONSE_TEXT:, TTS_STREAM_START:, TTS_STREAM_END or
    SYSTEM_ERROR:; audio is sent as binary frames in between START and END.
    """
    await websocket.accept()
    logger.info(f"WebSocket connection accepted from: {websocket.client}")
    tts_cancellation_event = Event()  # For this specific connection

    try:
        while True:
            if websocket.client_state.name != "CONNECTED":
                logger.info(f"WebSocket client {websocket.client} disconnected before receive.")
                break

            audio_data = await websocket.receive_bytes()
            logger.info(f"Received {len(audio_data)} bytes of user audio data from {websocket.client}.")
            if not audio_data:
                logger.warning(f"Received empty audio data from user {websocket.client}.")
                continue

            transcribed_text = await transcribe_audio_bytes(audio_data)
            if not transcribed_text:
                logger.warning(f"Transcription failed for {websocket.client}.")
                await websocket.send_text("SYSTEM_ERROR: Transcription failed.")
                continue
            await websocket.send_text(f"USER_TRANSCRIPT: {transcribed_text}")

            llm_response_text = await generate_gemini_response(transcribed_text)
            llm_failed = not llm_response_text or llm_response_text in (
                LLM_ERROR_MESSAGE,
                LLM_UNAVAILABLE_MESSAGE,
            )
            if llm_failed:
                logger.warning(f"LLM (Gemini) failed for {websocket.client}: {llm_response_text}")
                await websocket.send_text(f"SYSTEM_ERROR: LLM failed. ({llm_response_text})")
                continue
            await websocket.send_text(f"ASSISTANT_RESPONSE_TEXT: {llm_response_text}")

            await _stream_tts_reply(websocket, llm_response_text, tts_cancellation_event)
    except WebSocketDisconnect:
        logger.info(f"WebSocket connection closed by client {websocket.client}.")
        tts_cancellation_event.set()  # Signal any active TTS to stop
    except Exception as e:
        logger.error(f"Error in WebSocket conversation with {websocket.client}: {e}", exc_info=True)
        tts_cancellation_event.set()  # Signal any active TTS to stop
        try:
            if websocket.client_state.name == "CONNECTED":
                await websocket.send_text(f"SYSTEM_ERROR: An unexpected error occurred: {str(e)}")
        except Exception:
            # Best effort: the socket is probably already gone.
            pass
    finally:
        logger.info(f"Cleaning up WebSocket connection for {websocket.client}.")
        tts_cancellation_event.set()  # Ensure event is set on any exit path
        if websocket.client_state.name in ("CONNECTED", "CONNECTING"):
            try:
                await websocket.close()
            except Exception:
                # Best effort: closing an already-closed socket is harmless.
                pass
        logger.info(f"WebSocket connection resources cleaned up for {websocket.client}.")


@app.get("/", response_class=HTMLResponse)
async def get_home():
    """Serve the chatbot home page."""
    # NOTE(review): this string contains only text, no HTML tags or script;
    # the markup appears to have been stripped before this file was captured.
    # It is reproduced as found — restore the full page from the original.
    html_content = """ Conversational AI Chatbot (Streaming)

Real-time AI Chatbot (Streaming TTS)

You (transcribed): ...
Assistant (text): ...

Status: Idle

"""
    return HTMLResponse(content=html_content)


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000, log_level="info")