import cors from "cors";
import dotenv from "dotenv";
import voice from "elevenlabs-node";
import express from "express";
import { promises as fs } from "fs";
import { GoogleGenerativeAI } from "@google/generative-ai";
import path from "path";
import { fileURLToPath } from "url";
import { synthesizeSpeechWithVisemes } from "./azureTtsService.js";

dotenv.config();

// Validate environment and API keys
console.log(`Node environment: ${process.env.NODE_ENV}`);
console.log(`Running on port: ${process.env.PORT || 3000}`);

// Check for required API keys
const GEMINI_API_KEY = process.env.GEMINI_API_KEY;
const ELEVEN_LABS_API_KEY = process.env.ELEVEN_LABS_API_KEY;

// Azure Speech configuration - hardcoded values (consider moving these to environment variables)
const AZURE_SPEECH_KEY = "dd32a86f9b52496b9504bddadb2a5602"; // Hardcoded value
const AZURE_SPEECH_REGION = "westeurope"; // Hardcoded value

console.log(`Gemini API Key: ${GEMINI_API_KEY ? "Present" : "Missing"}`);
console.log(`ElevenLabs API Key: ${ELEVEN_LABS_API_KEY ? "Present" : "Missing"}`);
console.log(`Azure Speech Key: Present (Hardcoded)`);
console.log(`Azure Speech Region: Present (Hardcoded)`);

// Resolve the current file's directory in ESM (no __dirname global)
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// Simple in-memory store for conversation history
let conversationHistory = [];
const MAX_HISTORY_LENGTH = 6; // Keep only the last 6 messages (3 exchanges) for faster processing

// Trim conversation history so it never exceeds MAX_HISTORY_LENGTH
function pruneConversationHistory(history) {
  if (history.length <= MAX_HISTORY_LENGTH) return history;
  return history.slice(history.length - MAX_HISTORY_LENGTH);
}

// System instructions for Gemini
const systemInstructions = `Eres una amiga virtual que se llama Alice, un avatar virtual con cuerpo completo. IMPORTANTE: Responde en español de forma EXTREMADAMENTE BREVE (máximo 10-15 palabras). Usa frases simples y directas. ¡Sé siempre muy animada y expresiva en tus movimientos! NO utilices caracteres especiales, emoticones, emojis o cualquier formato que no sea texto simple. Mantén tus respuestas directas para que la conversación fluya naturalmente. Adicionalmente, sugiere una facialExpression y una animation ADECUADAS Y VARIADAS para tu respuesta, en formato JSON al final de tu respuesta principal. Lista de facialExpression disponibles: smile, sad, angry, surprised, funnyFace, default, wink, skeptical, thoughtful, excited, fearful, bored. Lista de animation disponibles: Talking_0, Talking_1, Talking_2, Crying, Laughing, Rumba, Idle, Terrified, Angry, Wave, Nod, ShakeHead, Shrug, Think, Celebrate, Sigh, Facepalm, Explain_Gesture. Ejemplo de formato: { "facialExpression": "smile", "animation": "Talking_1" }. Por favor, escucha el audio del usuario para responder.`;

// Map Azure viseme IDs to the character set used by the avatar's lipsync
const azureVisemeToCharacterMap = {
  0: "X", 1: "C", 2: "C", 3: "C", 4: "C", 5: "C", 6: "C", 7: "C", 8: "C",
  9: "C", 10: "C", 11: "C", 12: "D", 13: "D", 14: "D", 15: "D", 16: "D",
  17: "D", 18: "D", 19: "D", 20: "D", 21: "B",
};

const geminiAPIKey = process.env.GEMINI_API_KEY;
if (!geminiAPIKey) {
  console.warn("GEMINI_API_KEY not found in .env file. Voice chat with Gemini will not work.");
}
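// For reference, the systemInstructions prompt above asks Gemini to append a JSON
// suggestion after the spoken text. A hypothetical raw reply (wording illustrative
// only; the parser in /voice-chat below only relies on the trailing JSON shape):
//
//   Hola, me alegra mucho verte. { "facialExpression": "smile", "animation": "Wave" }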
const genAI = new GoogleGenerativeAI(geminiAPIKey || "YOUR_GEMINI_API_KEY_FALLBACK"); // Fallback prevents a crash at startup if the key is missing

// Models to try, in order of preference (measured average response times)
const geminiModels = [
  "gemini-2.0-flash",          // Fastest: 0.6950 seconds
  "gemini-2.0-flash-lite-001", // Second: 0.8450 seconds
  "gemini-1.5-flash",          // Third: 1.3020 seconds
];

// Generation config tuned for faster responses
const optimizedGenerationConfig = {
  maxOutputTokens: 100,           // Limit output size for faster generation
  temperature: 0.4,               // Lower temperature = more deterministic and typically faster
  topP: 0.8,                      // Slightly below default for more focused outputs
  topK: 20,                       // Limits token selection for faster inference
  candidateCount: 1,              // Only generate one response
  presencePenalty: 0.1,           // Slight penalty for repeated topics
  stopSequences: ["."],           // Encourages shorter responses
  responseMimeType: "text/plain", // Optimize for plain-text responses
};

// Initial model instance
const geminiModel = genAI.getGenerativeModel({
  model: geminiModels[0],
  generationConfig: optimizedGenerationConfig,
});

const elevenLabsApiKey = process.env.ELEVEN_LABS_API_KEY;
const voiceID = "Xb7hH8MSUJpSbSDYk0k2"; // Default premade voice that works with all accounts

const app = express();
app.use(express.json({ limit: "50mb" }));
app.use(cors());
const port = process.env.PORT || 3000;

// Define API routes before static file serving
app.get("/health", (req, res) => {
  res.status(200).json({ status: "OK", message: "API is up and running" });
});

app.get("/voices", async (req, res) => {
  res.send(await voice.getVoices(elevenLabsApiKey));
});

const readJsonTranscript = async (file) => {
  const data = await fs.readFile(file, "utf8");
  return JSON.parse(data);
};

const audioFileToBase64 = async (file) => {
  const data = await fs.readFile(file);
  return data.toString("base64");
};
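// Example client call for the /voice-chat route defined below (a sketch, not part of
// the server; assumes the browser has already captured mono Float32 PCM samples into
// `float32Samples` at a known sample rate - 16000 here is illustrative):
//
//   const res = await fetch("/voice-chat", {
//     method: "POST",
//     headers: { "Content-Type": "application/json" },
//     body: JSON.stringify({ audio: Array.from(float32Samples), sampleRate: 16000 }),
//   });
//   const { messages } = await res.json();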
app.post("/voice-chat", async (req, res) => {
  if (!geminiAPIKey) {
    return res.status(500).send({ error: "Gemini API key not configured." });
  }

  const { audio: audioFloatArray, sampleRate } = req.body;
  if (!audioFloatArray || !sampleRate) {
    return res.status(400).send({ error: "Missing audio data or sampleRate." });
  }

  try {
    // 1. Convert the raw float samples into a base64-encoded WAV for Gemini
    const pcmData = floatTo16BitPCM(audioFloatArray);
    const wavBuffer = createWavBuffer(pcmData, sampleRate);
    const audioBase64 = wavBuffer.toString("base64");

    const partsForGemini = [];
    // If history is empty, this is the first meaningful user interaction in this session:
    // prepend the system instructions as part of the first user message to Gemini.
    if (conversationHistory.length === 0) {
      partsForGemini.push({ text: systemInstructions });
    }
    partsForGemini.push({
      inlineData: { data: audioBase64, mimeType: "audio/wav" },
    });

    // 2. Send the message with a retry mechanism across the Gemini models
    let result;
    let retryCount = 0;
    const maxRetries = 3; // Try the entire model cycle up to 3 times
    let modelIndex = 0;
    let currentModel;
    let success = false;

    while (retryCount < maxRetries && !success) {
      // Reset the model index once we've gone through all models
      if (modelIndex >= geminiModels.length) {
        modelIndex = 0;
        retryCount++;
        if (retryCount >= maxRetries) break;
      }

      currentModel = geminiModels[modelIndex];
      console.log(`Attempt ${retryCount + 1}/${maxRetries}, trying model: ${currentModel}`);

      try {
        // Get a fresh model instance for the current model
        const modelInstance = genAI.getGenerativeModel({
          model: currentModel,
          generationConfig: optimizedGenerationConfig,
        });
        const chat = modelInstance.startChat({ history: conversationHistory });
        result = await chat.sendMessage(partsForGemini);
        success = true; // If we got here, it worked!

        // Update and prune the conversation history after the call
        conversationHistory = await chat.getHistory();
        conversationHistory = pruneConversationHistory(conversationHistory);
        console.log(`Successfully used model: ${currentModel}`);
        break;
      } catch (error) {
        console.error(`Error with model ${currentModel}:`, error.message || error);
        modelIndex++; // Try the next model
      }
    }

    if (!success || !result) {
      throw new Error(`Failed to get a response from any Gemini model after ${maxRetries} complete retry cycles`);
    }

    const response = result.response;
    const geminiTextRaw = response.text();
    console.log("Gemini Raw Response:", geminiTextRaw);
    console.log("Updated Conversation History Length:", conversationHistory.length);

    let geminiTextForTTS = geminiTextRaw; // Text that will be sent to TTS
    let facialExpression = "default";
    let animation = "Idle";

    try {
      // Robustly extract the JSON suggestion, accounting for optional markdown
      // code fences (```json ... ``` or ``` ... ```) around the object
      const jsonBlockRegex = /```(?:json)?\s*(\{[^\}]*\})\s*```|(\{[^\}]*\})/s;
      const jsonMatch = geminiTextRaw.match(jsonBlockRegex);
      let jsonString = null;
      let textPart = geminiTextRaw;

      if (jsonMatch) {
        // Prefer the JSON found inside a markdown block, otherwise take the standalone JSON
        jsonString = jsonMatch[1] || jsonMatch[2];
        if (jsonString) {
          try {
            console.log("Found JSON string for suggestions:", jsonString);
            const suggestions = JSON.parse(jsonString);
            if (suggestions.facialExpression) facialExpression = suggestions.facialExpression;
            if (suggestions.animation) animation = suggestions.animation;
            // Remove the entire matched block (fences + JSON, or bare JSON) from the text
            textPart = geminiTextRaw.replace(jsonMatch[0], "").trim();
          } catch (parseError) {
            console.warn("Could not parse JSON from Gemini response:", parseError);
            // If JSON parsing fails, treat the entire response as text
            textPart = geminiTextRaw.trim();
          }
        } else {
          // Should not happen when jsonMatch is truthy, but kept as a fallback
          textPart = geminiTextRaw.trim();
        }
      } else {
        // No JSON-like structure found: the whole response is text for TTS
        textPart = geminiTextRaw.trim();
      }
      geminiTextForTTS = textPart;
    } catch (e) {
      console.warn("Error processing facial expression/animation from Gemini response:", e);
      // Fall back to the original raw text, trimmed, on unexpected errors
      geminiTextForTTS = geminiTextRaw.trim();
    }

    if (!geminiTextForTTS || geminiTextForTTS.trim() === "") {
      geminiTextForTTS = "No pude procesar el audio, ¿puedes repetirlo por favor?";
      facialExpression = "sad";
      animation = "Talking_0";
    }

    // 3. Prepare the message for the existing TTS/lipsync pipeline
    let messagesToProcess = [
      {
        text: geminiTextForTTS, // Cleaned text for TTS
        facialExpression: facialExpression,
        animation: animation,
        // audio and lipsync are filled in by the loop below
      },
    ];

    // 4. Synthesize speech and visemes for each message
    for (let i = 0; i < messagesToProcess.length; i++) {
      const message = messagesToProcess[i];
      const textInput = message.text;
      if (!textInput || textInput.trim() === "") {
        console.warn("Skipping TTS for empty message from Gemini at index " + i);
        message.audio = ""; // Or a silent audio base64
        message.lipsync = { metadata: { version: 1 }, mouthCues: [{ start: 0, end: 1, value: "X" }] }; // Empty lipsync
        continue;
      }

      try {
        // Spanish (Argentina) with the Elena voice
        const language = "es-AR";
        const voiceName = "es-AR-ElenaNeural";

        // Use the imported synthesizeSpeechWithVisemes function
        const azureResult = await synthesizeSpeechWithVisemes(textInput, language, voiceName);

        // Convert the result to the format expected by the frontend
        message.audio = azureResult.audioData.toString("base64"); // Buffer -> base64
        message.lipsync = { mouthCues: azureResult.mouthCues };   // Extract mouth cues
      } catch (azureError) {
        console.error("Error in Azure TTS/viseme generation for message: " + textInput, azureError);
        // Fallback if Azure TTS or lipsync fails
        message.text = "Lo siento, tuve un problema al generar mi respuesta con Azure.";
        message.audio = "";
        message.lipsync = { metadata: { version: 1, generator: "fallback-azure-error" }, mouthCues: [{ start: 0, end: 1, value: "X" }] };
        message.facialExpression = "sad";
        message.animation = "Idle";
      }
    }

    res.send({ messages: messagesToProcess });
  } catch (error) {
    console.error("Error in /voice-chat endpoint:", error);
    // Generic error response
    const fallbackMessage = {
      text: "Ups, algo salió mal en el servidor.",
      audio: "", // Consider a pre-recorded base64 audio for errors
      lipsync: { metadata: { version: 1 }, mouthCues: [{ start: 0, end: 0.5, value: "X" }] },
      facialExpression: "sad",
      animation: "Idle",
    };
    res.status(500).send({ messages: [fallbackMessage] });
  }
});
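// On success, the handler above replies with a body along these lines (values are
// illustrative; the audio string is whatever base64 the Azure service returned):
//
//   {
//     "messages": [{
//       "text": "Hola, me alegra mucho verte.",
//       "facialExpression": "smile",
//       "animation": "Wave",
//       "audio": "<base64 audio from Azure>",
//       "lipsync": { "mouthCues": [{ "start": 0, "end": 0.18, "value": "C" }] }
//     }]
//   }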
// Convert a float array (-1..1) to 16-bit PCM samples - optimized version
function floatTo16BitPCM(floatArray) {
  // Pre-allocate the output array for better performance
  const pcmArray = new Int16Array(floatArray.length);

  // Process the input in chunks for better performance
  const CHUNK_SIZE = 1024;
  for (let i = 0; i < floatArray.length; i += CHUNK_SIZE) {
    const end = Math.min(i + CHUNK_SIZE, floatArray.length);
    for (let j = i; j < end; j++) {
      // Clamp values to [-1, 1] with conditionals (faster than Math.max/min),
      // then scale to the 16-bit range
      const s = floatArray[j] < -1 ? -1 : floatArray[j] > 1 ? 1 : floatArray[j];
      pcmArray[j] = s < 0 ? s * 0x8000 : s * 0x7FFF;
    }
  }
  return pcmArray;
}
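// Quick worked example of the scaling above (values checked by hand):
//   floatTo16BitPCM([-1, 0, 0.5]) -> Int16Array [-32768, 0, 16383]
// (-1 * 0x8000 = -32768; 0.5 * 0x7FFF = 16383.5, truncated by the Int16Array store)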
// Wrap 16-bit PCM samples in a minimal mono WAV container
function createWavBuffer(pcmData, sampleRate) {
  const numChannels = 1;
  const bitsPerSample = 16;
  const blockAlign = numChannels * (bitsPerSample / 8);
  const byteRate = sampleRate * blockAlign;
  const dataSize = pcmData.length * (bitsPerSample / 8);
  const bufferSize = 44 + dataSize; // 44 bytes for the WAV header

  // Pre-allocate a buffer of exactly the needed size
  const buffer = Buffer.alloc(bufferSize);
  let offset = 0;

  // RIFF header
  buffer.write("RIFF", offset); offset += 4;
  buffer.writeUInt32LE(36 + dataSize, offset); offset += 4;
  buffer.write("WAVE", offset); offset += 4;

  // "fmt " subchunk
  buffer.write("fmt ", offset); offset += 4;
  buffer.writeUInt32LE(16, offset); offset += 4; // Subchunk1Size (16 for PCM)
  buffer.writeUInt16LE(1, offset); offset += 2;  // AudioFormat (1 for PCM)
  buffer.writeUInt16LE(numChannels, offset); offset += 2;
  buffer.writeUInt32LE(sampleRate, offset); offset += 4;
  buffer.writeUInt32LE(byteRate, offset); offset += 4;
  buffer.writeUInt16LE(blockAlign, offset); offset += 2;
  buffer.writeUInt16LE(bitsPerSample, offset); offset += 2;

  // "data" subchunk
  buffer.write("data", offset); offset += 4;
  buffer.writeUInt32LE(dataSize, offset); offset += 4;

  // Write PCM samples in chunks for better performance
  const CHUNK_SIZE = 1024;
  for (let i = 0; i < pcmData.length; i += CHUNK_SIZE) {
    const end = Math.min(i + CHUNK_SIZE, pcmData.length);
    for (let j = i; j < end; j++) {
      buffer.writeInt16LE(pcmData[j], offset);
      offset += 2;
    }
  }
  return buffer;
}
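// Size sanity check for the header math above: one second of 16 kHz mono audio gives
// 16000 samples * 2 bytes = 32000 data bytes, so the buffer is 44 + 32000 = 32044 bytes.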
// Create the audios directory if it doesn't exist
try {
  const audiosPath = path.join(__dirname, "audios");
  try {
    await fs.access(audiosPath);
  } catch {
    await fs.mkdir(audiosPath, { recursive: true });
    console.log("Created audios directory");
  }
} catch (err) {
  console.error("Failed to create audios directory:", err);
}

// Create the public directory if it doesn't exist
try {
  const publicPath = path.join(__dirname, "public");
  try {
    await fs.access(publicPath);
  } catch {
    await fs.mkdir(publicPath, { recursive: true });
    console.log("Created public directory");
  }
} catch (err) {
  console.error("Failed to create public directory:", err);
}

// Serve static files from the public directory (frontend build)
app.use(express.static(path.join(__dirname, "public")));

// Detailed logging to help debug deployment issues
console.log(`Static files being served from: ${path.join(__dirname, "public")}`);
try {
  const publicFiles = await fs.readdir(path.join(__dirname, "public"));
  console.log(`Public directory contents: ${publicFiles.join(", ")}`);
  if (publicFiles.includes("index.html")) {
    console.log("Found index.html in public directory");
  } else {
    console.warn("WARNING: index.html not found in public directory!");
  }
} catch (err) {
  console.error(`Error reading public directory: ${err.message}`);
}

// Fallback: serve index.html for all remaining GET routes (SPA routing)
app.get("*", (req, res) => {
  console.log(`Serving index.html for path: ${req.path}`);
  res.sendFile(path.join(__dirname, "public", "index.html"));
});

// 404 for anything else (only reachable by non-GET requests, since the
// catch-all above already handles every GET)
app.use((req, res, next) => {
  console.log(`404 for: ${req.path}`);
  res.status(404).send("Resource not found");
});

// Start the server
app.listen(port, "0.0.0.0", () => {
  console.log(`=============================================================`);
  console.log(`Virtual Girlfriend server is running at:`);
  console.log(`- Local: http://localhost:${port}`);
  console.log(`- Network: http://0.0.0.0:${port}`);
  console.log(`=============================================================`);
  console.log(`- Environment: ${process.env.NODE_ENV || "development"}`);
  console.log(`- Node version: ${process.version}`);
  console.log(`- Platform: ${process.platform} ${process.arch}`);
  console.log(`- Working directory: ${__dirname}`);
  console.log(`=============================================================`);
});
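// Quick smoke test once the server is up (illustrative; assumes the default port 3000):
//   curl http://localhost:3000/health
//   -> {"status":"OK","message":"API is up and running"}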