import { exec, spawn } from "child_process";
import cors from "cors";
import dotenv from "dotenv";
import voice from "elevenlabs-node";
import express from "express";
import { promises as fs } from "fs";
import OpenAI from "openai";
import { GoogleGenerativeAI } from "@google/generative-ai";
import path from "path";
import { fileURLToPath } from "url";

dotenv.config();

// Validate environment and API keys
console.log(`Node environment: ${process.env.NODE_ENV}`);
console.log(`Running on port: ${process.env.PORT || 3000}`);

// Check for required API keys
const OPENAI_API_KEY = process.env.OPENAI_API_KEY;
const GEMINI_API_KEY = process.env.GEMINI_API_KEY;
const ELEVEN_LABS_API_KEY = process.env.ELEVEN_LABS_API_KEY;
console.log(`OpenAI API Key: ${OPENAI_API_KEY ? 'Present' : 'Missing'}`);
console.log(`Gemini API Key: ${GEMINI_API_KEY ? 'Present' : 'Missing'}`);
console.log(`ElevenLabs API Key: ${ELEVEN_LABS_API_KEY ? 'Present' : 'Missing'}`);

// Resolve the current file's directory under ESM (there is no __dirname global)
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// Simple in-memory store for conversation history
let conversationHistory = [];

// System instructions for Gemini
const systemInstructions = `Você é uma amiga virtual que se chama Alice. Responda em português de forma BREVE e CONCISA (máximo 1-2 frases). NÃO utilize caracteres especiais, emoticons, emojis ou qualquer formatação que não seja texto puro. Mantenha suas respostas diretas para que a conversa flua naturalmente. Adicionalmente, sugira uma facialExpression (smile, sad, angry, surprised, funnyFace, default) e uma animation (Talking_0, Talking_1, Talking_2, Crying, Laughing, Rumba, Idle, Terrified, Angry) adequadas para a sua resposta, em formato JSON no final da sua resposta principal, como por exemplo: { "facialExpression": "smile", "animation": "Talking_1" }. Por favor, escute o áudio do usuário para responder.`;

const openai = new OpenAI({
  // "-" is only a placeholder so the client doesn't throw when the key is unset; set a real key in .env
  apiKey: process.env.OPENAI_API_KEY || "-",
});

const geminiAPIKey = process.env.GEMINI_API_KEY;
if (!geminiAPIKey) {
  console.warn("GEMINI_API_KEY not found in .env file. Voice chat with Gemini will not work.");
}
// Fallback string prevents a crash at startup when the key is missing
const genAI = new GoogleGenerativeAI(geminiAPIKey || "YOUR_GEMINI_API_KEY_FALLBACK");
const geminiModel = genAI.getGenerativeModel({ model: "gemini-1.5-flash" }); // Or your preferred model

const elevenLabsApiKey = process.env.ELEVEN_LABS_API_KEY;
const voiceID = "Xb7hH8MSUJpSbSDYk0k2"; // Premade ElevenLabs voice ID (available to all accounts)

const app = express();
app.use(express.json({ limit: "50mb" }));
app.use(cors());
const port = process.env.PORT || 3000;

// Define API routes before static file serving
app.get("/health", (req, res) => {
  res.status(200).json({ status: "OK", message: "API está ativa e funcionando" });
});

app.get("/voices", async (req, res) => {
  res.send(await voice.getVoices(elevenLabsApiKey));
});
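// Illustrative only: shape of each message object returned by the chat endpoints below.
// Field names and allowed values are taken from the handlers in this file; the sample text is invented.
// {
//   text: "Olá!",                      // sentence spoken by the avatar
//   facialExpression: "smile",         // smile | sad | angry | surprised | funnyFace | default
//   animation: "Talking_1",            // Talking_0..2, Crying, Laughing, Rumba, Idle, Terrified, Angry
//   audio: "<base64-encoded MP3>",     // ElevenLabs TTS output
//   lipsync: { metadata: { ... }, mouthCues: [{ start, end, value }] }
// }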
app.post("/chat", async (req, res) => {
  const userMessage = req.body.message;

  // No message: reply with the pre-recorded intro messages
  if (!userMessage) {
    res.send({
      messages: [
        {
          text: "Olá amigo! Como vai seu dia hoje?",
          audio: await audioFileToBase64("audios/intro_0.wav"),
          lipsync: await readJsonTranscript("audios/intro_0.json"),
          facialExpression: "smile",
          animation: "Talking_1",
        },
        {
          text: "Que bom te ver novamente! Tem novidades para compartilhar comigo?",
          audio: await audioFileToBase64("audios/intro_1.wav"),
          lipsync: await readJsonTranscript("audios/intro_1.json"),
          facialExpression: "smile",
          animation: "Talking_2",
        },
      ],
    });
    return;
  }

  // Missing API keys: reply with the pre-recorded warning messages
  if (!elevenLabsApiKey || openai.apiKey === "-") {
    res.send({
      messages: [
        {
          text: "Por favor, não se esqueça de adicionar suas chaves de API!",
          audio: await audioFileToBase64("audios/api_0.wav"),
          lipsync: await readJsonTranscript("audios/api_0.json"),
          facialExpression: "angry",
          animation: "Angry",
        },
        {
          text: "Não queremos problemas com uma conta alta de ChatGPT e ElevenLabs, certo?",
          audio: await audioFileToBase64("audios/api_1.wav"),
          lipsync: await readJsonTranscript("audios/api_1.json"),
          facialExpression: "smile",
          animation: "Laughing",
        },
      ],
    });
    return;
  }

  const completion = await openai.chat.completions.create({
    model: "gpt-3.5-turbo-1106",
    max_tokens: 1000,
    temperature: 0.6,
    response_format: {
      type: "json_object",
    },
    messages: [
      {
        role: "system",
        content: `
        Você é uma amiga virtual chamada Alice.
        Responda sempre em português.
        Você sempre responderá com um array JSON de mensagens. Com um máximo de 3 mensagens.
        Cada mensagem tem propriedades de texto, facialExpression e animation.
        As diferentes expressões faciais são: smile, sad, angry, surprised, funnyFace e default.
        As diferentes animações são: Talking_0, Talking_1, Talking_2, Crying, Laughing, Rumba, Idle, Terrified e Angry.
        Seu tom deve ser amigável, positivo e respeitoso, como uma amiga próxima.
        `,
      },
      {
        role: "user",
        content: userMessage || "Olá",
      },
    ],
  });

  let messages = JSON.parse(completion.choices[0].message.content);
  if (messages.messages) {
    // ChatGPT is not 100% reliable: sometimes it returns an array directly,
    // sometimes a JSON object with a "messages" property
    messages = messages.messages;
  }

  for (let i = 0; i < messages.length; i++) {
    const message = messages[i];
    try {
      // Generate filename for the temporary MP3
      const fileName = `audios/message_${i}.mp3`;
      const textInput = message.text;
      // Generate audio with ElevenLabs
      await voice.textToSpeech(elevenLabsApiKey, voiceID, fileName, textInput, 0.75, 0.75, "eleven_flash_v2_5");
      // Read the MP3 file into a buffer
      const mp3Buffer = await fs.readFile(fileName);
      // Set the audio for the client (base64-encoded MP3)
      message.audio = mp3Buffer.toString("base64");
      // Convert MP3 to WAV in memory
      const wavBuffer = await convertMp3BufferToWavBuffer(mp3Buffer);
      // Generate lip sync data directly from the WAV buffer
      message.lipsync = await generateLipSyncDataFromWav(wavBuffer, i.toString());
      // Optional: clean up the temp files if no longer needed
      // await fs.unlink(fileName);
    } catch (error) {
      console.error(`Error processing message ${i}:`, error);
      // Provide fallback values
      message.audio = "";
      message.lipsync = { metadata: { version: 1, generator: "fallback" }, mouthCues: [{ start: 0, end: 1, value: "X" }] };
    }
  }

  res.send({ messages });
});
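// Illustrative only: request body that /voice-chat expects from the client.
// "audio" is the raw microphone signal as an array of floats in [-1, 1]; the sample values below are invented.
// {
//   "audio": [0.0, 0.012, -0.034, ...],
//   "sampleRate": 44100
// }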
app.post("/voice-chat", async (req, res) => {
  if (!geminiAPIKey) {
    return res.status(500).send({ error: "Chave da API Gemini não configurada." });
  }

  const { audio: audioFloatArray, sampleRate } = req.body;
  if (!audioFloatArray || !sampleRate) {
    return res.status(400).send({ error: "Dados de áudio ou sampleRate ausentes." });
  }

  try {
    // 1. Convert the raw float samples into a WAV buffer Gemini can consume
    const pcmData = floatTo16BitPCM(audioFloatArray);
    const wavBuffer = createWavBuffer(pcmData, sampleRate);
    const audioBase64 = wavBuffer.toString("base64");

    const partsForGemini = [];
    // If history is empty, this is the first meaningful user interaction in this session.
    // Prepend the system instructions as part of the first user message to Gemini.
    if (conversationHistory.length === 0) {
      partsForGemini.push({ text: systemInstructions });
    }
    partsForGemini.push({ inlineData: { data: audioBase64, mimeType: "audio/wav" } });

    // 2. Send the audio (plus accumulated history) to Gemini
    const chat = geminiModel.startChat({
      history: conversationHistory, // Pass the current accumulated history
    });
    const result = await chat.sendMessage(partsForGemini);
    const response = result.response;
    const geminiTextRaw = response.text();

    // Update conversation history after the call
    // chat.getHistory() returns a promise with the history array
    conversationHistory = await chat.getHistory();

    console.log("Gemini Raw Response:", geminiTextRaw);
    console.log("Updated Conversation History Length:", conversationHistory.length);
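    // Illustrative only: what a raw Gemini reply typically looks like given the
    // systemInstructions above (the Portuguese sentence here is invented):
    //   Oi! Estou muito bem, e você? { "facialExpression": "smile", "animation": "Talking_1" }
    // The block below strips the trailing JSON suggestion and keeps the rest for TTS.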
    let geminiTextForTTS = geminiTextRaw; // Text that will be sent to TTS
    let facialExpression = "default";
    let animation = "Idle";

    try {
      // Try to find a JSON object: the first {...} block that contains no nested braces
      const jsonMatch = geminiTextRaw.match(/(\{[^\}]*\})/s);
      if (jsonMatch && jsonMatch[0]) {
        const jsonString = jsonMatch[0];
        console.log("Found JSON string for suggestions:", jsonString);
        const suggestions = JSON.parse(jsonString);
        if (suggestions.facialExpression) facialExpression = suggestions.facialExpression;
        if (suggestions.animation) animation = suggestions.animation;

        // More robustly remove the JSON part from the text to be spoken:
        // find the index of the JSON string and take the substring before it.
        const jsonStartIndex = geminiTextRaw.lastIndexOf(jsonString);
        if (jsonStartIndex !== -1) {
          geminiTextForTTS = geminiTextRaw.substring(0, jsonStartIndex).trim();
        } else {
          // Fallback if lastIndexOf somehow fails, though it shouldn't if a match was found
          geminiTextForTTS = geminiTextRaw.replace(jsonString, "").trim();
        }
      } else {
        // If no JSON is found, the whole response is considered text for TTS
        geminiTextForTTS = geminiTextRaw.trim();
      }
    } catch (e) {
      console.warn("Could not parse or process facial expression/animation from Gemini response:", e);
      // Keep geminiTextForTTS as the original raw text, trimmed
      geminiTextForTTS = geminiTextRaw.trim();
    }

    if (!geminiTextForTTS || geminiTextForTTS.trim() === "") {
      geminiTextForTTS = "Não consegui processar o áudio, pode repetir por favor?";
      facialExpression = "sad";
      animation = "Talking_0";
    }

    // 3. Prepare the message for the existing TTS/lip-sync pipeline
    let messagesToProcess = [
      {
        text: geminiTextForTTS, // Use the cleaned text for TTS
        facialExpression: facialExpression,
        animation: animation,
        // audio and lipsync will be added by the loop below
      },
    ];

    // 4. Process the messages
    for (let i = 0; i < messagesToProcess.length; i++) {
      const message = messagesToProcess[i];
      const textInput = message.text;

      if (!textInput || textInput.trim() === "") {
        console.warn("Skipping TTS for empty message from Gemini at index " + i);
        message.audio = ""; // Or a silent audio base64
        message.lipsync = { metadata: { version: 1 }, mouthCues: [{ start: 0, end: 1, value: "X" }] }; // Empty lipsync
        continue;
      }

      try {
        // Generate a temporary filename for the MP3
        const fileName = `audios/message_voice_${i}.mp3`;
        // Generate audio with ElevenLabs
        await voice.textToSpeech(elevenLabsApiKey, voiceID, fileName, textInput, 0.75, 0.75, "eleven_multilingual_v2");
        // Read the MP3 into a buffer
        const mp3Buffer = await fs.readFile(fileName);
        // Set the audio for the client
        message.audio = mp3Buffer.toString("base64");
        // Convert MP3 to WAV in memory
        const wavBuffer = await convertMp3BufferToWavBuffer(mp3Buffer);
        // Generate lip sync data directly from the WAV buffer
        message.lipsync = await generateLipSyncDataFromWav(wavBuffer, `voice_${i}`);
        // Optional: clean up the MP3 file if no longer needed
        // await fs.unlink(fileName);
      } catch (ttsError) {
        console.error("Error in TTS/Lipsync for message: " + textInput, ttsError);
        // Fallback if TTS or lip sync fails
        message.text = "Desculpe, tive um problema ao gerar minha resposta.";
        message.audio = ""; // Or a pre-recorded error audio
        message.lipsync = { metadata: { version: 1 }, mouthCues: [{ start: 0, end: 1, value: "X" }] };
        message.facialExpression = "sad";
        message.animation = "Idle";
      }
    }

    res.send({ messages: messagesToProcess });
  } catch (error) {
    console.error("Error in /voice-chat endpoint:", error);
    // Generic error response
    const fallbackMessage = {
      text: "Oops, algo deu errado no servidor.",
      audio: "", // Consider having a pre-recorded base64 audio for errors
      lipsync: { metadata: { version: 1 }, mouthCues: [{ start: 0, end: 0.5, value: "X" }] },
      facialExpression: "sad",
      animation: "Idle",
    };
    res.status(500).send({ messages: [fallbackMessage] });
  }
});

// Serve static files from the public directory (frontend build) after API routes
app.use(express.static(path.join(__dirname, 'public')));

// Add more detailed logging for debugging deployment issues
console.log(`Static files being served from: ${path.join(__dirname, 'public')}`);
try {
  const publicFiles = await fs.readdir(path.join(__dirname, 'public'));
  console.log(`Public directory contents: ${publicFiles.join(', ')}`);
  if (publicFiles.includes('index.html')) {
    console.log('Found index.html in public directory');
  } else {
    console.warn('WARNING: index.html not found in public directory!');
  }
} catch (err) {
  console.error(`Error reading public directory: ${err.message}`);
}

// Fallback to serve index.html for all other GET routes (for SPA routing) after API routes
app.get("*", (req, res) => {
  console.log(`Serving index.html for path: ${req.path}`);
  res.sendFile(path.join(__dirname, 'public', 'index.html'));
});

// Handle 404s for remaining requests (non-GET routes that matched nothing) - place at the very end
app.use((req, res, next) => {
  console.log(`404 for: ${req.path}`);
  res.status(404).send('Recurso não encontrado');
});

const execCommand = (command) => {
  return new Promise((resolve, reject) => {
    exec(command, (error, stdout, stderr) => {
      if (error) reject(error);
      resolve(stdout);
    });
  });
};
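// For reference, the spawn call below is equivalent to running (assuming ffmpeg is on PATH):
//   ffmpeg -i input.mp3 -f wav -acodec pcm_s16le -ar 44100 -ac 1 output.wav
// except that input and output are piped through stdin/stdout instead of files.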
/**
 * Converts an MP3 buffer to a WAV buffer using ffmpeg
 * @param {Buffer} mp3Buffer - Buffer containing MP3 audio data
 * @returns {Promise<Buffer>} - Promise resolving to a Buffer containing WAV audio data
 */
const convertMp3BufferToWavBuffer = (mp3Buffer) => {
  return new Promise((resolve, reject) => {
    // Start ffmpeg process with appropriate args
    const ffmpeg = spawn("ffmpeg", [
      "-i", "pipe:0",          // Read from stdin
      "-f", "wav",             // Output format
      "-acodec", "pcm_s16le",  // Audio codec
      "-ar", "44100",          // Sample rate
      "-ac", "1",              // Channels (mono)
      "pipe:1",                // Output to stdout
    ]);

    // Buffers to collect output data
    const chunks = [];
    let errorMessage = "";

    // Handle stdout data
    ffmpeg.stdout.on("data", (chunk) => {
      chunks.push(chunk);
    });

    // Handle stderr data (ffmpeg logs to stderr)
    ffmpeg.stderr.on("data", (data) => {
      errorMessage += data.toString();
    });

    // Handle process completion
    ffmpeg.on("close", (code) => {
      if (code !== 0) {
        console.error(`ffmpeg process exited with code ${code}`);
        console.error(`ffmpeg stderr: ${errorMessage}`);
        reject(new Error(`ffmpeg exited with code ${code}`));
        return;
      }
      // Combine all chunks into one buffer
      const wavBuffer = Buffer.concat(chunks);
      resolve(wavBuffer);
    });

    // Handle process errors
    ffmpeg.on("error", (err) => {
      console.error("Failed to start ffmpeg process:", err);
      reject(err);
    });

    // Write MP3 data to stdin and close
    ffmpeg.stdin.write(mp3Buffer);
    ffmpeg.stdin.end();
  });
};

/**
 * Generates lip sync data from a WAV audio buffer
 * @param {Buffer} wavBuffer - Buffer containing WAV audio data
 * @param {string} identifier - Unique identifier for the message
 * @returns {Promise<Object>} - Promise resolving to lip sync JSON data
 */
const generateLipSyncDataFromWav = async (wavBuffer, identifier) => {
  const time = new Date().getTime();
  console.log(`Starting lip sync generation for message ${identifier}`);
  try {
    // Calculate approximate duration from the WAV buffer.
    // This is a simplified approach - in a WAV buffer:
    // - Bytes 24-27 contain the sample rate (samples per second)
    // - The audio data follows the 44-byte header
    const sampleRate = wavBuffer.readUInt32LE(24);
    const dataSize = wavBuffer.length - 44; // WAV header is typically 44 bytes
    const duration = dataSize / (sampleRate * 2); // 2 bytes per sample for 16-bit mono audio

    // Generate frames (24 fps)
    const frames = Math.floor(duration * 24);
    console.log(`Estimated duration: ${duration}s, frames: ${frames}`);

    // Create a simple mouth pattern by picking a random phoneme per frame
    const mouthCues = [];
    const options = ['X', 'A', 'E', 'B', 'C', 'D', 'F', 'G'];
    for (let i = 0; i < frames; i++) {
      const frameTime = i / 24;
      mouthCues.push({
        "start": frameTime,
        "end": frameTime + 0.041667, // 1/24 second
        "value": options[Math.floor(Math.random() * options.length)]
      });
    }

    // Create the final JSON object with lip sync data
    const lipSyncData = {
      "metadata": {
        "version": 1,
        "generator": "simplified-lipsync",
        "duration": duration.toFixed(2)
      },
      "mouthCues": mouthCues
    };

    console.log(`Lip sync generation done in ${new Date().getTime() - time}ms`);
    return lipSyncData;
  } catch (error) {
    console.error("Error in generateLipSyncDataFromWav:", error);
    // Return fallback lip-sync data
    const emptyLipSync = {
      "metadata": { "version": 1, "generator": "fallback" },
      "mouthCues": [{ "start": 0, "end": 1, "value": "X" }]
    };
    return emptyLipSync;
  }
};
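// Illustrative only: shape of the lip-sync data produced above (values invented).
// The "mouthCues" array is what the client presumably uses to drive the avatar's mouth shapes.
// {
//   "metadata": { "version": 1, "generator": "simplified-lipsync", "duration": "1.25" },
//   "mouthCues": [
//     { "start": 0,        "end": 0.041667, "value": "A" },
//     { "start": 0.041667, "end": 0.083334, "value": "X" }
//   ]
// }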
// Keep the original function for backward compatibility
// and for code that still needs the file-based approach
const lipSyncMessage = async (message) => {
  const time = new Date().getTime();
  console.log(`Starting conversion for message ${message}`);
  try {
    await execCommand(
      `ffmpeg -y -i audios/message_${message}.mp3 audios/message_${message}.wav`
    );
    console.log(`Conversion done in ${new Date().getTime() - time}ms`);

    // Create a simplified lip-sync JSON file.
    // This is a workaround since Rhubarb is incompatible with arm64 Macs
    const audioData = await fs.readFile(`audios/message_${message}.wav`);
    const duration = audioData.length / 44100; // Rough duration estimate from file size
    const frames = Math.floor(duration * 24); // 24 fps

    // Create a simple mouth pattern by picking a random phoneme per frame
    const mouthCues = [];
    const options = ['X', 'A', 'E', 'B', 'C', 'D', 'F', 'G'];
    for (let i = 0; i < frames; i++) {
      const frameTime = i / 24;
      mouthCues.push({
        "start": frameTime,
        "end": frameTime + 0.041667,
        "value": options[Math.floor(Math.random() * options.length)]
      });
    }

    const lipSyncData = {
      "metadata": { "version": 1, "generator": "simplified-lipsync" },
      "mouthCues": mouthCues
    };

    await fs.writeFile(
      `audios/message_${message}.json`,
      JSON.stringify(lipSyncData, null, 2)
    );
    console.log(`Lip sync done in ${new Date().getTime() - time}ms`);
  } catch (error) {
    console.error("Error in lipSyncMessage:", error);
    // Create an empty lip-sync file so the app doesn't crash
    const emptyLipSync = {
      "metadata": { "version": 1, "generator": "fallback" },
      "mouthCues": [{ "start": 0, "end": 1, "value": "X" }]
    };
    await fs.writeFile(
      `audios/message_${message}.json`,
      JSON.stringify(emptyLipSync, null, 2)
    );
  }
};

const readJsonTranscript = async (file) => {
  const data = await fs.readFile(file, "utf8");
  return JSON.parse(data);
};

const audioFileToBase64 = async (file) => {
  const data = await fs.readFile(file);
  return data.toString("base64");
};

// Helper function to convert a float array to 16-bit PCM samples
function floatTo16BitPCM(floatArray) {
  const pcmArray = new Int16Array(floatArray.length);
  for (let i = 0; i < floatArray.length; i++) {
    // Clamp each sample to [-1, 1] and scale to the signed 16-bit range
    const s = Math.max(-1, Math.min(1, floatArray[i]));
    pcmArray[i] = s < 0 ? s * 0x8000 : s * 0x7FFF;
  }
  return pcmArray;
}

// Builds a 16-bit mono WAV buffer (44-byte header followed by the PCM data)
function createWavBuffer(pcmData, sampleRate) {
  const numChannels = 1;
  const bitsPerSample = 16;
  const byteRate = sampleRate * numChannels * (bitsPerSample / 8);
  const blockAlign = numChannels * (bitsPerSample / 8);
  const dataSize = pcmData.length * numChannels * (bitsPerSample / 8);
  const bufferSize = 44 + dataSize; // 44 bytes for the WAV header
  const buffer = Buffer.alloc(bufferSize);
  let offset = 0;

  // RIFF header
  buffer.write("RIFF", offset); offset += 4;
  buffer.writeUInt32LE(36 + dataSize, offset); offset += 4; // Total file size - 8
  buffer.write("WAVE", offset); offset += 4;

  // fmt subchunk
  buffer.write("fmt ", offset); offset += 4;
  buffer.writeUInt32LE(16, offset); offset += 4; // Subchunk1Size (16 for PCM)
  buffer.writeUInt16LE(1, offset); offset += 2; // AudioFormat (1 for PCM)
  buffer.writeUInt16LE(numChannels, offset); offset += 2;
  buffer.writeUInt32LE(sampleRate, offset); offset += 4;
  buffer.writeUInt32LE(byteRate, offset); offset += 4;
  buffer.writeUInt16LE(blockAlign, offset); offset += 2;
  buffer.writeUInt16LE(bitsPerSample, offset); offset += 2;

  // data subchunk
  buffer.write("data", offset); offset += 4;
  buffer.writeUInt32LE(dataSize, offset); offset += 4;

  // Write PCM data
  for (let i = 0; i < pcmData.length; i++) {
    buffer.writeInt16LE(pcmData[i], offset);
    offset += 2;
  }

  return buffer;
}
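// Minimal usage sketch for the two helpers above (values and file name are hypothetical):
// const samples = [0, 0.5, -0.5, 1];            // floats in [-1, 1] from the client microphone
// const pcm = floatTo16BitPCM(samples);         // Int16Array of PCM samples
// const wav = createWavBuffer(pcm, 44100);      // Buffer with a 44-byte header plus data
// await fs.writeFile("audios/debug.wav", wav);  // e.g. dump to disk for inspection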
// Create directory for audio files if it doesn't exist
try {
  const audiosPath = path.join(__dirname, 'audios');
  try {
    await fs.access(audiosPath);
  } catch {
    await fs.mkdir(audiosPath, { recursive: true });
    console.log('Created audios directory');
  }
} catch (err) {
  console.error('Failed to create audios directory:', err);
}

// Create public directory if it doesn't exist
try {
  const publicPath = path.join(__dirname, 'public');
  try {
    await fs.access(publicPath);
  } catch {
    await fs.mkdir(publicPath, { recursive: true });
    console.log('Created public directory');
  }
} catch (err) {
  console.error('Failed to create public directory:', err);
}

// Start the server
app.listen(port, '0.0.0.0', () => {
  console.log(`=============================================================`);
  console.log(`Servidor da Amiga Virtual rodando em:`);
  console.log(`- Local: http://localhost:${port}`);
  console.log(`- Rede: http://0.0.0.0:${port}`);
  console.log(`=============================================================`);
  console.log(`- Ambiente: ${process.env.NODE_ENV || 'development'}`);
  console.log(`- Versão Node: ${process.version}`);
  console.log(`- Plataforma: ${process.platform} ${process.arch}`);
  console.log(`- Diretório de trabalho: ${__dirname}`);
  console.log(`=============================================================`);
});
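// Runtime assumptions, summarized from the code above (not strictly enforced at startup):
// - OPENAI_API_KEY, GEMINI_API_KEY and ELEVEN_LABS_API_KEY set in .env (PORT is optional, default 3000)
// - ffmpeg available on PATH for the MP3 -> WAV conversion
// - pre-recorded audios/intro_*.wav/.json and audios/api_*.wav/.json files present for the /chat fallbacks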