import * as sdk from 'microsoft-cognitiveservices-speech-sdk';
import dotenv from 'dotenv';

dotenv.config();

// Azure Speech service credentials. Values loaded by dotenv take precedence;
// the literals below are fallbacks for local testing.
const SPEECH_KEY = process.env.SPEECH_KEY || "dd32a86f9b52496b9504bddadb2a5602";
const SPEECH_REGION = process.env.SPEECH_REGION || "westeurope";

// Example mapping from Azure numeric viseme IDs to custom character values
const azureVisemeToCharacterMap = {
  0: "X",  // Silence
  1: "C",  // æ, ə, ʌ (e.g., cat, ago, cup)
  2: "C",  // ɑ (e.g., father)
  3: "C",  // ɔ (e.g., thought)
  4: "C",  // ɛ, ʊ (e.g., bed, book)
  5: "C",  // ɝ (e.g., bird)
  6: "C",  // j, i, ɪ (e.g., yes, see, sit)
  7: "C",  // w, u (e.g., wit, too)
  8: "C",  // o (e.g., go)
  9: "C",  // aʊ (e.g., now)
  10: "C", // ɔɪ (e.g., boy)
  11: "C", // aɪ (e.g., my)
  12: "D", // h (e.g., he)
  13: "D", // ɹ (e.g., red)
  14: "D", // l (e.g., lie)
  15: "D", // s, z (e.g., see, zoo)
  16: "D", // ʃ, tʃ, dʒ, ʒ (e.g., she, chin, gin)
  17: "D", // ð (e.g., the)
  18: "D", // f, v (e.g., fee, vie) - could also be a separate category like "F"
  19: "D", // d, t, n, θ (e.g., do, to, no, thin)
  20: "D", // k, g, ŋ (e.g., koo, go, sing)
  21: "B"  // p, b, m (e.g., pie, buy, my)
};
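
// As a worked example of the mapping and the timing math below (a hypothetical
// viseme stream, not real SDK output): IDs [0, 21, 6, 15] received at offsets
// [0, 500000, 3500000, 6000000] ticks within a 1-second (10,000,000-tick) clip
// would produce the cues
//   { start: 0.00, end: 0.05, value: "X" }
//   { start: 0.05, end: 0.35, value: "B" }
//   { start: 0.35, end: 0.60, value: "C" }
//   { start: 0.60, end: 1.00, value: "D" }
// since each cue ends where the next viseme starts (or at the end of the audio
// for the last one), with offsets converted from 100-nanosecond ticks to seconds.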

async function synthesizeSpeechWithVisemes(textToSpeak, language = "pt-BR", voiceName = null) {
  const speechConfig = sdk.SpeechConfig.fromSubscription(SPEECH_KEY, SPEECH_REGION);
  speechConfig.speechSynthesisLanguage = language;
  if (voiceName) {
    speechConfig.speechSynthesisVoiceName = voiceName;
  }

  // Synthesize to a memory buffer directly: with a null audioConfig, the
  // SpeechSynthesizer returns the audio data in result.audioData (an ArrayBuffer).
  const synthesizer = new sdk.SpeechSynthesizer(speechConfig, null);

  const visemesData = [];
  synthesizer.visemeReceived = (s, e) => {
    // Audio offset is in 100-nanosecond ticks.
    visemesData.push({ id: e.visemeId, audioOffsetTicks: e.audioOffset });
    // console.log(`Viseme received: ID=${e.visemeId}, Audio offset: ${(e.audioOffset / 10000).toFixed(2)} ms, Animation: ${e.animation}`);
  };

  console.log(`Synthesizing text: '${textToSpeak}' in ${language}${voiceName ? ' using voice ' + voiceName : ''}`);

  return new Promise((resolve, reject) => {
    synthesizer.speakTextAsync(
      textToSpeak,
      result => {
        synthesizer.close();
        if (result.reason === sdk.ResultReason.SynthesizingAudioCompleted) {
          console.log("Speech synthesized successfully.");
          const mouthCues = [];
          if (visemesData.length > 0 && result.audioDuration) {
            const totalAudioDurationSecs = result.audioDuration / 10000000.0;
            for (let i = 0; i < visemesData.length; i++) {
              const currentViseme = visemesData[i];
              const startSecs = currentViseme.audioOffsetTicks / 10000000.0;
              // A cue ends where the next viseme starts; the last cue runs to
              // the end of the audio.
              let endSecs;
              if (i < visemesData.length - 1) {
                endSecs = visemesData[i + 1].audioOffsetTicks / 10000000.0;
              } else {
                endSecs = totalAudioDurationSecs;
              }
              if (endSecs <= startSecs) {
                endSecs = startSecs + 0.01; // Enforce a minimal cue duration
              }
              mouthCues.push({
                start: Number(startSecs.toFixed(2)),
                end: Number(endSecs.toFixed(2)),
                value: azureVisemeToCharacterMap[currentViseme.id] || "X"
              });
            }
          }
          resolve({
            audioData: Buffer.from(result.audioData), // Convert ArrayBuffer to Node.js Buffer
            mouthCues: mouthCues,
            audioDurationTicks: result.audioDuration
          });
        } else if (result.reason === sdk.ResultReason.Canceled) {
          const cancellation = sdk.CancellationDetails.fromResult(result);
          console.error(`Speech synthesis CANCELED: Reason=${cancellation.reason}`);
          if (cancellation.reason === sdk.CancellationReason.Error) {
            console.error(`CANCELED: ErrorCode=${cancellation.ErrorCode}`);
            console.error(`CANCELED: ErrorDetails=[${cancellation.errorDetails}]`);
          }
          reject(new Error(`Speech synthesis CANCELED: ${cancellation.reason} - ${cancellation.errorDetails}`));
        } else {
          // Neither completed nor canceled: reject so the promise never hangs.
          reject(new Error(`Speech synthesis ended with unexpected reason: ${result.reason}`));
        }
      },
      error => {
        synthesizer.close();
        console.error(`Speech synthesis error: ${error}`);
        reject(new Error(`Speech synthesis error: ${error}`));
      }
    );
  });
}

export { synthesizeSpeechWithVisemes };

// Example usage (for testing this module directly):
/*
import { fileURLToPath } from 'url';

async function test() {
  const TEXT_TO_SPEAK = "Olá mundo, isto é um teste de síntese de voz com visemas.";
  const LANGUAGE = "pt-BR";
  // const VOICE = "pt-BR-FranciscaNeural"; // Example voice
  try {
    console.log("Attempting Azure TTS test...");
    const result = await synthesizeSpeechWithVisemes(TEXT_TO_SPEAK, LANGUAGE);
    console.log("--- Mouth Cues JSON ---");
    console.log(JSON.stringify({ mouthCues: result.mouthCues }, null, 2));
    console.log("Audio data length (bytes):", result.audioData.length);
    console.log("Audio duration (ticks):", result.audioDurationTicks);
    // To save to a file for testing (add `import fs from 'fs';` at the top):
    // fs.writeFileSync('test_output.wav', result.audioData);
    // console.log('Audio saved to test_output.wav');
  } catch (error) {
    console.error("Test function error:", error);
  }
}

// `require.main === module` is unavailable in ES modules; compare the entry
// script path against this module's URL to run only when executed directly.
if (process.argv[1] === fileURLToPath(import.meta.url)) {
  test();
}
*/
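
// A minimal consumption sketch (hypothetical renderer; setTimeout timing is
// approximate): schedule each returned mouth cue at its start offset relative
// to the moment audio playback begins.
/*
const { mouthCues } = await synthesizeSpeechWithVisemes("Olá mundo");
for (const cue of mouthCues) {
  setTimeout(() => {
    // Swap console.log for the renderer's "set mouth shape" call.
    console.log(`t=${cue.start}s -> mouth shape '${cue.value}'`);
  }, cue.start * 1000);
}
*/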