Spaces:
Sleeping
Sleeping
Marcos
committed on
Commit
·
dd76c65
1
Parent(s):
0cdb42e
Automated push by deployment script
Browse files
- Dockerfile +3 -39
- backend/azureTtsService.js +143 -0
- backend/package.json +1 -2
Dockerfile
CHANGED
@@ -4,37 +4,7 @@ FROM node:18-slim
|
|
4 |
RUN apt-get update && apt-get install -y --no-install-recommends ffmpeg curl ca-certificates wget unzip file && \
|
5 |
apt-get clean && rm -rf /var/lib/apt/lists/*
|
6 |
|
7 |
-
#
|
8 |
-
# Switch to root to install Rhubarb, then switch back to node user later if needed before app execution
|
9 |
-
USER root
|
10 |
-
|
11 |
-
ARG RHUBARB_VERSION=1.13.0
|
12 |
-
RUN \
|
13 |
-
echo "Downloading Rhubarb version ${RHUBARB_VERSION}" && \
|
14 |
-
wget https://github.com/DanielSWolf/rhubarb-lip-sync/releases/download/v${RHUBARB_VERSION}/rhubarb-lip-sync-${RHUBARB_VERSION}-linux.zip -O /tmp/rhubarb.zip && \
|
15 |
-
echo "--- Listing /tmp to check zip file: ---" && \
|
16 |
-
ls -l /tmp/rhubarb.zip && \
|
17 |
-
echo "--- File type of /tmp/rhubarb.zip: ---" && \
|
18 |
-
file /tmp/rhubarb.zip && \
|
19 |
-
echo "--- Creating temporary extraction directory /tmp/rhubarb_extracted ---" && \
|
20 |
-
mkdir -p /tmp/rhubarb_extracted && \
|
21 |
-
echo "--- Unzipping to /tmp/rhubarb_extracted ---" && \
|
22 |
-
unzip /tmp/rhubarb.zip -d /tmp/rhubarb_extracted && \
|
23 |
-
echo "--- Listing /tmp/rhubarb_extracted (recursive): ---" && \
|
24 |
-
ls -R /tmp/rhubarb_extracted && \
|
25 |
-
echo "--- Finding rhubarb executable in /tmp/rhubarb_extracted ---" && \
|
26 |
-
RHUBARB_EXEC_PATH=$(find /tmp/rhubarb_extracted -name rhubarb -type f -executable -print -quit) && \
|
27 |
-
if [ -z "${RHUBARB_EXEC_PATH}" ]; then \
|
28 |
-
echo "ERROR: Could not find rhubarb executable in /tmp/rhubarb_extracted"; \
|
29 |
-
ls -R /tmp/rhubarb_extracted; \
|
30 |
-
exit 1; \
|
31 |
-
fi && \
|
32 |
-
echo "Found rhubarb executable at: ${RHUBARB_EXEC_PATH}" && \
|
33 |
-
echo "--- Moving ${RHUBARB_EXEC_PATH} to /usr/local/bin/rhubarb ---" && \
|
34 |
-
mv "${RHUBARB_EXEC_PATH}" "/usr/local/bin/rhubarb" && \
|
35 |
-
chmod +x "/usr/local/bin/rhubarb" && \
|
36 |
-
echo "--- Cleaning up /tmp/rhubarb.zip and /tmp/rhubarb_extracted ---" && \
|
37 |
-
rm -rf "/tmp/rhubarb.zip" "/tmp/rhubarb_extracted"
|
38 |
|
39 |
USER node
|
40 |
|
@@ -43,7 +13,7 @@ USER node
|
|
43 |
# Diagnostic commands for the node user
|
44 |
RUN echo "Running diagnostics as $(whoami)"
|
45 |
RUN echo "PATH is: $PATH"
|
46 |
-
RUN which rhubarb || echo "rhubarb not found in PATH for node user during build"
|
47 |
|
48 |
# Set environment variables
|
49 |
ENV NODE_ENV=production
|
@@ -76,13 +46,7 @@ RUN mkdir -p /home/node/app/audios /home/node/app/public
|
|
76 |
# This prevents ENOENT errors when serving index.html
|
77 |
RUN echo '<html><body><h1>Placeholder for Virtual Girlfriend App</h1><p>This is a placeholder page. Update with actual content if needed.</p></body></html>' > /home/node/app/public/index.html
|
78 |
|
79 |
-
#
|
80 |
-
# Replace the URL below with the actual source for rhubarb if available
|
81 |
-
# RUN mkdir -p /home/node/app/backend/bin && \
|
82 |
-
# curl -L -o /home/node/app/backend/bin/rhubarb 'https://example.com/path/to/rhubarb-binary' && \
|
83 |
-
# chmod +x /home/node/app/backend/bin/rhubarb
|
84 |
-
# Note: The above command is commented out as the URL is a placeholder. Please provide the correct URL or method to obtain rhubarb.
|
85 |
-
# Alternatively, consider using a script at startup to download it if it cannot be done at build time.
|
86 |
|
87 |
# Expose the port the app runs on
|
88 |
EXPOSE 7860
|
|
|
4 |
RUN apt-get update && apt-get install -y --no-install-recommends ffmpeg curl ca-certificates wget unzip file && \
|
5 |
apt-get clean && rm -rf /var/lib/apt/lists/*
|
6 |
|
7 |
+
# Rhubarb installation has been removed.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
|
9 |
USER node
|
10 |
|
|
|
13 |
# Diagnostic commands for the node user
|
14 |
RUN echo "Running diagnostics as $(whoami)"
|
15 |
RUN echo "PATH is: $PATH"
|
16 |
+
# Removed: RUN which rhubarb || echo "rhubarb not found in PATH for node user during build"
|
17 |
|
18 |
# Set environment variables
|
19 |
ENV NODE_ENV=production
|
|
|
46 |
# This prevents ENOENT errors when serving index.html
|
47 |
RUN echo '<html><body><h1>Placeholder for Virtual Girlfriend App</h1><p>This is a placeholder page. Update with actual content if needed.</p></body></html>' > /home/node/app/public/index.html
|
48 |
|
49 |
+
# Rhubarb download/installation section removed.
|
|
|
|
|
|
|
|
|
|
|
|
|
50 |
|
51 |
# Expose the port the app runs on
|
52 |
EXPOSE 7860
|
backend/azureTtsService.js
ADDED
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import * as sdk from 'microsoft-cognitiveservices-speech-sdk';
|
2 |
+
import dotenv from 'dotenv';
|
3 |
+
|
4 |
+
dotenv.config();
|
5 |
+
|
6 |
+
// User-provided Azure Speech service details from environment variables.
const SPEECH_KEY = process.env.AZURE_SPEECH_KEY;
const SPEECH_REGION = process.env.AZURE_SPEECH_REGION;

// Mapping from Azure numeric viseme IDs (0-21) to the custom character
// values used by the avatar's mouth animation, grouped by mouth shape:
//   X = silence (ID 0)
//   C = vowels and semivowels (IDs 1-11: ae/a/o/e/er/i/u/o/aw/oy/ai)
//   D = most consonants (IDs 12-20: h/r/l/s/sh/th/f-v/d-t-n/k-g-ng)
//   B = bilabials p, b, m (ID 21)
const azureVisemeToCharacterMap = Object.fromEntries(
  [
    [[0], "X"],
    [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], "C"],
    [[12, 13, 14, 15, 16, 17, 18, 19, 20], "D"],
    [[21], "B"],
  ].flatMap(([ids, shape]) => ids.map((id) => [id, shape]))
);
|
35 |
+
|
36 |
+
/**
 * Synthesize speech for `textToSpeak` with Azure TTS, collecting viseme
 * timing events so the caller gets both audio and lip-sync mouth cues.
 *
 * @param {string} textToSpeak - Text to synthesize.
 * @param {string} [language="pt-BR"] - BCP-47 synthesis language tag.
 * @param {string|null} [voiceName=null] - Optional explicit Azure voice name;
 *   when null, the service's default voice for `language` is used.
 * @returns {Promise<{audioData: Buffer, mouthCues: Array<{start: number, end: number, value: string}>, audioDurationTicks: number}>}
 *   Audio bytes, Rhubarb-style mouth cues (seconds), and total duration in
 *   100-nanosecond ticks.
 * @throws {Error} If the Azure key/region env vars are missing, or if
 *   synthesis is canceled, errors out, or finishes with an unexpected reason.
 */
async function synthesizeSpeechWithVisemes(textToSpeak, language = "pt-BR", voiceName = null) {
  if (!SPEECH_KEY || !SPEECH_REGION) {
    console.error("Azure Speech Key or Region not configured in environment variables.");
    throw new Error("Azure Speech Key or Region not configured.");
  }

  const speechConfig = sdk.SpeechConfig.fromSubscription(SPEECH_KEY, SPEECH_REGION);
  speechConfig.speechSynthesisLanguage = language;
  if (voiceName) {
    speechConfig.speechSynthesisVoiceName = voiceName;
  }

  // Synthesize to memory buffer directly.
  // If audioConfig is null, SpeechSynthesizer returns audio data in result.audioData (ArrayBuffer).
  const synthesizer = new sdk.SpeechSynthesizer(speechConfig, null);

  const visemesData = [];
  synthesizer.visemeReceived = (s, e) => {
    // Audio offset is in 100-nanosecond ticks.
    visemesData.push({ id: e.visemeId, audioOffsetTicks: e.audioOffset });
  };

  console.log(`Synthesizing text: '${textToSpeak}' in ${language}${voiceName ? ' using voice ' + voiceName : ''}`);

  return new Promise((resolve, reject) => {
    synthesizer.speakTextAsync(
      textToSpeak,
      result => {
        synthesizer.close();
        if (result.reason === sdk.ResultReason.SynthesizingAudioCompleted) {
          console.log("Speech synthesized successfully.");
          resolve({
            audioData: Buffer.from(result.audioData), // Convert ArrayBuffer to Node.js Buffer
            mouthCues: _buildMouthCues(visemesData, result.audioDuration),
            audioDurationTicks: result.audioDuration
          });
        } else if (result.reason === sdk.ResultReason.Canceled) {
          const cancellation = sdk.CancellationDetails.fromResult(result);
          console.error(`Speech synthesis CANCELED: Reason=${cancellation.reason}`);
          if (cancellation.reason === sdk.CancellationReason.Error) {
            console.error(`CANCELED: ErrorCode=${cancellation.ErrorCode}`);
            console.error(`CANCELED: ErrorDetails=[${cancellation.errorDetails}]`);
          }
          reject(new Error(`Speech synthesis CANCELED: ${cancellation.reason} - ${cancellation.errorDetails}`));
        } else {
          // Fix: previously this branch fell through without settling the
          // Promise, leaving callers hanging forever on unexpected reasons.
          reject(new Error(`Speech synthesis finished with unexpected reason: ${result.reason}`));
        }
      },
      error => {
        synthesizer.close();
        console.error(`Speech synthesis error: ${error}`);
        reject(new Error(`Speech synthesis error: ${error}`));
      }
    );
  });
}

/**
 * Convert raw viseme events into mouth cues with start/end times in seconds.
 * Each cue ends where the next viseme begins; the last cue ends at the total
 * audio duration. Zero/negative spans are clamped to a minimal 10 ms cue.
 *
 * @param {Array<{id: number, audioOffsetTicks: number}>} visemesData - Events in arrival order.
 * @param {number} audioDurationTicks - Total audio duration in 100-ns ticks.
 * @returns {Array<{start: number, end: number, value: string}>}
 */
function _buildMouthCues(visemesData, audioDurationTicks) {
  const TICKS_PER_SECOND = 10000000.0; // 100-nanosecond ticks per second
  const mouthCues = [];
  if (visemesData.length === 0 || !audioDurationTicks) {
    return mouthCues;
  }
  const totalAudioDurationSecs = audioDurationTicks / TICKS_PER_SECOND;
  for (let i = 0; i < visemesData.length; i++) {
    const currentViseme = visemesData[i];
    const startSecs = currentViseme.audioOffsetTicks / TICKS_PER_SECOND;
    let endSecs = i < visemesData.length - 1
      ? visemesData[i + 1].audioOffsetTicks / TICKS_PER_SECOND
      : totalAudioDurationSecs;
    if (endSecs <= startSecs) {
      endSecs = startSecs + 0.01; // Minimal duration
    }
    mouthCues.push({
      start: Number(startSecs.toFixed(2)),
      end: Number(endSecs.toFixed(2)),
      value: azureVisemeToCharacterMap[currentViseme.id] || "X"
    });
  }
  return mouthCues;
}
|
115 |
+
|
116 |
+
export { synthesizeSpeechWithVisemes };
|
117 |
+
|
118 |
+
// Example usage (for testing this module directly):
|
119 |
+
/*
|
120 |
+
async function test() {
|
121 |
+
if (require.main === module) { // Only run if executed directly
|
122 |
+
const TEXT_TO_SPEAK = "Olá mundo, isto é um teste de síntese de voz com visemas.";
|
123 |
+
const LANGUAGE = "pt-BR";
|
124 |
+
// const VOICE = "pt-BR-FranciscaNeural"; // Example voice
|
125 |
+
try {
|
126 |
+
console.log("Attempting Azure TTS test...");
|
127 |
+
const result = await synthesizeSpeechWithVisemes(TEXT_TO_SPEAK, LANGUAGE);
|
128 |
+
console.log("--- Mouth Cues JSON ---");
|
129 |
+
console.log(JSON.stringify({ mouthCues: result.mouthCues }, null, 2));
|
130 |
+
console.log("Audio data length (bytes):", result.audioData.length);
|
131 |
+
console.log("Audio duration (ticks):", result.audioDurationTicks);
|
132 |
+
// To save to file for testing:
|
133 |
+
// import fs from 'fs';
|
134 |
+
// fs.writeFileSync('test_output.wav', result.audioData);
|
135 |
+
// console.log('Audio saved to test_output.wav');
|
136 |
+
} catch (error) {
|
137 |
+
console.error("Test function error:", error);
|
138 |
+
}
|
139 |
+
}
|
140 |
+
}
|
141 |
+
|
142 |
+
test();
|
143 |
+
*/
|
backend/package.json
CHANGED
@@ -16,8 +16,7 @@
|
|
16 |
"dotenv": "^16.3.1",
|
17 |
"elevenlabs-node": "^1.2.0",
|
18 |
"express": "^4.18.2",
|
19 |
-
"microsoft-cognitiveservices-speech-sdk": "^1.38.0"
|
20 |
-
"openai": "^4.26.0"
|
21 |
},
|
22 |
"devDependencies": {
|
23 |
"nodemon": "^3.0.1"
|
|
|
16 |
"dotenv": "^16.3.1",
|
17 |
"elevenlabs-node": "^1.2.0",
|
18 |
"express": "^4.18.2",
|
19 |
+
"microsoft-cognitiveservices-speech-sdk": "^1.38.0"
|
|
|
20 |
},
|
21 |
"devDependencies": {
|
22 |
"nodemon": "^3.0.1"
|