api updated

- .gitignore +2 -0
- api/README.md +232 -13
- api/audio_concatenator.py +196 -0
- api/demo.py +177 -0
- api/models.py +28 -9
- api/test_api.py +455 -0
- api/text_processing.py +174 -0
- api/tts_service.py +154 -1
.gitignore
CHANGED
@@ -45,3 +45,5 @@ wheels/
 .installed.cfg
 *.egg
 MANIFEST
+
+**/output/
api/README.md
CHANGED
@@ -1,6 +1,15 @@
-# API
+# Enhanced Chatterbox TTS API
 
-This package contains the modular components of the Chatterbox TTS API.
+This package contains the modular components of the Enhanced Chatterbox TTS API with GPU-accelerated processing, intelligent text chunking, and server-side audio concatenation.
+
+## Features
+
+- **GPU-Accelerated Processing**: Leverage server GPU for parallel chunk processing
+- **Intelligent Text Chunking**: Smart text splitting that respects sentence and paragraph boundaries
+- **Server-Side Concatenation**: Seamless audio merging with fade effects and silence control
+- **Voice Cloning**: Optional voice prompt for personalized speech generation
+- **Multiple Response Formats**: Streaming audio, complete files, or JSON with base64 encoding
+- **Scalable Architecture**: Handles texts of any length efficiently
 
 ## Structure
 
@@ -8,9 +17,11 @@ This package contains the modular components of the Chatterbox TTS API.
 api/
 ├── __init__.py          # Package initialization and exports
 ├── config.py            # Modal app configuration and container image setup
-├── models.py            # Pydantic request/response models
+├── models.py            # Pydantic request/response models (enhanced with full-text support)
 ├── audio_utils.py       # Audio processing utilities and helper functions
+├── text_processing.py   # Server-side text chunking and audio concatenation
 ├── tts_service.py       # Main TTS service class with all API endpoints
+├── test_api.py          # Comprehensive API testing suite
 └── README.md            # This file
 ```
 
@@ -18,16 +29,25 @@ api/
 
 ### config.py
 
-- Modal app configuration
+- Modal app configuration with GPU support (A10G)
 - Container image setup with required dependencies
 - Centralized configuration management
+- Memory snapshot and scaling configuration
 
 ### models.py
 
-- `TTSRequest`:
-- `
+- `TTSRequest`: Standard request model for TTS generation
+- `FullTextTTSRequest`: Enhanced request model for full-text processing with chunking parameters
+- `TTSResponse`: Standard response model for JSON endpoints
+- `FullTextTTSResponse`: Enhanced response with processing information
 - `HealthResponse`: Response model for health checks
-- All models include proper type hints and documentation
+- All models include proper type hints, validation, and documentation
+
+### text_processing.py
+
+- `TextChunker`: Intelligent server-side text chunking with configurable parameters
+- `AudioConcatenator`: Server-side audio concatenation with fade effects and silence control
+- Optimized for GPU processing and large text handling
 
 ### audio_utils.py
 
@@ -42,6 +62,120 @@
 - GPU-accelerated TTS model loading and inference
 - Multiple API endpoints for different use cases
 - Comprehensive error handling and validation
+- New full-text processing endpoints with parallel chunk processing
+
+### test_api.py
+
+- Comprehensive testing suite for all API endpoints
+- Tests for basic generation, voice cloning, file uploads, and full-text processing
+- Performance benchmarking and validation scripts
+
+## API Endpoints
+
+### Standard Endpoints
+
+#### `GET /health`
+
+Health check endpoint to verify model status and service availability.
+
+```bash
+curl -X GET "YOUR-ENDPOINT/health"
+```
+
+#### `POST /generate_audio`
+
+Generate speech audio from text with optional voice cloning (streaming response).
+
+```bash
+curl -X POST "YOUR-ENDPOINT/generate_audio" \
+  -H "Content-Type: application/json" \
+  -d '{"text": "Hello world!"}' \
+  --output output.wav
+```
+
+#### `POST /generate_json`
+
+Generate speech and return JSON with base64 encoded audio.
+
+```bash
+curl -X POST "YOUR-ENDPOINT/generate_json" \
+  -H "Content-Type: application/json" \
+  -d '{"text": "Hello world!"}'
+```
+
+#### `POST /generate_with_file`
+
+Generate speech with file upload for voice cloning.
+
+```bash
+curl -X POST "YOUR-ENDPOINT/generate_with_file" \
+  -F "text=Hello world!" \
+  -F "voice_prompt=@voice_sample.wav" \
+  --output output.wav
+```
+
+### Enhanced Full-Text Endpoints
+
+#### `POST /generate_full_text_audio`
+
+🚀 Generate speech from full text with server-side chunking and parallel processing.
+
+```bash
+curl -X POST "YOUR-ENDPOINT/generate_full_text_audio" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "text": "Your very long text here...",
+    "max_chunk_size": 800,
+    "silence_duration": 0.5,
+    "fade_duration": 0.1,
+    "overlap_sentences": 0
+  }' \
+  --output full_text_output.wav
+```
+
+#### `POST /generate_full_text_json`
+
+🚀 Generate speech from full text and return JSON with processing information.
+
+```bash
+curl -X POST "YOUR-ENDPOINT/generate_full_text_json" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "text": "Your very long text here...",
+    "max_chunk_size": 800,
+    "silence_duration": 0.5
+  }'
+```
+
+### Legacy Endpoints
+
+#### `POST /generate`
+
+Legacy endpoint for backward compatibility.
+
+```bash
+curl -X POST "YOUR-ENDPOINT/generate?prompt=Hello%20world!" \
+  --output legacy_output.wav
+```
+
+## Request Parameters
+
+### FullTextTTSRequest Parameters
+
+- **`text`** (required): The text to convert to speech (any length)
+- **`voice_prompt_base64`** (optional): Base64 encoded voice prompt for cloning
+- **`max_chunk_size`** (optional, default: 800): Maximum characters per chunk
+- **`silence_duration`** (optional, default: 0.5): Silence between chunks in seconds
+- **`fade_duration`** (optional, default: 0.1): Fade in/out duration in seconds
+- **`overlap_sentences`** (optional, default: 0): Sentences to overlap between chunks
+
+## Response Headers
+
+Enhanced endpoints include additional headers with processing information:
+
+- **`X-Audio-Duration`**: Duration of generated audio in seconds
+- **`X-Chunks-Processed`**: Number of text chunks processed
+- **`X-Total-Characters`**: Total characters in the input text
 
 ## Usage
 
@@ -52,10 +186,95 @@ from api import app, ChatterboxTTSService
 # The service class contains all the endpoints
 ```
 
-
+### Python Client Example
+
+```python
+import requests
+
+# Generate audio from long text
+response = requests.post(
+    "YOUR-ENDPOINT/generate_full_text_audio",
+    json={
+        "text": "Your long document text here...",
+        "max_chunk_size": 800,
+        "silence_duration": 0.5
+    }
+)
+
+if response.status_code == 200:
+    with open("output.wav", "wb") as f:
+        f.write(response.content)
+    print("Audio generated successfully!")
+```
+
+## Performance Characteristics
+
+### Standard Processing
+
+- **Text Length**: Up to ~1000 characters optimal
+- **Processing Time**: ~2-5 seconds per request
+- **Use Case**: Short texts, real-time applications
+
+### Full-Text Processing
+
+- **Text Length**: Unlimited (automatically chunked)
+- **Processing Time**: ~5-15 seconds for long documents
+- **Parallelization**: Up to 4 concurrent chunks
+- **Use Case**: Documents, articles, books
+
+## Deployment
+
+```bash
+# Deploy the enhanced API
+modal deploy tts_service.py
+
+# Test the deployment
+python test_api.py
+```
+
+## Benefits of Enhanced Architecture
+
+1. **GPU Acceleration**: Server-side processing leverages GPU resources for faster inference
+2. **Intelligent Chunking**: Smart text splitting that preserves sentence integrity
+3. **Parallel Processing**: Multiple chunks processed simultaneously for better performance
+4. **Scalability**: Handles texts of any length without client-side limitations
+5. **Separation of Concerns**: Each file has a specific responsibility
+6. **Maintainability**: Easier to update and modify individual components
+7. **Testability**: Components can be tested in isolation
+8. **Reusability**: Components can be imported and used in other projects
+9. **Readability**: Smaller files are easier to understand and navigate
 
-
-
-
-
-
+## Testing
+
+Run the comprehensive test suite:
+
+```bash
+cd api/
+python test_api.py
+```
+
+The test suite includes:
+
+- Health check validation
+- Basic text-to-speech generation
+- JSON response testing
+- Voice cloning functionality
+- File upload testing
+- Full-text processing validation
+- Performance benchmarking
+
+## Environment Variables
+
+Set these environment variables for testing:
+
+```bash
+HEALTH_ENDPOINT=https://your-modal-endpoint.modal.run/health
+GENERATE_AUDIO_ENDPOINT=https://your-modal-endpoint.modal.run/generate_audio
+GENERATE_JSON_ENDPOINT=https://your-modal-endpoint.modal.run/generate_json
+GENERATE_WITH_FILE_ENDPOINT=https://your-modal-endpoint.modal.run/generate_with_file
+GENERATE_ENDPOINT=https://your-modal-endpoint.modal.run/generate
+FULL_TEXT_TTS_ENDPOINT=https://your-modal-endpoint.modal.run/generate_full_text_audio
+FULL_TEXT_JSON_ENDPOINT=https://your-modal-endpoint.modal.run/generate_full_text_json
+```
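For reference, a client sketch (not part of the commit) that consumes the JSON full-text endpoint documented above; the URL is a placeholder, and the `processing_info`/`chunk_info` shape is an assumption based on what `test_api.py` reads:

```python
import base64
import requests

# Placeholder URL - substitute your deployed Modal endpoint
resp = requests.post(
    "YOUR-ENDPOINT/generate_full_text_json",
    json={"text": "Your very long text here...", "max_chunk_size": 800},
    timeout=120,
)
data = resp.json()

if data["success"]:
    # The audio arrives base64 encoded in the JSON body
    with open("output.wav", "wb") as f:
        f.write(base64.b64decode(data["audio_base64"]))

    # processing_info/chunk_info keys assumed from test_api.py
    chunk_info = (data.get("processing_info") or {}).get("chunk_info", {})
    print(data["duration_seconds"], chunk_info.get("total_chunks"))
```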
api/audio_concatenator.py
ADDED
@@ -0,0 +1,196 @@
"""Audio processing utilities for the TTS API."""

import re
from typing import List


class AudioConcatenator:
    """Server-side audio concatenation with GPU acceleration."""

    def __init__(self, silence_duration: float = 0.5, fade_duration: float = 0.1):
        """
        Initialize the audio concatenator.

        Args:
            silence_duration: Duration of silence between chunks (seconds)
            fade_duration: Duration of fade in/out effects (seconds)
        """
        self.silence_duration = silence_duration
        self.fade_duration = fade_duration

    def concatenate_audio_chunks(self, audio_chunks: List, sample_rate: int):
        """
        Concatenate multiple audio chunks into a single audio file.

        Args:
            audio_chunks: List of audio arrays
            sample_rate: Sample rate for the audio

        Returns:
            Concatenated audio array
        """
        if not audio_chunks:
            raise ValueError("No audio chunks to concatenate")

        if len(audio_chunks) == 1:
            # Handle single chunk case
            audio = audio_chunks[0]
            if isinstance(audio, tuple):
                return audio[0]  # Extract audio data from tuple
            return audio

        import numpy as np
        import torch

        # Normalize and prepare audio data
        normalized_chunks = []
        for i, audio_data in enumerate(audio_chunks):
            print(f"Processing chunk {i}: type={type(audio_data)}")

            # Handle tuple format (common from TTS models)
            if isinstance(audio_data, tuple):
                audio_data = audio_data[0]  # Extract audio array from tuple
                print(f"  Extracted from tuple: type={type(audio_data)}")

            # Convert torch tensor to numpy if needed
            if hasattr(audio_data, 'cpu'):  # It's a torch tensor
                audio_data = audio_data.cpu().numpy()
                print(f"  Converted from torch: shape={audio_data.shape}")

            # Convert to numpy array if needed
            if not isinstance(audio_data, np.ndarray):
                audio_data = np.array(audio_data)

            print(f"  Final shape before processing: {audio_data.shape}")

            # Handle different audio shapes
            if audio_data.ndim == 1:
                # Already 1D, perfect
                normalized_audio = audio_data
            elif audio_data.ndim == 2:
                # Handle 2D audio - could be (channels, samples) or (samples, channels)
                if audio_data.shape[0] < audio_data.shape[1]:
                    # Likely (channels, samples) - take first channel
                    normalized_audio = audio_data[0, :]
                    print(f"  Used first channel from (C, L) format: {normalized_audio.shape}")
                else:
                    # Likely (samples, channels) - take first channel
                    normalized_audio = audio_data[:, 0]
                    print(f"  Used first channel from (L, C) format: {normalized_audio.shape}")
            else:
                # Flatten higher dimensional arrays
                normalized_audio = audio_data.flatten()
                print(f"  Flattened {audio_data.ndim}D array: {normalized_audio.shape}")

            # Ensure we have valid audio data
            if len(normalized_audio) == 0:
                print(f"  Warning: Empty audio chunk {i}")
                continue

            print(f"  Chunk {i} final length: {len(normalized_audio)} samples ({len(normalized_audio)/sample_rate:.2f}s)")

            # Normalize audio levels
            normalized_audio = self._normalize_audio(normalized_audio)

            # Apply fade effects
            normalized_audio = self._apply_fade_effects(normalized_audio, sample_rate)

            normalized_chunks.append(normalized_audio)

        if not normalized_chunks:
            raise ValueError("No valid audio chunks after processing")

        print(f"Successfully processed {len(normalized_chunks)} chunks")

        # Create silence segments
        silence_samples = int(self.silence_duration * sample_rate)
        silence = np.zeros(silence_samples, dtype=np.float32)
        print(f"Adding {silence_samples} silence samples ({self.silence_duration}s) between chunks")

        # Concatenate all chunks with silence in between
        concatenated_segments = []
        total_audio_length = 0

        for i, chunk in enumerate(normalized_chunks):
            concatenated_segments.append(chunk)
            total_audio_length += len(chunk)
            print(f"Added chunk {i}: {len(chunk)} samples")

            # Add silence between chunks (but not after the last chunk)
            if i < len(normalized_chunks) - 1:
                concatenated_segments.append(silence)
                total_audio_length += len(silence)
                print(f"Added silence: {len(silence)} samples")

        # Combine all segments
        final_audio = np.concatenate(concatenated_segments)
        print(f"Final concatenated audio: {len(final_audio)} samples ({len(final_audio)/sample_rate:.2f}s)")

        # Final normalization and cleanup
        final_audio = self._normalize_audio(final_audio)
        final_audio = self._remove_clicks_and_pops(final_audio)

        return final_audio

    def _normalize_audio(self, audio_data):
        """Normalize audio to prevent clipping."""
        import numpy as np

        # Convert to numpy array if it's not already
        if not isinstance(audio_data, np.ndarray):
            audio_data = np.array(audio_data)

        # Ensure it's a 1D array
        if audio_data.ndim > 1:
            audio_data = audio_data.flatten()

        # Find the maximum absolute value
        max_val = np.max(np.abs(audio_data))

        if max_val == 0:
            return audio_data

        # Normalize to 95% of maximum to leave some headroom
        normalized = audio_data * (0.95 / max_val)

        return normalized.astype(np.float32)

    def _apply_fade_effects(self, audio_data, sample_rate: int):
        """Apply fade in and fade out effects to reduce pops and clicks."""
        import numpy as np

        fade_samples = int(self.fade_duration * sample_rate)

        if len(audio_data) < 2 * fade_samples:
            # If audio is too short for fade effects, return as-is
            return audio_data

        audio_with_fades = audio_data.copy()

        # Apply fade in
        fade_in = np.linspace(0, 1, fade_samples)
        audio_with_fades[:fade_samples] *= fade_in

        # Apply fade out
        fade_out = np.linspace(1, 0, fade_samples)
        audio_with_fades[-fade_samples:] *= fade_out

        return audio_with_fades

    def _remove_clicks_and_pops(self, audio_data):
        """Apply basic filtering to remove clicks and pops."""
        try:
            # Simple high-pass filter to remove DC offset and low-frequency artifacts
            from scipy import signal
            import numpy as np

            # Design a high-pass filter (removes frequencies below 80 Hz)
            # This helps remove some pops and clicks while preserving speech
            nyquist = 22050 / 2  # Assuming common sample rate
            low = 80 / nyquist
            b, a = signal.butter(4, low, btype='high')
            filtered_audio = signal.filtfilt(b, a, audio_data)
            return filtered_audio.astype(np.float32)
        except ImportError:
            # If scipy is not available, return original audio
            return audio_data
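For reference, a minimal usage sketch for `AudioConcatenator` (not part of the commit). It feeds two synthetic NumPy sine-wave chunks through the concatenator in place of real TTS output; the 22050 Hz sample rate, tone frequencies, and import path are illustrative assumptions:

```python
import numpy as np
from audio_concatenator import AudioConcatenator  # import path assumed

sample_rate = 22050  # assumed sample rate for this sketch

# Two short synthetic "speech" chunks: 1 s at 440 Hz and 1 s at 330 Hz
t = np.linspace(0, 1.0, sample_rate, endpoint=False)
chunks = [
    0.5 * np.sin(2 * np.pi * 440 * t).astype(np.float32),
    0.5 * np.sin(2 * np.pi * 330 * t).astype(np.float32),
]

concatenator = AudioConcatenator(silence_duration=0.5, fade_duration=0.1)
audio = concatenator.concatenate_audio_chunks(chunks, sample_rate)

# Expected length: two 1 s chunks plus 0.5 s of silence between them
print(len(audio) / sample_rate)  # ~2.5 seconds
```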
api/demo.py
ADDED
@@ -0,0 +1,177 @@
#!/usr/bin/env python3
"""
Quick demonstration script for the Enhanced Chatterbox TTS API
Shows how to use the new full-text endpoints for processing long documents
"""

import requests
import os
from pathlib import Path
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

def demo_full_text_processing():
    """Demonstrate full-text processing with a sample document"""

    # Sample long text (like from a PDF)
    sample_document = """
    Artificial Intelligence has revolutionized numerous industries and continues to shape our world in unprecedented ways. From healthcare to transportation, AI systems are becoming increasingly sophisticated and capable of performing complex tasks that were once thought to be exclusively human domains.

    In healthcare, AI-powered diagnostic systems can now identify diseases with remarkable accuracy, sometimes surpassing human doctors in specific areas. Machine learning algorithms analyze medical images, predict patient outcomes, and assist in drug discovery processes. This technological advancement has the potential to make healthcare more accessible and effective globally.

    The transportation sector has also witnessed significant AI integration. Autonomous vehicles use computer vision, sensor fusion, and deep learning to navigate complex environments safely. These systems process vast amounts of real-time data to make split-second decisions, potentially reducing traffic accidents and improving transportation efficiency.

    However, with these advancements come important ethical considerations. Issues of privacy, job displacement, and algorithmic bias must be carefully addressed as AI systems become more prevalent in society. It is crucial that we develop AI responsibly, ensuring that these powerful technologies benefit humanity while minimizing potential risks.

    The future of AI holds immense promise, but it requires thoughtful implementation and continuous oversight to ensure that its development aligns with human values and societal needs.
    """

    endpoint = os.getenv("FULL_TEXT_TTS_ENDPOINT")
    if not endpoint:
        print("❌ FULL_TEXT_TTS_ENDPOINT not configured")
        print("Please set the environment variable or update your .env file")
        return False

    print("🎙️ Enhanced Chatterbox TTS API Demo")
    print("=" * 50)
    print(f"Processing document ({len(sample_document)} characters)...")

    try:
        # Send request to full-text endpoint
        response = requests.post(
            endpoint,
            json={
                "text": sample_document.strip(),
                "max_chunk_size": 600,    # Smaller chunks for better processing
                "silence_duration": 0.6,  # Slightly longer pause between chunks
                "fade_duration": 0.2,     # Smooth transitions
                "overlap_sentences": 1    # Overlap for better continuity
            },
            timeout=180  # Allow time for processing
        )

        if response.status_code == 200:
            # Save the generated audio
            Path("demo_output").mkdir(exist_ok=True)
            output_file = "demo_output/ai_document_speech.wav"

            with open(output_file, "wb") as f:
                f.write(response.content)

            # Extract processing information from headers
            duration = response.headers.get('X-Audio-Duration', 'unknown')
            chunks = response.headers.get('X-Chunks-Processed', 'unknown')
            characters = response.headers.get('X-Total-Characters', 'unknown')

            print("✅ Success! Audio generated and saved")
            print(f"📁 File: {output_file}")
            print(f"⏱️ Duration: {duration} seconds")
            print(f"🧩 Chunks processed: {chunks}")
            print(f"📝 Characters: {characters}")
            print(f"💾 File size: {Path(output_file).stat().st_size / 1024:.1f} KB")

            return True
        else:
            print(f"❌ Request failed with status {response.status_code}")
            print(f"Response: {response.text}")
            return False

    except requests.exceptions.Timeout:
        print("⏰ Request timed out - the document might be too long")
        return False
    except Exception as e:
        print(f"❌ Error: {e}")
        return False


def demo_comparison():
    """Compare standard vs full-text processing"""

    short_text = "This is a short text for comparison."
    medium_text = """
    This is a medium-length text that demonstrates the difference between
    standard and full-text processing endpoints. The full-text endpoint
    provides better handling for longer content with intelligent chunking
    and server-side concatenation.
    """

    standard_endpoint = os.getenv("GENERATE_AUDIO_ENDPOINT")
    fulltext_endpoint = os.getenv("FULL_TEXT_TTS_ENDPOINT")

    if not (standard_endpoint and fulltext_endpoint):
        print("⚠️ Missing endpoint configuration for comparison")
        return False

    print("\n🔄 Comparison Demo")
    print("=" * 30)

    try:
        import time

        # Test standard endpoint
        print("Testing standard endpoint...")
        start_time = time.time()
        response1 = requests.post(
            standard_endpoint,
            json={"text": short_text},
            timeout=30
        )
        standard_time = time.time() - start_time

        # Test full-text endpoint
        print("Testing full-text endpoint...")
        start_time = time.time()
        response2 = requests.post(
            fulltext_endpoint,
            json={"text": medium_text.strip(), "max_chunk_size": 400},
            timeout=60
        )
        fulltext_time = time.time() - start_time

        print(f"\n📊 Results:")
        print(f"Standard endpoint: {standard_time:.2f}s (short text)")
        print(f"Full-text endpoint: {fulltext_time:.2f}s (medium text)")

        if response2.status_code == 200:
            chunks = response2.headers.get('X-Chunks-Processed', 'unknown')
            print(f"Full-text chunks processed: {chunks}")

        return True

    except Exception as e:
        print(f"❌ Comparison error: {e}")
        return False


def main():
    """Run the demonstration"""
    print("🚀 Enhanced Chatterbox TTS API Demonstration")
    print("This demo showcases the new full-text processing capabilities")
    print()

    # Check if .env file exists
    if not Path(".env").exists():
        print("📝 Creating sample .env file...")
        print("Please update it with your actual Modal endpoint URLs")

        env_content = """# Enhanced Chatterbox TTS API Endpoints
FULL_TEXT_TTS_ENDPOINT=https://YOUR-MODAL-ENDPOINT.modal.run/generate_full_text_audio
GENERATE_AUDIO_ENDPOINT=https://YOUR-MODAL-ENDPOINT.modal.run/generate_audio
"""
        with open(".env", "w") as f:
            f.write(env_content)
        print("✅ Sample .env file created")
        return

    # Run demonstrations
    demo_full_text_processing()
    demo_comparison()

    print("\n🎉 Demo complete!")
    print("Check the demo_output/ directory for generated audio files")


if __name__ == "__main__":
    main()
api/models.py
CHANGED
@@ -4,24 +4,43 @@ Pydantic models for request/response validation and API documentation.
 """
 
 from typing import Optional
-from pydantic import BaseModel
+from pydantic import BaseModel, Field
 
 
 class TTSRequest(BaseModel):
     """Request model for TTS generation with optional voice cloning."""
-    text: str
-    voice_prompt_base64: Optional[str] = None
+    text: str = Field(..., description="Text to convert to speech", max_length=5000)
+    voice_prompt_base64: Optional[str] = Field(None, description="Base64 encoded voice prompt audio")
+
+
+class FullTextTTSRequest(BaseModel):
+    """Request model for full-text TTS generation with server-side processing."""
+    text: str = Field(..., description="Full text to convert to speech (any length)")
+    voice_prompt_base64: Optional[str] = Field(None, description="Base64 encoded voice prompt audio")
+    max_chunk_size: Optional[int] = Field(800, description="Maximum characters per chunk")
+    silence_duration: Optional[float] = Field(0.5, description="Silence duration between chunks (seconds)")
+    fade_duration: Optional[float] = Field(0.1, description="Fade in/out duration (seconds)")
+    overlap_sentences: Optional[int] = Field(0, description="Number of sentences to overlap between chunks")
 
 
 class TTSResponse(BaseModel):
     """Response model for TTS generation with JSON output."""
-    success: bool
-    message: str
-    audio_base64: Optional[str] = None
-    duration_seconds: Optional[float] = None
+    success: bool = Field(..., description="Whether the request was successful")
+    message: str = Field(..., description="Status message")
+    audio_base64: Optional[str] = Field(None, description="Base64 encoded audio data")
+    duration_seconds: Optional[float] = Field(None, description="Duration of generated audio in seconds")
+
+
+class FullTextTTSResponse(BaseModel):
+    """Response model for full-text TTS generation."""
+    success: bool = Field(..., description="Whether the request was successful")
+    message: str = Field(..., description="Status message")
+    audio_base64: Optional[str] = Field(None, description="Base64 encoded audio data")
+    duration_seconds: Optional[float] = Field(None, description="Duration of generated audio in seconds")
+    processing_info: Optional[dict] = Field(None, description="Information about the processing (chunks, etc.)")
 
 
 class HealthResponse(BaseModel):
     """Response model for health check endpoint."""
-    status: str
-    model_loaded: bool
+    status: str = Field(..., description="Service status")
+    model_loaded: bool = Field(..., description="Whether the TTS model is loaded")
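For reference, a quick sketch (not part of the commit) of how the enhanced request model behaves. Field names and defaults come from the diff above; the import path is an assumption:

```python
from api.models import FullTextTTSRequest  # import path assumed

# Omitted chunking parameters fall back to the Field defaults
req = FullTextTTSRequest(text="A long document...")
print(req.max_chunk_size, req.silence_duration)  # 800 0.5

# Pydantic rejects a payload missing the required `text` field
try:
    FullTextTTSRequest()
except Exception as err:
    print(type(err).__name__)  # ValidationError
```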
api/test_api.py
ADDED
@@ -0,0 +1,455 @@
#!/usr/bin/env python3
"""
Test script for the enhanced Chatterbox TTS Modal API
This script demonstrates how to interact with all the new endpoints
"""

import requests
import base64
import json
import os
from pathlib import Path
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Base URLs for the deployed endpoints
ENDPOINTS = {
    "health": os.getenv("HEALTH_ENDPOINT"),
    "generate_audio": os.getenv("GENERATE_AUDIO_ENDPOINT"),
    "generate_json": os.getenv("GENERATE_JSON_ENDPOINT"),
    "generate_with_file": os.getenv("GENERATE_WITH_FILE_ENDPOINT"),
    "generate": os.getenv("GENERATE_ENDPOINT"),
    "generate_full_text_audio": os.getenv("FULL_TEXT_TTS_ENDPOINT"),
    "generate_full_text_json": os.getenv("FULL_TEXT_JSON_ENDPOINT")
}

def test_health_check():
    """Test the health check endpoint"""
    print("Testing health check...")
    try:
        response = requests.get(ENDPOINTS["health"])
        print(f"Status: {response.status_code}")
        print(f"Response: {response.json()}")
        return response.status_code == 200
    except Exception as e:
        print(f"Health check failed: {e}")
        return False

def test_basic_generation():
    """Test basic text-to-speech generation"""
    print("\nTesting basic audio generation...")
    try:
        response = requests.post(
            ENDPOINTS["generate_audio"],
            json={"text": "Hello, this is Chatterbox TTS running on Modal!"}
        )
        if response.status_code == 200:
            Path("output").mkdir(exist_ok=True)
            with open("output/basic_output.wav", "wb") as f:
                f.write(response.content)
            print("✅ Basic generation successful - saved as output/basic_output.wav")
            return True
        else:
            print(f"❌ Basic generation failed: {response.status_code}")
            print(f"Response: {response.text}")
            return False
    except Exception as e:
        print(f"❌ Basic generation error: {e}")
        return False

def test_json_generation():
    """Test JSON response with base64 audio"""
    print("\nTesting JSON audio generation...")
    try:
        response = requests.post(
            ENDPOINTS["generate_json"],
            json={"text": "This returns JSON with base64 audio data"}
        )
        if response.status_code == 200:
            data = response.json()
            if data['success'] and data['audio_base64']:
                # Decode base64 audio and save
                Path("output").mkdir(exist_ok=True)
                audio_data = base64.b64decode(data['audio_base64'])
                with open("output/json_output.wav", "wb") as f:
                    f.write(audio_data)
                print(f"✅ JSON generation successful - Duration: {data['duration_seconds']:.2f}s")
                print("   Saved as output/json_output.wav")
                return True
            else:
                print(f"❌ JSON generation failed: {data['message']}")
                return False
        else:
            print(f"❌ JSON generation failed: {response.status_code}")
            print(f"Response: {response.text}")
            return False
    except Exception as e:
        print(f"❌ JSON generation error: {e}")
        return False

def test_voice_cloning():
    """Test voice cloning with audio prompt"""
    print("\nTesting voice cloning...")

    # First, check if we have a sample audio file
    sample_file = Path("voice_sample.wav")
    if not sample_file.exists():
        print("⚠️ No voice_sample.wav found - skipping voice cloning test")
        print("   To test voice cloning, add a voice_sample.wav file")
        return True

    try:
        # Read the voice sample and encode as base64
        with open(sample_file, "rb") as f:
            voice_data = base64.b64encode(f.read()).decode()

        response = requests.post(
            ENDPOINTS["generate_audio"],
            json={
                "text": "This should sound like the provided voice sample!",
                "voice_prompt_base64": voice_data
            }
        )

        if response.status_code == 200:
            Path("output").mkdir(exist_ok=True)
            with open("output/cloned_output.wav", "wb") as f:
                f.write(response.content)
            print("✅ Voice cloning successful - saved as output/cloned_output.wav")
            return True
        else:
            print(f"❌ Voice cloning failed: {response.status_code}")
            print(f"Response: {response.text}")
            return False
    except Exception as e:
        print(f"❌ Voice cloning error: {e}")
        return False

def test_file_upload():
    """Test file upload endpoint"""
    print("\nTesting file upload...")

    sample_file = Path("voice_sample.wav")
    if not sample_file.exists():
        print("⚠️ No voice_sample.wav found - testing without voice prompt")
        files = None
    else:
        files = {"voice_prompt": open(sample_file, "rb")}

    try:
        data = {"text": "Testing the file upload endpoint!"}
        response = requests.post(ENDPOINTS["generate_with_file"], data=data, files=files)

        if files:
            files["voice_prompt"].close()

        if response.status_code == 200:
            Path("output").mkdir(exist_ok=True)
            with open("output/upload_output.wav", "wb") as f:
                f.write(response.content)
            print("✅ File upload successful - saved as output/upload_output.wav")
            return True
        else:
            print(f"❌ File upload failed: {response.status_code}")
            print(f"Response: {response.text}")
            return False
    except Exception as e:
        print(f"❌ File upload error: {e}")
        return False

def test_legacy_endpoint():
    """Test backward compatibility with legacy endpoint"""
    print("\nTesting legacy endpoint...")
    try:
        # Legacy endpoint expects query parameters, not form data
        response = requests.post(
            ENDPOINTS["generate"],
            params={"prompt": "Testing the legacy endpoint for backward compatibility"}
        )
        if response.status_code == 200:
            Path("output").mkdir(exist_ok=True)
            with open("output/legacy_output.wav", "wb") as f:
                f.write(response.content)
            print("✅ Legacy endpoint successful - saved as output/legacy_output.wav")
            return True
        else:
            print(f"❌ Legacy endpoint failed: {response.status_code}")
            print(f"Response: {response.text}")
            return False
    except Exception as e:
        print(f"❌ Legacy endpoint error: {e}")
        return False

def test_full_text_generation():
    """Test full-text audio generation with server-side chunking"""
    print("\nTesting full-text audio generation...")

    # Create a long text that will require chunking
    long_text = """
    This is a comprehensive test of the full-text audio generation endpoint.
    The text is intentionally long to demonstrate the server-side chunking capabilities.

    The enhanced API will automatically split this text into appropriate chunks,
    process them in parallel using GPU acceleration, and then concatenate the
    resulting audio segments with proper transitions and fade effects.

    This approach significantly improves performance for long documents while
    maintaining high audio quality and natural speech flow. The server handles
    all the complex processing, allowing the client to simply send the full text
    and receive the final audio file.

    The chunking algorithm respects sentence and paragraph boundaries to ensure
    natural speech patterns and maintains proper context across chunk boundaries.
    This results in more natural-sounding speech for long-form content.
    """

    try:
        if not ENDPOINTS["generate_full_text_audio"]:
            print("⚠️ FULL_TEXT_TTS_ENDPOINT not configured - skipping full-text test")
            return True

        response = requests.post(
            ENDPOINTS["generate_full_text_audio"],
            json={
                "text": long_text.strip(),
                "max_chunk_size": 400,  # Smaller chunks for testing
                "silence_duration": 0.3,
                "fade_duration": 0.1,
                "overlap_sentences": 0
            },
            timeout=120  # Longer timeout for processing
        )

        if response.status_code == 200:
            Path("output").mkdir(exist_ok=True)
            with open("output/full_text_output.wav", "wb") as f:
                f.write(response.content)

            # Check response headers for processing info
            duration = response.headers.get('X-Audio-Duration', 'unknown')
            chunks = response.headers.get('X-Chunks-Processed', 'unknown')
            characters = response.headers.get('X-Total-Characters', len(long_text))

            print("✅ Full-text generation successful")
            print(f"   Duration: {duration}s")
            print(f"   Chunks processed: {chunks}")
            print(f"   Characters: {characters}")
            print("   Saved as output/full_text_output.wav")
            return True
        else:
            print(f"❌ Full-text generation failed: {response.status_code}")
            print(f"Response: {response.text}")
            return False
    except requests.exceptions.Timeout:
        print("⏰ Full-text generation timed out (this may be normal for very long texts)")
        return False
    except Exception as e:
        print(f"❌ Full-text generation error: {e}")
        return False


def test_full_text_json():
    """Test full-text JSON response with processing information"""
    print("\nTesting full-text JSON response...")

    test_text = """
    This is a test of the full-text JSON endpoint that returns detailed
    processing information along with the base64 encoded audio data.

    The response includes chunk information, processing parameters,
    and timing details that can be useful for monitoring and debugging.
    """

    try:
        if not ENDPOINTS["generate_full_text_json"]:
            print("⚠️ FULL_TEXT_JSON_ENDPOINT not configured - skipping test")
            return True

        response = requests.post(
            ENDPOINTS["generate_full_text_json"],
            json={
                "text": test_text.strip(),
                "max_chunk_size": 300,
                "silence_duration": 0.4,
                "fade_duration": 0.15
            },
            timeout=60
        )

        if response.status_code == 200:
            data = response.json()
            if data['success'] and data['audio_base64']:
                # Decode and save audio
                Path("output").mkdir(exist_ok=True)
                audio_data = base64.b64decode(data['audio_base64'])
                with open("output/full_text_json_output.wav", "wb") as f:
                    f.write(audio_data)

                # Display processing information
                print("✅ Full-text JSON generation successful")
                print(f"   Duration: {data['duration_seconds']:.2f}s")

                if 'processing_info' in data:
                    info = data['processing_info']
                    if 'chunk_info' in info:
                        chunk_info = info['chunk_info']
                        print(f"   Chunks: {chunk_info.get('total_chunks', 'unknown')}")
                        print(f"   Characters: {chunk_info.get('total_characters', 'unknown')}")
                        print(f"   Avg chunk size: {chunk_info.get('avg_chunk_size', 'unknown'):.0f}")

                print("   Saved as output/full_text_json_output.wav")
                return True
            else:
                print(f"❌ Full-text JSON generation failed: {data['message']}")
                return False
        else:
            print(f"❌ Full-text JSON generation failed: {response.status_code}")
            print(f"Response: {response.text}")
            return False
    except Exception as e:
        print(f"❌ Full-text JSON generation error: {e}")
        return False


def test_performance_comparison():
    """Compare performance between standard and full-text endpoints"""
    print("\nTesting performance comparison...")

    # Short text for standard endpoint
    short_text = "This is a short text for performance comparison testing."

    # Medium text that benefits from chunking
    medium_text = """
    This is a medium-length text designed to test the performance differences
    between the standard endpoint and the enhanced full-text endpoint.

    The full-text endpoint should show its advantages when processing longer
    texts that require intelligent chunking and parallel processing.

    This text is long enough to require multiple chunks but not so long
    that it becomes unwieldy for testing purposes.
    """

    results = {}

    try:
        # Test standard endpoint with short text
        import time
        start_time = time.time()
        response = requests.post(
            ENDPOINTS["generate_audio"],
            json={"text": short_text},
            timeout=30
        )
        if response.status_code == 200:
            results['standard_short'] = time.time() - start_time
            print(f"✅ Standard endpoint (short): {results['standard_short']:.2f}s")

        # Test full-text endpoint with medium text
        if ENDPOINTS["generate_full_text_audio"]:
            start_time = time.time()
            response = requests.post(
                ENDPOINTS["generate_full_text_audio"],
                json={
                    "text": medium_text.strip(),
                    "max_chunk_size": 300
                },
                timeout=60
            )
            if response.status_code == 200:
                results['fulltext_medium'] = time.time() - start_time
                chunks = response.headers.get('X-Chunks-Processed', 'unknown')
                print(f"✅ Full-text endpoint (medium, {chunks} chunks): {results['fulltext_medium']:.2f}s")

        # Summary
        if results:
            print("   Performance comparison complete!")
            return True
        else:
            print("   Could not complete performance comparison")
            return False

    except Exception as e:
        print(f"❌ Performance comparison error: {e}")
        return False

def main():
    """Run all tests"""
    print("Enhanced Chatterbox TTS API Test Suite")
    print("=" * 50)

    # Check if required endpoints are configured
    missing_endpoints = [name for name, url in ENDPOINTS.items() if not url]
    if missing_endpoints:
        print("⚠️ Warning: Some endpoints not configured:")
        for endpoint in missing_endpoints:
            print(f"   {endpoint}")
        print("   Set environment variables in .env file")
        print()

    tests = [
        test_health_check,
        test_basic_generation,
        test_json_generation,
        test_voice_cloning,
        test_file_upload,
        test_legacy_endpoint,
        test_full_text_generation,
        test_performance_comparison
    ]

    results = []
    for test in tests:
        results.append(test())

    print("\n" + "=" * 50)
    print("Test Results:")
    passed = sum(results)
    total = len(results)
    print(f"✅ {passed}/{total} tests passed")

    if passed == total:
        print("🎉 All tests passed!")
        print("\nGenerated files in output/ directory:")
        output_dir = Path("output")
        if output_dir.exists():
            for file in output_dir.glob("*.wav"):
                size_kb = file.stat().st_size / 1024
                print(f"   {file.name} ({size_kb:.1f} KB)")
    else:
        print("❌ Some tests failed - check your Modal deployment")

    print("\nAPI Endpoints tested:")
    for name, url in ENDPOINTS.items():
        status = "✅" if url else "❌"
        print(f"   {status} {name}: {url or 'Not configured'}")


def create_sample_env_file():
    """Create a sample .env file with endpoint placeholders"""
    env_content = """# Enhanced Chatterbox TTS API Endpoints
# Replace YOUR-MODAL-ENDPOINT with your actual Modal deployment URL

HEALTH_ENDPOINT=https://YOUR-MODAL-ENDPOINT.modal.run/health
GENERATE_AUDIO_ENDPOINT=https://YOUR-MODAL-ENDPOINT.modal.run/generate_audio
GENERATE_JSON_ENDPOINT=https://YOUR-MODAL-ENDPOINT.modal.run/generate_json
GENERATE_WITH_FILE_ENDPOINT=https://YOUR-MODAL-ENDPOINT.modal.run/generate_with_file
GENERATE_ENDPOINT=https://YOUR-MODAL-ENDPOINT.modal.run/generate

# New enhanced endpoints
FULL_TEXT_TTS_ENDPOINT=https://YOUR-MODAL-ENDPOINT.modal.run/generate_full_text_audio
FULL_TEXT_JSON_ENDPOINT=https://YOUR-MODAL-ENDPOINT.modal.run/generate_full_text_json
"""

    if not Path(".env").exists():
        with open(".env", "w") as f:
            f.write(env_content)
        print("Created sample .env file - please update with your actual endpoints")


if __name__ == "__main__":
    # Create sample .env if it doesn't exist
    create_sample_env_file()
    main()
api/text_processing.py
ADDED
@@ -0,0 +1,174 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Text processing utilities for the TTS API."""

import re
from typing import List


class TextChunker:
    """Server-side text chunking for optimal GPU processing."""

    def __init__(self, max_chunk_size: int = 800, overlap_sentences: int = 0):
        """
        Initialize the text chunker.

        Args:
            max_chunk_size: Maximum number of characters per chunk
            overlap_sentences: Number of sentences to overlap between chunks for continuity
        """
        self.max_chunk_size = max_chunk_size
        self.overlap_sentences = overlap_sentences

    def chunk_text(self, text: str) -> List[str]:
        """
        Break text into smaller chunks based on paragraphs and sentence boundaries.

        Args:
            text: The input text to chunk

        Returns:
            List of text chunks
        """
        if not text or not text.strip():
            return []

        # Clean the text
        text = text.strip()

        # If text is within the limit, return as single chunk
        if len(text) <= self.max_chunk_size:
            return [text]

        chunks = []

        # First, try to split by paragraphs
        paragraphs = self._split_into_paragraphs(text)

        current_chunk = ""

        for paragraph in paragraphs:
            # If adding this paragraph would exceed the limit
            if len(current_chunk) + len(paragraph) + 1 > self.max_chunk_size:
                # If we have content in current chunk, save it
                if current_chunk.strip():
                    chunks.append(current_chunk.strip())
                    current_chunk = ""

                # If the paragraph itself is too long, split it by sentences
                if len(paragraph) > self.max_chunk_size:
                    sentence_chunks = self._split_paragraph_into_sentences(paragraph)
                    for sentence_chunk in sentence_chunks:
                        if len(current_chunk) + len(sentence_chunk) + 1 > self.max_chunk_size:
                            if current_chunk.strip():
                                chunks.append(current_chunk.strip())
                            current_chunk = sentence_chunk
                        else:
                            if current_chunk:
                                current_chunk += " " + sentence_chunk
                            else:
                                current_chunk = sentence_chunk
                else:
                    current_chunk = paragraph
            else:
                # Add paragraph to current chunk
                if current_chunk:
                    current_chunk += "\n\n" + paragraph
                else:
                    current_chunk = paragraph

        # Add any remaining content
        if current_chunk.strip():
            chunks.append(current_chunk.strip())

        # Apply overlap if specified
        if self.overlap_sentences > 0 and len(chunks) > 1:
            chunks = self._add_overlap(chunks)

        return chunks

    def _split_into_paragraphs(self, text: str) -> List[str]:
        """Split text into paragraphs."""
        # Split by double newlines or multiple spaces
        paragraphs = re.split(r'\n\s*\n|(?:\n\s*){2,}', text)
        # Filter out empty paragraphs and strip whitespace
        return [p.strip() for p in paragraphs if p.strip()]

    def _split_paragraph_into_sentences(self, paragraph: str) -> List[str]:
        """Split a long paragraph into sentence-based chunks."""
        # Split by sentence boundaries
        sentences = re.split(r'(?<=[.!?])\s+', paragraph)

        chunks = []
        current_chunk = ""

        for sentence in sentences:
            # If a single sentence is longer than max_chunk_size, we need to force-split it
            if len(sentence) > self.max_chunk_size:
                # Save current chunk if it has content
                if current_chunk.strip():
                    chunks.append(current_chunk.strip())
                    current_chunk = ""

                # Force-split the long sentence into smaller pieces
                while len(sentence) > self.max_chunk_size:
                    # Find a good breaking point (prefer spaces)
                    break_point = self.max_chunk_size
                    if ' ' in sentence[:self.max_chunk_size]:
                        # Find the last space within the limit
                        break_point = sentence[:self.max_chunk_size].rfind(' ')

                    chunk_part = sentence[:break_point]
                    chunks.append(chunk_part)
                    sentence = sentence[break_point:].strip()

                # Add the remaining part of the sentence
                if sentence:
                    current_chunk = sentence

            elif len(current_chunk) + len(sentence) + 1 > self.max_chunk_size:
                if current_chunk.strip():
                    chunks.append(current_chunk.strip())
                current_chunk = sentence
            else:
                if current_chunk:
                    current_chunk += " " + sentence
                else:
                    current_chunk = sentence

        if current_chunk.strip():
            chunks.append(current_chunk.strip())

        return chunks

    def _add_overlap(self, chunks: List[str]) -> List[str]:
        """Add sentence overlap between chunks for better continuity."""
        if len(chunks) <= 1:
            return chunks

        overlapped_chunks = [chunks[0]]  # First chunk stays the same

        for i in range(1, len(chunks)):
            # Get last few sentences from previous chunk
            prev_chunk = chunks[i - 1]
            current_chunk = chunks[i]

            prev_sentences = re.split(r'(?<=[.!?])\s+', prev_chunk)
            overlap_text = " ".join(prev_sentences[-self.overlap_sentences:]) if len(prev_sentences) > self.overlap_sentences else ""

            if overlap_text:
                overlapped_chunk = overlap_text + " " + current_chunk
            else:
                overlapped_chunk = current_chunk

            overlapped_chunks.append(overlapped_chunk)

        return overlapped_chunks

    def get_chunk_info(self, chunks: List[str]) -> dict:
        """Get information about the chunks."""
        return {
            "total_chunks": len(chunks),
            "total_characters": sum(len(chunk) for chunk in chunks),
            "avg_chunk_size": sum(len(chunk) for chunk in chunks) / len(chunks) if chunks else 0,
            "max_chunk_size": max(len(chunk) for chunk in chunks) if chunks else 0,
            "min_chunk_size": min(len(chunk) for chunk in chunks) if chunks else 0
        }
```
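
A quick usage sketch of the `TextChunker` defined above; the parameter values and sample text are illustrative, not defaults:

```python
# Illustrative use of TextChunker; only chunk_text and get_chunk_info are exercised.
from api.text_processing import TextChunker

chunker = TextChunker(max_chunk_size=200, overlap_sentences=1)

long_text = (
    "First paragraph. It has two sentences.\n\n"
    + "Second paragraph is here. It also has a couple of sentences. " * 3
)

chunks = chunker.chunk_text(long_text)   # paragraph-first, then sentence splits
info = chunker.get_chunk_info(chunks)    # summary stats over the chunk list
print(info["total_chunks"], round(info["avg_chunk_size"], 1))
```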

api/tts_service.py
CHANGED

```diff
@@ -12,11 +12,14 @@ from fastapi.responses import StreamingResponse, Response
 from fastapi import HTTPException, File, UploadFile, Form
 
 from .config import app, image
-from .models import TTSRequest, TTSResponse, HealthResponse
+from .models import TTSRequest, TTSResponse, HealthResponse, FullTextTTSRequest, FullTextTTSResponse
 from .audio_utils import AudioUtils
+from .text_processing import TextChunker
+from .audio_concatenator import AudioConcatenator
 
 with image.imports():
     from chatterbox.tts import ChatterboxTTS
+    import torch  # Add torch import here
 # Suppress specific transformers deprecation warnings
 warnings.filterwarnings("ignore", message=".*past_key_values.*", category=FutureWarning)
 
@@ -276,3 +279,153 @@ class ChatterboxTTSService:
         except Exception as e:
             print(f"Error generating audio: {str(e)}")
             raise HTTPException(status_code=500, detail=f"Audio generation failed: {str(e)}")
+
+    @modal.fastapi_endpoint(docs=True, method="POST")
+    def generate_full_text_audio(self, request: FullTextTTSRequest) -> StreamingResponse:
+        """
+        Generate speech audio from full text with server-side chunking and parallel processing.
+
+        This endpoint handles texts of any length by:
+        1. Chunking the text intelligently (respecting sentence/paragraph boundaries)
+        2. Processing chunks in parallel using GPU resources
+        3. Concatenating audio chunks with proper transitions
+        4. Returning the final audio file
+
+        Args:
+            request: FullTextTTSRequest containing text and processing parameters
+
+        Returns:
+            StreamingResponse with final concatenated audio as WAV file
+        """
+        try:
+            self._validate_text_input(request.text)
+            audio_prompt_path = self._process_voice_prompt(request.voice_prompt_base64)
+
+            print(f"Processing full text ({len(request.text)} chars) with server-side chunking...")
+
+            # Initialize text chunker with request parameters
+            chunker = TextChunker(
+                max_chunk_size=request.max_chunk_size,
+                overlap_sentences=request.overlap_sentences
+            )
+
+            # Chunk the text
+            text_chunks = chunker.chunk_text(request.text)
+            chunk_info = chunker.get_chunk_info(text_chunks)
+            print(f"Split text into {len(text_chunks)} chunks for processing")
+
+            # Initialize audio_chunks variable for processing info
+            audio_chunks = []
+            # If only one chunk, process directly
+            if len(text_chunks) == 1:
+                wav = self._generate_audio(text_chunks[0], audio_prompt_path)
+                # For single chunk, pass the full wav object to maintain consistency
+                final_audio = wav
+                audio_chunks = [wav]  # For consistent processing info
+            else:
+                # Process chunks in parallel
+                import concurrent.futures
+                import numpy as np
+
+                def process_chunk(chunk_text: str):
+                    """Process a single chunk."""
+                    wav_result = self._generate_audio(chunk_text, audio_prompt_path)
+                    # Return the full wav result, not just wav[0]
+                    return wav_result
+
+                # Use ThreadPoolExecutor for parallel processing
+                with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
+                    # Submit all chunks for processing
+                    future_to_chunk = {
+                        executor.submit(process_chunk, chunk): i
+                        for i, chunk in enumerate(text_chunks)
+                    }
+
+                    # Collect results in order
+                    results = [None] * len(text_chunks)
+                    for future in concurrent.futures.as_completed(future_to_chunk):
+                        chunk_index = future_to_chunk[future]
+                        try:
+                            audio_result = future.result()
+                            results[chunk_index] = audio_result
+                        except Exception as exc:
+                            print(f'Chunk {chunk_index} generated an exception: {exc}')
+                            raise HTTPException(status_code=500, detail=f"Failed to process chunk {chunk_index}: {str(exc)}")
+
+                # Filter out None results
+                audio_chunks = [result for result in results if result is not None]
+
+                if len(audio_chunks) != len(text_chunks):
+                    raise HTTPException(status_code=500, detail=f"Only {len(audio_chunks)} out of {len(text_chunks)} chunks processed successfully")
+
+                # Concatenate audio chunks
+                print("Concatenating audio chunks...")
+                concatenator = AudioConcatenator(
+                    silence_duration=request.silence_duration,
+                    fade_duration=request.fade_duration
+                )
+
+                final_audio = concatenator.concatenate_audio_chunks(audio_chunks, self.model.sr)
+
+            # --- Start of new audio processing logic ---
+            import torch
+            import numpy as np
+
+            processed_tensor = final_audio
+            # Unwrap if it's a single-element tuple repeatedly
+            while isinstance(processed_tensor, tuple) and len(processed_tensor) == 1:
+                processed_tensor = processed_tensor[0]
+
+            # Convert to PyTorch tensor if it's a NumPy array
+            if isinstance(processed_tensor, np.ndarray):
+                processed_tensor = torch.from_numpy(processed_tensor.astype(np.float32))
+
+            if not isinstance(processed_tensor, torch.Tensor):  # Check if it's a tensor now
+                raise TypeError(f"Audio data after concatenation is not a tensor. Got type: {type(processed_tensor)}")
+
+            # Ensure correct shape (C, L) for torchaudio.save
+            if processed_tensor.ndim == 1:  # Shape (L,)
+                audio_to_save = processed_tensor.unsqueeze(0)  # Convert to (1, L)
+            elif processed_tensor.ndim == 2:  # Shape (C, L)
+                if processed_tensor.shape[0] == 0:
+                    raise ValueError(f"Audio tensor has 0 channels: {processed_tensor.shape}")
+                if processed_tensor.shape[0] > 1:  # If C > 1 (stereo/multi-channel)
+                    print(f"Multi-channel audio (shape {processed_tensor.shape}) detected. Taking the first channel.")
+                    audio_to_save = processed_tensor[0, :].unsqueeze(0)  # Result is (1, L)
+                else:  # Already (1, L)
+                    audio_to_save = processed_tensor
+            else:
+                raise ValueError(f"Unexpected audio tensor dimensions: {processed_tensor.ndim}, shape: {processed_tensor.shape}")
+            buffer = AudioUtils.save_audio_to_buffer(audio_to_save, self.model.sr)
+            duration = audio_to_save.shape[1] / self.model.sr  # Use shape[1] for length
+
+            # Reset buffer position for reading
+            buffer.seek(0)
+            # --- End of new audio processing logic ---
+            # Prepare processing info
+            processing_info = {
+                "total_chunks": len(text_chunks),
+                "processed_chunks": len(audio_chunks),
+                "failed_chunks": len(text_chunks) - len(audio_chunks),
+                "sample_rate": self.model.sr,
+                "duration": duration
+            }
+
+            print(f"Full text processing complete! Final audio duration: {duration:.2f} seconds")
+            return StreamingResponse(
+                buffer,
+                media_type="audio/wav",
+                headers={
+                    "Content-Disposition": "attachment; filename=generated_full_text_speech.wav",
+                    "X-Audio-Duration": str(duration),
+                    "X-Chunks-Processed": str(len(audio_chunks)),
+                    "X-Total-Characters": str(len(request.text))
+                }
+            )
+
+        except HTTPException as http_exc:
+            print(f"HTTP exception in full text generation: {http_exc.detail}")
+            raise http_exc
+        except Exception as e:
+            error_msg = f"Full text audio generation failed: {str(e)}"
+            print(f"Exception in full text generation: {error_msg}")
+            raise HTTPException(status_code=500, detail=error_msg)
```
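
A minimal client sketch for the new endpoint, assuming a deployed Modal URL (the URL below is a placeholder). The field names mirror `FullTextTTSRequest` as used in the handler above; the numeric values are assumptions, not documented defaults, and `voice_prompt_base64` can be added to the payload for voice cloning:

```python
# Hypothetical client call to the /generate_full_text_audio endpoint sketched above.
import requests

payload = {
    "text": "Your long document text goes here...",
    "max_chunk_size": 800,      # characters per chunk (assumed value)
    "overlap_sentences": 0,     # sentence overlap between chunks (assumed value)
    "silence_duration": 0.5,    # gap between chunks; assumed to be seconds
    "fade_duration": 0.1,       # crossfade length; assumed to be seconds
}

resp = requests.post(
    "https://YOUR-MODAL-ENDPOINT.modal.run/generate_full_text_audio",
    json=payload,
    timeout=600,  # long texts can take a while to synthesize
)
resp.raise_for_status()

# The endpoint streams a WAV file with metadata in custom response headers.
with open("output.wav", "wb") as f:
    f.write(resp.content)

print("Duration:", resp.headers.get("X-Audio-Duration"))
print("Chunks processed:", resp.headers.get("X-Chunks-Processed"))
```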