spagestic committed on
Commit
d1c4aa1
·
1 Parent(s): f3de058

api updated

Files changed (8)
  1. .gitignore +2 -0
  2. api/README.md +232 -13
  3. api/audio_concatenator.py +196 -0
  4. api/demo.py +177 -0
  5. api/models.py +28 -9
  6. api/test_api.py +455 -0
  7. api/text_processing.py +174 -0
  8. api/tts_service.py +154 -1
.gitignore CHANGED
@@ -45,3 +45,5 @@ wheels/
45
  .installed.cfg
46
  *.egg
47
  MANIFEST
45
  .installed.cfg
46
  *.egg
47
  MANIFEST
48
+
49
+ **/output/
api/README.md CHANGED
@@ -1,6 +1,15 @@
1
- # API Package
2
 
3
- This package contains the modular components of the Chatterbox TTS API.
4
 
5
  ## Structure
6
 
@@ -8,9 +17,11 @@ This package contains the modular components of the Chatterbox TTS API.
8
  api/
9
  ├── __init__.py # Package initialization and exports
10
  ├── config.py # Modal app configuration and container image setup
11
- ├── models.py # Pydantic request/response models
12
  ├── audio_utils.py # Audio processing utilities and helper functions

13
  ├── tts_service.py # Main TTS service class with all API endpoints
 
14
  └── README.md # This file
15
  ```
16
 
@@ -18,16 +29,25 @@ api/
18
 
19
  ### config.py
20
 
21
- - Modal app configuration
22
  - Container image setup with required dependencies
23
  - Centralized configuration management
 
24
 
25
  ### models.py
26
 
27
- - `TTSRequest`: Request model for TTS generation
28
- - `TTSResponse`: Response model for JSON endpoints
 
 
29
  - `HealthResponse`: Response model for health checks
30
- - All models include proper type hints and documentation
31
 
32
  ### audio_utils.py
33
 
@@ -42,6 +62,120 @@ api/
42
  - GPU-accelerated TTS model loading and inference
43
  - Multiple API endpoints for different use cases
44
  - Comprehensive error handling and validation
45
 
46
  ## Usage
47
 
@@ -52,10 +186,95 @@ from api import app, ChatterboxTTSService
52
  # The service class contains all the endpoints
53
  ```
54
 
55
- ## Benefits of Modular Architecture
56
 
57
- 1. **Separation of Concerns**: Each file has a specific responsibility
58
- 2. **Maintainability**: Easier to update and modify individual components
59
- 3. **Testability**: Components can be tested in isolation
60
- 4. **Reusability**: Components can be imported and used in other projects
61
- 5. **Readability**: Smaller files are easier to understand and navigate
1
+ # Enhanced Chatterbox TTS API
2
 
3
+ This package contains the modular components of the Enhanced Chatterbox TTS API with GPU-accelerated processing, intelligent text chunking, and server-side audio concatenation.
4
+
5
+ ## Features
6
+
7
+ - **GPU-Accelerated Processing**: Leverage server GPU for parallel chunk processing
8
+ - **Intelligent Text Chunking**: Smart text splitting that respects sentence and paragraph boundaries
9
+ - **Server-Side Concatenation**: Seamless audio merging with fade effects and silence control
10
+ - **Voice Cloning**: Optional voice prompt for personalized speech generation
11
+ - **Multiple Response Formats**: Streaming audio, complete files, or JSON with base64 encoding
12
+ - **Scalable Architecture**: Handles texts of any length efficiently
13
 
14
  ## Structure
15
 
 
17
  api/
18
  ├── __init__.py # Package initialization and exports
19
  ├── config.py # Modal app configuration and container image setup
20
+ ├── models.py # Pydantic request/response models (enhanced with full-text support)
21
  ├── audio_utils.py # Audio processing utilities and helper functions
22
+ ├── text_processing.py # Server-side text chunking
+ ├── audio_concatenator.py # Server-side audio concatenation with fade effects
23
  ├── tts_service.py # Main TTS service class with all API endpoints
24
+ ├── test_api.py # Comprehensive API testing suite
+ ├── demo.py # Demonstration script for the full-text endpoints
25
  └── README.md # This file
26
  ```
27
 
 
29
 
30
  ### config.py
31
 
32
+ - Modal app configuration with GPU support (A10G)
33
  - Container image setup with required dependencies
34
  - Centralized configuration management
35
+ - Memory snapshot and scaling configuration
36
 
37
  ### models.py
38
 
39
+ - `TTSRequest`: Standard request model for TTS generation
40
+ - `FullTextTTSRequest`: Enhanced request model for full-text processing with chunking parameters
41
+ - `TTSResponse`: Standard response model for JSON endpoints
42
+ - `FullTextTTSResponse`: Enhanced response with processing information
43
  - `HealthResponse`: Response model for health checks
44
+ - All models include proper type hints, validation, and documentation (a brief construction sketch follows this list)
45
+
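+ The sketch below is illustrative rather than part of the package; field names and defaults follow `api/models.py`, while the text value is a placeholder.
+
+ ```python
+ # Minimal sketch: constructing a full-text request and letting pydantic validate it.
+ from api.models import FullTextTTSRequest
+
+ req = FullTextTTSRequest(
+     text="A long document to be chunked and synthesized server-side...",
+     max_chunk_size=600,      # characters per chunk
+     silence_duration=0.4,    # seconds of silence between chunks
+ )
+ # Omitted parameters fall back to their defaults from the model definition.
+ print(req.max_chunk_size, req.fade_duration, req.overlap_sentences)  # 600 0.1 0
+ ```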
46
+ ### text_processing.py
47
+
48
+ - `TextChunker`: Intelligent server-side text chunking with configurable parameters
49
+ - Optimized for GPU processing and large text handling
50
+
+ ### audio_concatenator.py
+
+ - `AudioConcatenator`: Server-side audio concatenation with fade effects and silence control (a usage sketch follows this list)
51
 
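+ The following is a rough sketch of how the two classes compose, assuming the package layout in this commit; the per-chunk TTS call is stubbed out with silence, since the real endpoint invokes the Chatterbox model for each chunk.
+
+ ```python
+ import numpy as np
+ from api.text_processing import TextChunker
+ from api.audio_concatenator import AudioConcatenator
+
+ SAMPLE_RATE = 24000  # illustrative; the service passes the model's own rate (self.model.sr)
+ long_text = "First paragraph of a long document...\n\nSecond paragraph..."
+
+ chunker = TextChunker(max_chunk_size=800, overlap_sentences=0)
+ chunks = chunker.chunk_text(long_text)         # sentence/paragraph-respecting chunks
+ print(chunker.get_chunk_info(chunks))          # total_chunks, avg_chunk_size, ...
+
+ # Stand-in for per-chunk TTS inference: one second of silence per chunk.
+ audio_chunks = [np.zeros(SAMPLE_RATE, dtype=np.float32) for _ in chunks]
+
+ concatenator = AudioConcatenator(silence_duration=0.5, fade_duration=0.1)
+ final_audio = concatenator.concatenate_audio_chunks(audio_chunks, SAMPLE_RATE)
+ print(f"{len(final_audio) / SAMPLE_RATE:.2f}s of concatenated audio")
+ ```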
52
  ### audio_utils.py
53
 
 
62
  - GPU-accelerated TTS model loading and inference
63
  - Multiple API endpoints for different use cases
64
  - Comprehensive error handling and validation
65
+ - New full-text processing endpoints with parallel chunk processing
66
+
67
+ ### test_api.py
68
+
69
+ - Comprehensive testing suite for all API endpoints
70
+ - Tests for basic generation, voice cloning, file uploads, and full-text processing
71
+ - Performance benchmarking and validation scripts
72
+
73
+ ## API Endpoints
74
+
75
+ ### Standard Endpoints
76
+
77
+ #### `GET /health`
78
+
79
+ Health check endpoint to verify model status and service availability.
80
+
81
+ ```bash
82
+ curl -X GET "YOUR-ENDPOINT/health"
83
+ ```
84
+
85
+ #### `POST /generate_audio`
86
+
87
+ Generate speech audio from text with optional voice cloning (streaming response).
88
+
89
+ ```bash
90
+ curl -X POST "YOUR-ENDPOINT/generate_audio" \
91
+ -H "Content-Type: application/json" \
92
+ -d '{"text": "Hello world!"}' \
93
+ --output output.wav
94
+ ```
95
+
96
+ #### `POST /generate_json`
97
+
98
+ Generate speech and return JSON with base64 encoded audio.
99
+
100
+ ```bash
101
+ curl -X POST "YOUR-ENDPOINT/generate_json" \
102
+ -H "Content-Type: application/json" \
103
+ -d '{"text": "Hello world!"}'
104
+ ```
105
+
106
+ #### `POST /generate_with_file`
107
+
108
+ Generate speech with file upload for voice cloning.
109
+
110
+ ```bash
111
+ curl -X POST "YOUR-ENDPOINT/generate_with_file" \
112
+ -F "text=Hello world!" \
113
+ -F "voice_prompt=@voice_sample.wav" \
114
+ --output output.wav
115
+ ```
116
+
117
+ ### Enhanced Full-Text Endpoints
118
+
119
+ #### `POST /generate_full_text_audio`
120
+
121
+ 🆕 Generate speech from full text with server-side chunking and parallel processing.
122
+
123
+ ```bash
124
+ curl -X POST "YOUR-ENDPOINT/generate_full_text_audio" \
125
+ -H "Content-Type: application/json" \
126
+ -d '{
127
+ "text": "Your very long text here...",
128
+ "max_chunk_size": 800,
129
+ "silence_duration": 0.5,
130
+ "fade_duration": 0.1,
131
+ "overlap_sentences": 0
132
+ }' \
133
+ --output full_text_output.wav
134
+ ```
135
+
136
+ #### `POST /generate_full_text_json`
137
+
138
+ 🆕 Generate speech from full text and return JSON with processing information.
139
+
140
+ ```bash
141
+ curl -X POST "YOUR-ENDPOINT/generate_full_text_json" \
142
+ -H "Content-Type: application/json" \
143
+ -d '{
144
+ "text": "Your very long text here...",
145
+ "max_chunk_size": 800,
146
+ "silence_duration": 0.5
147
+ }'
148
+ ```
149
+
150
+ ### Legacy Endpoints
151
+
152
+ #### `POST /generate`
153
+
154
+ Legacy endpoint for backward compatibility.
155
+
156
+ ```bash
157
+ curl -X POST "YOUR-ENDPOINT/generate?prompt=Hello%20world!" \
158
+ --output legacy_output.wav
159
+ ```
160
+
161
+ ## Request Parameters
162
+
163
+ ### FullTextTTSRequest Parameters
164
+
165
+ - **`text`** (required): The text to convert to speech (any length)
166
+ - **`voice_prompt_base64`** (optional): Base64 encoded voice prompt for cloning
167
+ - **`max_chunk_size`** (optional, default: 800): Maximum characters per chunk
168
+ - **`silence_duration`** (optional, default: 0.5): Silence between chunks in seconds
169
+ - **`fade_duration`** (optional, default: 0.1): Fade in/out duration in seconds
170
+ - **`overlap_sentences`** (optional, default: 0): Sentences to overlap between chunks
171
+
172
+ ## Response Headers
173
+
174
+ Enhanced endpoints include additional headers with processing information:
175
+
176
+ - **`X-Audio-Duration`**: Duration of generated audio in seconds
177
+ - **`X-Chunks-Processed`**: Number of text chunks processed
178
+ - **`X-Total-Characters`**: Total characters in the input text
179
 
180
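+ A small sketch (the endpoint URL is a placeholder) of reading these headers after a full-text request:
+
+ ```python
+ import requests
+
+ resp = requests.post(
+     "https://YOUR-MODAL-ENDPOINT.modal.run/generate_full_text_audio",
+     json={"text": "A few paragraphs of text...", "max_chunk_size": 800},
+     timeout=180,
+ )
+ resp.raise_for_status()
+ print("duration (s):", resp.headers.get("X-Audio-Duration"))
+ print("chunks:", resp.headers.get("X-Chunks-Processed"))
+ print("characters:", resp.headers.get("X-Total-Characters"))
+ ```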
  ## Usage
181
 
 
186
  # The service class contains all the endpoints
187
  ```
188
 
189
+ ### Python Client Example
190
+
191
+ ```python
192
+ import requests
193
+
194
+ # Generate audio from long text
195
+ response = requests.post(
196
+ "YOUR-ENDPOINT/generate_full_text_audio",
197
+ json={
198
+ "text": "Your long document text here...",
199
+ "max_chunk_size": 800,
200
+ "silence_duration": 0.5
201
+ }
202
+ )
203
+
204
+ if response.status_code == 200:
205
+ with open("output.wav", "wb") as f:
206
+ f.write(response.content)
207
+ print("Audio generated successfully!")
208
+ ```
209
+
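+ For the JSON variant documented above, the audio arrives base64-encoded in the response body; a small companion sketch (same placeholder endpoint style) decodes and saves it:
+
+ ```python
+ import base64
+ import requests
+
+ response = requests.post(
+     "YOUR-ENDPOINT/generate_full_text_json",
+     json={"text": "Your long document text here...", "max_chunk_size": 800},
+     timeout=180,
+ )
+ data = response.json()
+ if data["success"]:
+     with open("output.wav", "wb") as f:
+         f.write(base64.b64decode(data["audio_base64"]))
+     print(f"Duration: {data['duration_seconds']:.2f}s")
+     print(data.get("processing_info"))  # chunk counts and other processing details
+ ```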
210
+ ## Performance Characteristics
211
+
212
+ ### Standard Processing
213
+
214
+ - **Text Length**: Optimal for texts up to ~1000 characters
215
+ - **Processing Time**: ~2-5 seconds per request
216
+ - **Use Case**: Short texts, real-time applications
217
+
218
+ ### Full-Text Processing
219
+
220
+ - **Text Length**: Unlimited (automatically chunked)
221
+ - **Processing Time**: ~5-15 seconds for long documents
222
+ - **Parallelization**: Up to 4 concurrent chunks
223
+ - **Use Case**: Documents, articles, books
224
+
225
+ ## Deployment
226
+
227
+ ```bash
228
+ # Deploy the enhanced API
229
+ modal deploy tts_service.py
230
+
231
+ # Test the deployment
232
+ python test_api.py
233
+ ```
234
+
236
+
237
+ ## Benefits of Enhanced Architecture
238
+
239
+ 1. **GPU Acceleration**: Server-side processing leverages GPU resources for faster inference
240
+ 2. **Intelligent Chunking**: Smart text splitting that preserves sentence integrity
241
+ 3. **Parallel Processing**: Multiple chunks processed simultaneously for better performance
242
+ 4. **Scalability**: Handles texts of any length without client-side limitations
243
+ 5. **Separation of Concerns**: Each file has a specific responsibility
244
+ 6. **Maintainability**: Easier to update and modify individual components
245
+ 7. **Testability**: Components can be tested in isolation
246
+ 8. **Reusability**: Components can be imported and used in other projects
247
+ 9. **Readability**: Smaller files are easier to understand and navigate
248
 
249
+ ## Testing
250
+
251
+ Run the comprehensive test suite:
252
+
253
+ ```bash
254
+ cd api/
255
+ python test_api.py
256
+ ```
257
+
258
+ The test suite includes:
259
+
260
+ - Health check validation
261
+ - Basic text-to-speech generation
262
+ - JSON response testing
263
+ - Voice cloning functionality
264
+ - File upload testing
265
+ - Full-text processing validation
266
+ - Performance benchmarking
267
+
268
+ ## Environment Variables
269
+
270
+ Set these environment variables for testing:
271
+
272
+ ```bash
273
+ HEALTH_ENDPOINT=https://your-modal-endpoint.modal.run/health
274
+ GENERATE_AUDIO_ENDPOINT=https://your-modal-endpoint.modal.run/generate_audio
275
+ GENERATE_JSON_ENDPOINT=https://your-modal-endpoint.modal.run/generate_json
276
+ GENERATE_WITH_FILE_ENDPOINT=https://your-modal-endpoint.modal.run/generate_with_file
277
+ GENERATE_ENDPOINT=https://your-modal-endpoint.modal.run/generate
278
+ FULL_TEXT_TTS_ENDPOINT=https://your-modal-endpoint.modal.run/generate_full_text_audio
279
+ FULL_TEXT_JSON_ENDPOINT=https://your-modal-endpoint.modal.run/generate_full_text_json
280
+ ```
api/audio_concatenator.py ADDED
@@ -0,0 +1,196 @@
1
+ """Audio processing utilities for the TTS API."""
2
+
3
+ import re
4
+ from typing import List
5
+
6
+
7
+ class AudioConcatenator:
8
+ """Server-side audio concatenation with GPU acceleration."""
9
+
10
+ def __init__(self, silence_duration: float = 0.5, fade_duration: float = 0.1):
11
+ """
12
+ Initialize the audio concatenator.
13
+
14
+ Args:
15
+ silence_duration: Duration of silence between chunks (seconds)
16
+ fade_duration: Duration of fade in/out effects (seconds)
17
+ """
18
+ self.silence_duration = silence_duration
19
+ self.fade_duration = fade_duration
20
+
21
+ def concatenate_audio_chunks(self, audio_chunks: List, sample_rate: int):
22
+ """
23
+ Concatenate multiple audio chunks into a single audio file.
24
+
25
+ Args:
26
+ audio_chunks: List of audio arrays
27
+ sample_rate: Sample rate for the audio
28
+
29
+ Returns:
30
+ Concatenated audio array
31
+ """
32
+ if not audio_chunks:
33
+ raise ValueError("No audio chunks to concatenate")
34
+
35
+ if len(audio_chunks) == 1:
36
+ # Handle single chunk case
37
+ audio = audio_chunks[0]
38
+ if isinstance(audio, tuple):
39
+ return audio[0] # Extract audio data from tuple
40
+ return audio
41
+
42
+ import numpy as np
43
+ import torch
44
+
45
+ # Normalize and prepare audio data
46
+ normalized_chunks = []
47
+ for i, audio_data in enumerate(audio_chunks):
48
+ print(f"Processing chunk {i}: type={type(audio_data)}")
49
+
50
+ # Handle tuple format (common from TTS models)
51
+ if isinstance(audio_data, tuple):
52
+ audio_data = audio_data[0] # Extract audio array from tuple
53
+ print(f" Extracted from tuple: type={type(audio_data)}")
54
+
55
+ # Convert torch tensor to numpy if needed
56
+ if hasattr(audio_data, 'cpu'): # It's a torch tensor
57
+ audio_data = audio_data.cpu().numpy()
58
+ print(f" Converted from torch: shape={audio_data.shape}")
59
+
60
+ # Convert to numpy array if needed
61
+ if not isinstance(audio_data, np.ndarray):
62
+ audio_data = np.array(audio_data)
63
+
64
+ print(f" Final shape before processing: {audio_data.shape}")
65
+
66
+ # Handle different audio shapes
67
+ if audio_data.ndim == 1:
68
+ # Already 1D, perfect
69
+ normalized_audio = audio_data
70
+ elif audio_data.ndim == 2:
71
+ # Handle 2D audio - could be (channels, samples) or (samples, channels)
72
+ if audio_data.shape[0] < audio_data.shape[1]:
73
+ # Likely (channels, samples) - take first channel
74
+ normalized_audio = audio_data[0, :]
75
+ print(f" Used first channel from (C, L) format: {normalized_audio.shape}")
76
+ else:
77
+ # Likely (samples, channels) - take first channel
78
+ normalized_audio = audio_data[:, 0]
79
+ print(f" Used first channel from (L, C) format: {normalized_audio.shape}")
80
+ else:
81
+ # Flatten higher dimensional arrays
82
+ normalized_audio = audio_data.flatten()
83
+ print(f" Flattened {audio_data.ndim}D array: {normalized_audio.shape}")
84
+
85
+ # Ensure we have valid audio data
86
+ if len(normalized_audio) == 0:
87
+ print(f" Warning: Empty audio chunk {i}")
88
+ continue
89
+
90
+ print(f" Chunk {i} final length: {len(normalized_audio)} samples ({len(normalized_audio)/sample_rate:.2f}s)")
91
+
92
+ # Normalize audio levels
93
+ normalized_audio = self._normalize_audio(normalized_audio)
94
+
95
+ # Apply fade effects
96
+ normalized_audio = self._apply_fade_effects(normalized_audio, sample_rate)
97
+
98
+ normalized_chunks.append(normalized_audio)
99
+
100
+ if not normalized_chunks:
101
+ raise ValueError("No valid audio chunks after processing")
102
+
103
+ print(f"Successfully processed {len(normalized_chunks)} chunks")
104
+
105
+ # Create silence segments
106
+ silence_samples = int(self.silence_duration * sample_rate)
107
+ silence = np.zeros(silence_samples, dtype=np.float32)
108
+ print(f"Adding {silence_samples} silence samples ({self.silence_duration}s) between chunks")
109
+
110
+ # Concatenate all chunks with silence in between
111
+ concatenated_segments = []
112
+ total_audio_length = 0
113
+
114
+ for i, chunk in enumerate(normalized_chunks):
115
+ concatenated_segments.append(chunk)
116
+ total_audio_length += len(chunk)
117
+ print(f"Added chunk {i}: {len(chunk)} samples")
118
+
119
+ # Add silence between chunks (but not after the last chunk)
120
+ if i < len(normalized_chunks) - 1:
121
+ concatenated_segments.append(silence)
122
+ total_audio_length += len(silence)
123
+ print(f"Added silence: {len(silence)} samples")
124
+
125
+ # Combine all segments
126
+ final_audio = np.concatenate(concatenated_segments)
127
+ print(f"Final concatenated audio: {len(final_audio)} samples ({len(final_audio)/sample_rate:.2f}s)")
128
+
129
+ # Final normalization and cleanup
130
+ final_audio = self._normalize_audio(final_audio)
131
+ final_audio = self._remove_clicks_and_pops(final_audio)
132
+
133
+ return final_audio
134
+
135
+ def _normalize_audio(self, audio_data):
136
+ """Normalize audio to prevent clipping."""
137
+ import numpy as np
138
+
139
+ # Convert to numpy array if it's not already
140
+ if not isinstance(audio_data, np.ndarray):
141
+ audio_data = np.array(audio_data)
142
+
143
+ # Ensure it's a 1D array
144
+ if audio_data.ndim > 1:
145
+ audio_data = audio_data.flatten()
146
+
147
+ # Find the maximum absolute value
148
+ max_val = np.max(np.abs(audio_data))
149
+
150
+ if max_val == 0:
151
+ return audio_data
152
+
153
+ # Normalize to 95% of maximum to leave some headroom
154
+ normalized = audio_data * (0.95 / max_val)
155
+
156
+ return normalized.astype(np.float32)
157
+
158
+ def _apply_fade_effects(self, audio_data, sample_rate: int):
159
+ """Apply fade in and fade out effects to reduce pops and clicks."""
160
+ import numpy as np
161
+
162
+ fade_samples = int(self.fade_duration * sample_rate)
163
+
164
+ if len(audio_data) < 2 * fade_samples:
165
+ # If audio is too short for fade effects, return as-is
166
+ return audio_data
167
+
168
+ audio_with_fades = audio_data.copy()
169
+
170
+ # Apply fade in
171
+ fade_in = np.linspace(0, 1, fade_samples)
172
+ audio_with_fades[:fade_samples] *= fade_in
173
+
174
+ # Apply fade out
175
+ fade_out = np.linspace(1, 0, fade_samples)
176
+ audio_with_fades[-fade_samples:] *= fade_out
177
+
178
+ return audio_with_fades
179
+
180
+ def _remove_clicks_and_pops(self, audio_data):
181
+ """Apply basic filtering to remove clicks and pops."""
182
+ try:
183
+ # Simple high-pass filter to remove DC offset and low-frequency artifacts
184
+ from scipy import signal
185
+ import numpy as np
186
+
187
+ # Design a high-pass filter (removes frequencies below 80 Hz)
188
+ # This helps remove some pops and clicks while preserving speech
189
+ nyquist = 22050 / 2 # Assuming common sample rate
190
+ low = 80 / nyquist
191
+ b, a = signal.butter(4, low, btype='high')
192
+ filtered_audio = signal.filtfilt(b, a, audio_data)
193
+ return filtered_audio.astype(np.float32)
194
+ except ImportError:
195
+ # If scipy is not available, return original audio
196
+ return audio_data
api/demo.py ADDED
@@ -0,0 +1,177 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Quick demonstration script for the Enhanced Chatterbox TTS API
4
+ Shows how to use the new full-text endpoints for processing long documents
5
+ """
6
+
7
+ import requests
8
+ import os
9
+ from pathlib import Path
10
+ from dotenv import load_dotenv
11
+
12
+ # Load environment variables
13
+ load_dotenv()
14
+
15
+ def demo_full_text_processing():
16
+ """Demonstrate full-text processing with a sample document"""
17
+
18
+ # Sample long text (like from a PDF)
19
+ sample_document = """
20
+ Artificial Intelligence has revolutionized numerous industries and continues to shape our world in unprecedented ways. From healthcare to transportation, AI systems are becoming increasingly sophisticated and capable of performing complex tasks that were once thought to be exclusively human domains.
21
+
22
+ In healthcare, AI-powered diagnostic systems can now identify diseases with remarkable accuracy, sometimes surpassing human doctors in specific areas. Machine learning algorithms analyze medical images, predict patient outcomes, and assist in drug discovery processes. This technological advancement has the potential to make healthcare more accessible and effective globally.
23
+
24
+ The transportation sector has also witnessed significant AI integration. Autonomous vehicles use computer vision, sensor fusion, and deep learning to navigate complex environments safely. These systems process vast amounts of real-time data to make split-second decisions, potentially reducing traffic accidents and improving transportation efficiency.
25
+
26
+ However, with these advancements come important ethical considerations. Issues of privacy, job displacement, and algorithmic bias must be carefully addressed as AI systems become more prevalent in society. It is crucial that we develop AI responsibly, ensuring that these powerful technologies benefit humanity while minimizing potential risks.
27
+
28
+ The future of AI holds immense promise, but it requires thoughtful implementation and continuous oversight to ensure that its development aligns with human values and societal needs.
29
+ """
30
+
31
+ endpoint = os.getenv("FULL_TEXT_TTS_ENDPOINT")
32
+ if not endpoint:
33
+ print("❌ FULL_TEXT_TTS_ENDPOINT not configured")
34
+ print("Please set the environment variable or update your .env file")
35
+ return False
36
+
37
+ print("🎙️ Enhanced Chatterbox TTS API Demo")
38
+ print("=" * 50)
39
+ print(f"Processing document ({len(sample_document)} characters)...")
40
+
41
+ try:
42
+ # Send request to full-text endpoint
43
+ response = requests.post(
44
+ endpoint,
45
+ json={
46
+ "text": sample_document.strip(),
47
+ "max_chunk_size": 600, # Smaller chunks for better processing
48
+ "silence_duration": 0.6, # Slightly longer pause between chunks
49
+ "fade_duration": 0.2, # Smooth transitions
50
+ "overlap_sentences": 1 # Overlap for better continuity
51
+ },
52
+ timeout=180 # Allow time for processing
53
+ )
54
+
55
+ if response.status_code == 200:
56
+ # Save the generated audio
57
+ Path("demo_output").mkdir(exist_ok=True)
58
+ output_file = "demo_output/ai_document_speech.wav"
59
+
60
+ with open(output_file, "wb") as f:
61
+ f.write(response.content)
62
+
63
+ # Extract processing information from headers
64
+ duration = response.headers.get('X-Audio-Duration', 'unknown')
65
+ chunks = response.headers.get('X-Chunks-Processed', 'unknown')
66
+ characters = response.headers.get('X-Total-Characters', 'unknown')
67
+
68
+ print("✅ Success! Audio generated and saved")
69
+ print(f"📁 File: {output_file}")
70
+ print(f"⏱️ Duration: {duration} seconds")
71
+ print(f"🧩 Chunks processed: {chunks}")
72
+ print(f"📝 Characters: {characters}")
73
+ print(f"💾 File size: {Path(output_file).stat().st_size / 1024:.1f} KB")
74
+
75
+ return True
76
+ else:
77
+ print(f"❌ Request failed with status {response.status_code}")
78
+ print(f"Response: {response.text}")
79
+ return False
80
+
81
+ except requests.exceptions.Timeout:
82
+ print("⏰ Request timed out - the document might be too long")
83
+ return False
84
+ except Exception as e:
85
+ print(f"❌ Error: {e}")
86
+ return False
87
+
88
+
89
+ def demo_comparison():
90
+ """Compare standard vs full-text processing"""
91
+
92
+ short_text = "This is a short text for comparison."
93
+ medium_text = """
94
+ This is a medium-length text that demonstrates the difference between
95
+ standard and full-text processing endpoints. The full-text endpoint
96
+ provides better handling for longer content with intelligent chunking
97
+ and server-side concatenation.
98
+ """
99
+
100
+ standard_endpoint = os.getenv("GENERATE_AUDIO_ENDPOINT")
101
+ fulltext_endpoint = os.getenv("FULL_TEXT_TTS_ENDPOINT")
102
+
103
+ if not (standard_endpoint and fulltext_endpoint):
104
+ print("⚠️ Missing endpoint configuration for comparison")
105
+ return False
106
+
107
+ print("\nπŸ” Comparison Demo")
108
+ print("=" * 30)
109
+
110
+ try:
111
+ import time
112
+
113
+ # Test standard endpoint
114
+ print("Testing standard endpoint...")
115
+ start_time = time.time()
116
+ response1 = requests.post(
117
+ standard_endpoint,
118
+ json={"text": short_text},
119
+ timeout=30
120
+ )
121
+ standard_time = time.time() - start_time
122
+
123
+ # Test full-text endpoint
124
+ print("Testing full-text endpoint...")
125
+ start_time = time.time()
126
+ response2 = requests.post(
127
+ fulltext_endpoint,
128
+ json={"text": medium_text.strip(), "max_chunk_size": 400},
129
+ timeout=60
130
+ )
131
+ fulltext_time = time.time() - start_time
132
+
133
+ print(f"\nπŸ“Š Results:")
134
+ print(f"Standard endpoint: {standard_time:.2f}s (short text)")
135
+ print(f"Full-text endpoint: {fulltext_time:.2f}s (medium text)")
136
+
137
+ if response2.status_code == 200:
138
+ chunks = response2.headers.get('X-Chunks-Processed', 'unknown')
139
+ print(f"Full-text chunks processed: {chunks}")
140
+
141
+ return True
142
+
143
+ except Exception as e:
144
+ print(f"❌ Comparison error: {e}")
145
+ return False
146
+
147
+
148
+ def main():
149
+ """Run the demonstration"""
150
+ print("πŸš€ Enhanced Chatterbox TTS API Demonstration")
151
+ print("This demo showcases the new full-text processing capabilities")
152
+ print()
153
+
154
+ # Check if .env file exists
155
+ if not Path(".env").exists():
156
+ print("πŸ“ Creating sample .env file...")
157
+ print("Please update it with your actual Modal endpoint URLs")
158
+
159
+ env_content = """# Enhanced Chatterbox TTS API Endpoints
160
+ FULL_TEXT_TTS_ENDPOINT=https://YOUR-MODAL-ENDPOINT.modal.run/generate_full_text_audio
161
+ GENERATE_AUDIO_ENDPOINT=https://YOUR-MODAL-ENDPOINT.modal.run/generate_audio
162
+ """
163
+ with open(".env", "w") as f:
164
+ f.write(env_content)
165
+ print("βœ… Sample .env file created")
166
+ return
167
+
168
+ # Run demonstrations
169
+ demo_full_text_processing()
170
+ demo_comparison()
171
+
172
+ print("\nπŸŽ‰ Demo complete!")
173
+ print("Check the demo_output/ directory for generated audio files")
174
+
175
+
176
+ if __name__ == "__main__":
177
+ main()
api/models.py CHANGED
@@ -4,24 +4,43 @@ Pydantic models for request/response validation and API documentation.
4
  """
5
 
6
  from typing import Optional
7
- from pydantic import BaseModel
8
 
9
 
10
  class TTSRequest(BaseModel):
11
  """Request model for TTS generation with optional voice cloning."""
12
- text: str
13
- voice_prompt_base64: Optional[str] = None # Base64 encoded audio file
14
 
15
 
16
  class TTSResponse(BaseModel):
17
  """Response model for TTS generation with JSON output."""
18
- success: bool
19
- message: str
20
- audio_base64: Optional[str] = None # Base64 encoded audio response
21
- duration_seconds: Optional[float] = None
22
 
23
 
24
  class HealthResponse(BaseModel):
25
  """Response model for health check endpoint."""
26
- status: str
27
- model_loaded: bool
 
4
  """
5
 
6
  from typing import Optional
7
+ from pydantic import BaseModel, Field
8
 
9
 
10
  class TTSRequest(BaseModel):
11
  """Request model for TTS generation with optional voice cloning."""
12
+ text: str = Field(..., description="Text to convert to speech", max_length=5000)
13
+ voice_prompt_base64: Optional[str] = Field(None, description="Base64 encoded voice prompt audio")
14
+
15
+
16
+ class FullTextTTSRequest(BaseModel):
17
+ """Request model for full-text TTS generation with server-side processing."""
18
+ text: str = Field(..., description="Full text to convert to speech (any length)")
19
+ voice_prompt_base64: Optional[str] = Field(None, description="Base64 encoded voice prompt audio")
20
+ max_chunk_size: Optional[int] = Field(800, description="Maximum characters per chunk")
21
+ silence_duration: Optional[float] = Field(0.5, description="Silence duration between chunks (seconds)")
22
+ fade_duration: Optional[float] = Field(0.1, description="Fade in/out duration (seconds)")
23
+ overlap_sentences: Optional[int] = Field(0, description="Number of sentences to overlap between chunks")
24
 
25
 
26
  class TTSResponse(BaseModel):
27
  """Response model for TTS generation with JSON output."""
28
+ success: bool = Field(..., description="Whether the request was successful")
29
+ message: str = Field(..., description="Status message")
30
+ audio_base64: Optional[str] = Field(None, description="Base64 encoded audio data")
31
+ duration_seconds: Optional[float] = Field(None, description="Duration of generated audio in seconds")
32
+
33
+
34
+ class FullTextTTSResponse(BaseModel):
35
+ """Response model for full-text TTS generation."""
36
+ success: bool = Field(..., description="Whether the request was successful")
37
+ message: str = Field(..., description="Status message")
38
+ audio_base64: Optional[str] = Field(None, description="Base64 encoded audio data")
39
+ duration_seconds: Optional[float] = Field(None, description="Duration of generated audio in seconds")
40
+ processing_info: Optional[dict] = Field(None, description="Information about the processing (chunks, etc.)")
41
 
42
 
43
  class HealthResponse(BaseModel):
44
  """Response model for health check endpoint."""
45
+ status: str = Field(..., description="Service status")
46
+ model_loaded: bool = Field(..., description="Whether the TTS model is loaded")
api/test_api.py ADDED
@@ -0,0 +1,455 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Test script for the enhanced Chatterbox TTS Modal API
4
+ This script demonstrates how to interact with all the new endpoints
5
+ """
6
+
7
+ import requests
8
+ import base64
9
+ import json
10
+ import os
11
+ from pathlib import Path
12
+ from dotenv import load_dotenv
13
+
14
+ # Load environment variables from .env file
15
+ load_dotenv()
16
+
17
+ # Base URLs for the deployed endpoints
18
+ ENDPOINTS = {
19
+ "health": os.getenv("HEALTH_ENDPOINT"),
20
+ "generate_audio": os.getenv("GENERATE_AUDIO_ENDPOINT"),
21
+ "generate_json": os.getenv("GENERATE_JSON_ENDPOINT"),
22
+ "generate_with_file": os.getenv("GENERATE_WITH_FILE_ENDPOINT"),
23
+ "generate": os.getenv("GENERATE_ENDPOINT"),
24
+ "generate_full_text_audio": os.getenv("GENERATE_FULL_TEXT_AUDIO_ENDPOINT"),
25
+ "generate_full_text_json": os.getenv("GENERATE_FULL_TEXT_JSON_ENDPOINT")
26
+ }
27
+
28
+ def test_health_check():
29
+ """Test the health check endpoint"""
30
+ print("Testing health check...")
31
+ try:
32
+ response = requests.get(ENDPOINTS["health"])
33
+ print(f"Status: {response.status_code}")
34
+ print(f"Response: {response.json()}")
35
+ return response.status_code == 200
36
+ except Exception as e:
37
+ print(f"Health check failed: {e}")
38
+ return False
39
+
40
+ def test_basic_generation():
41
+ """Test basic text-to-speech generation"""
42
+ print("\nTesting basic audio generation...")
43
+ try:
44
+ response = requests.post(
45
+ ENDPOINTS["generate_audio"],
46
+ json={"text": "Hello, this is Chatterbox TTS running on Modal!"}
47
+ )
48
+ if response.status_code == 200:
49
+ Path("output").mkdir(exist_ok=True)
50
+ with open("output/basic_output.wav", "wb") as f:
51
+ f.write(response.content)
52
+ print("βœ“ Basic generation successful - saved as output/basic_output.wav")
53
+ return True
54
+ else:
55
+ print(f"βœ— Basic generation failed: {response.status_code}")
56
+ print(f"Response: {response.text}")
57
+ return False
58
+ except Exception as e:
59
+ print(f"βœ— Basic generation error: {e}")
60
+ return False
61
+
62
+ def test_json_generation():
63
+ """Test JSON response with base64 audio"""
64
+ print("\nTesting JSON audio generation...")
65
+ try:
66
+ response = requests.post(
67
+ ENDPOINTS["generate_json"],
68
+ json={"text": "This returns JSON with base64 audio data"}
69
+ )
70
+ if response.status_code == 200:
71
+ data = response.json()
72
+ if data['success'] and data['audio_base64']:
73
+ # Decode base64 audio and save
74
+ Path("output").mkdir(exist_ok=True)
75
+ audio_data = base64.b64decode(data['audio_base64'])
76
+ with open("output/json_output.wav", "wb") as f:
77
+ f.write(audio_data)
78
+ print(f"βœ“ JSON generation successful - Duration: {data['duration_seconds']:.2f}s")
79
+ print(" Saved as output/json_output.wav")
80
+ return True
81
+ else:
82
+ print(f"βœ— JSON generation failed: {data['message']}")
83
+ return False
84
+ else:
85
+ print(f"βœ— JSON generation failed: {response.status_code}")
86
+ print(f"Response: {response.text}")
87
+ return False
88
+ except Exception as e:
89
+ print(f"βœ— JSON generation error: {e}")
90
+ return False
91
+
92
+ def test_voice_cloning():
93
+ """Test voice cloning with audio prompt"""
94
+ print("\nTesting voice cloning...")
95
+
96
+ # First, check if we have a sample audio file
97
+ sample_file = Path("voice_sample.wav")
98
+ if not sample_file.exists():
99
+ print("⚠ No voice_sample.wav found - skipping voice cloning test")
100
+ print(" To test voice cloning, add a voice_sample.wav file")
101
+ return True
102
+
103
+ try:
104
+ # Read the voice sample and encode as base64
105
+ with open(sample_file, "rb") as f:
106
+ voice_data = base64.b64encode(f.read()).decode()
107
+
108
+ response = requests.post(
109
+ ENDPOINTS["generate_audio"],
110
+ json={
111
+ "text": "This should sound like the provided voice sample!",
112
+ "voice_prompt_base64": voice_data
113
+ }
114
+ )
115
+
116
+ if response.status_code == 200:
117
+ Path("output").mkdir(exist_ok=True)
118
+ with open("output/cloned_output.wav", "wb") as f:
119
+ f.write(response.content)
120
+ print("βœ“ Voice cloning successful - saved as output/cloned_output.wav")
121
+ return True
122
+ else:
123
+ print(f"βœ— Voice cloning failed: {response.status_code}")
124
+ print(f"Response: {response.text}")
125
+ return False
126
+ except Exception as e:
127
+ print(f"βœ— Voice cloning error: {e}")
128
+ return False
129
+
130
+ def test_file_upload():
131
+ """Test file upload endpoint"""
132
+ print("\nTesting file upload...")
133
+
134
+ sample_file = Path("voice_sample.wav")
135
+ if not sample_file.exists():
136
+ print("⚠ No voice_sample.wav found - testing without voice prompt")
137
+ files = None
138
+ else:
139
+ files = {"voice_prompt": open(sample_file, "rb")}
140
+
141
+ try:
142
+ data = {"text": "Testing the file upload endpoint!"}
143
+ response = requests.post(ENDPOINTS["generate_with_file"], data=data, files=files)
144
+
145
+ if files:
146
+ files["voice_prompt"].close()
147
+
148
+ if response.status_code == 200:
149
+ Path("output").mkdir(exist_ok=True)
150
+ with open("output/upload_output.wav", "wb") as f:
151
+ f.write(response.content)
152
+ print("βœ“ File upload successful - saved as output/upload_output.wav")
153
+ return True
154
+ else:
155
+ print(f"βœ— File upload failed: {response.status_code}")
156
+ print(f"Response: {response.text}")
157
+ return False
158
+ except Exception as e:
159
+ print(f"βœ— File upload error: {e}")
160
+ return False
161
+
162
+ def test_legacy_endpoint():
163
+ """Test backward compatibility with legacy endpoint"""
164
+ print("\nTesting legacy endpoint...")
165
+ try:
166
+ # Legacy endpoint expects query parameters, not form data
167
+ response = requests.post(
168
+ ENDPOINTS["generate"],
169
+ params={"prompt": "Testing the legacy endpoint for backward compatibility"}
170
+ )
171
+ if response.status_code == 200:
172
+ Path("output").mkdir(exist_ok=True)
173
+ with open("output/legacy_output.wav", "wb") as f:
174
+ f.write(response.content)
175
+ print("βœ“ Legacy endpoint successful - saved as output/legacy_output.wav")
176
+ return True
177
+ else:
178
+ print(f"βœ— Legacy endpoint failed: {response.status_code}")
179
+ print(f"Response: {response.text}")
180
+ return False
181
+ except Exception as e:
182
+ print(f"βœ— Legacy endpoint error: {e}")
183
+ return False
184
+
185
+ def test_full_text_generation():
186
+ """Test full-text audio generation with server-side chunking"""
187
+ print("\nTesting full-text audio generation...")
188
+
189
+ # Create a long text that will require chunking
190
+ long_text = """
191
+ This is a comprehensive test of the full-text audio generation endpoint.
192
+ The text is intentionally long to demonstrate the server-side chunking capabilities.
193
+
194
+ The enhanced API will automatically split this text into appropriate chunks,
195
+ process them in parallel using GPU acceleration, and then concatenate the
196
+ resulting audio segments with proper transitions and fade effects.
197
+
198
+ This approach significantly improves performance for long documents while
199
+ maintaining high audio quality and natural speech flow. The server handles
200
+ all the complex processing, allowing the client to simply send the full text
201
+ and receive the final audio file.
202
+
203
+ The chunking algorithm respects sentence and paragraph boundaries to ensure
204
+ natural speech patterns and maintains proper context across chunk boundaries.
205
+ This results in more natural-sounding speech for long-form content.
206
+ """
207
+
208
+ try:
209
+ if not ENDPOINTS["generate_full_text_audio"]:
210
+ print("⚠ FULL_TEXT_TTS_ENDPOINT not configured - skipping full-text test")
211
+ return True
212
+
213
+ response = requests.post(
214
+ ENDPOINTS["generate_full_text_audio"],
215
+ json={
216
+ "text": long_text.strip(),
217
+ "max_chunk_size": 400, # Smaller chunks for testing
218
+ "silence_duration": 0.3,
219
+ "fade_duration": 0.1,
220
+ "overlap_sentences": 0
221
+ },
222
+ timeout=120 # Longer timeout for processing
223
+ )
224
+
225
+ if response.status_code == 200:
226
+ Path("output").mkdir(exist_ok=True)
227
+ with open("output/full_text_output.wav", "wb") as f:
228
+ f.write(response.content)
229
+
230
+ # Check response headers for processing info
231
+ duration = response.headers.get('X-Audio-Duration', 'unknown')
232
+ chunks = response.headers.get('X-Chunks-Processed', 'unknown')
233
+ characters = response.headers.get('X-Total-Characters', len(long_text))
234
+
235
+ print(f"βœ“ Full-text generation successful")
236
+ print(f" Duration: {duration}s")
237
+ print(f" Chunks processed: {chunks}")
238
+ print(f" Characters: {characters}")
239
+ print(" Saved as output/full_text_output.wav")
240
+ return True
241
+ else:
242
+ print(f"βœ— Full-text generation failed: {response.status_code}")
243
+ print(f"Response: {response.text}")
244
+ return False
245
+ except requests.exceptions.Timeout:
246
+ print("βœ— Full-text generation timed out (this may be normal for very long texts)")
247
+ return False
248
+ except Exception as e:
249
+ print(f"βœ— Full-text generation error: {e}")
250
+ return False
251
+
252
+
253
+ def test_full_text_json():
254
+ """Test full-text JSON response with processing information"""
255
+ print("\nTesting full-text JSON response...")
256
+
257
+ test_text = """
258
+ This is a test of the full-text JSON endpoint that returns detailed
259
+ processing information along with the base64 encoded audio data.
260
+
261
+ The response includes chunk information, processing parameters,
262
+ and timing details that can be useful for monitoring and debugging.
263
+ """
264
+
265
+ try:
266
+ if not ENDPOINTS["generate_full_text_json"]:
267
+ print("⚠ FULL_TEXT_JSON_ENDPOINT not configured - skipping test")
268
+ return True
269
+
270
+ response = requests.post(
271
+ ENDPOINTS["generate_full_text_json"],
272
+ json={
273
+ "text": test_text.strip(),
274
+ "max_chunk_size": 300,
275
+ "silence_duration": 0.4,
276
+ "fade_duration": 0.15
277
+ },
278
+ timeout=60
279
+ )
280
+
281
+ if response.status_code == 200:
282
+ data = response.json()
283
+ if data['success'] and data['audio_base64']:
284
+ # Decode and save audio
285
+ Path("output").mkdir(exist_ok=True)
286
+ audio_data = base64.b64decode(data['audio_base64'])
287
+ with open("output/full_text_json_output.wav", "wb") as f:
288
+ f.write(audio_data)
289
+
290
+ # Display processing information
291
+ print(f"βœ“ Full-text JSON generation successful")
292
+ print(f" Duration: {data['duration_seconds']:.2f}s")
293
+
294
+ if 'processing_info' in data:
295
+ info = data['processing_info']
296
+ if 'chunk_info' in info:
297
+ chunk_info = info['chunk_info']
298
+ print(f" Chunks: {chunk_info.get('total_chunks', 'unknown')}")
299
+ print(f" Characters: {chunk_info.get('total_characters', 'unknown')}")
300
+ print(f" Avg chunk size: {chunk_info.get('avg_chunk_size', 'unknown'):.0f}")
301
+
302
+ print(" Saved as output/full_text_json_output.wav")
303
+ return True
304
+ else:
305
+ print(f"βœ— Full-text JSON generation failed: {data['message']}")
306
+ return False
307
+ else:
308
+ print(f"βœ— Full-text JSON generation failed: {response.status_code}")
309
+ print(f"Response: {response.text}")
310
+ return False
311
+ except Exception as e:
312
+ print(f"βœ— Full-text JSON generation error: {e}")
313
+ return False
314
+
315
+
316
+ def test_performance_comparison():
317
+ """Compare performance between standard and full-text endpoints"""
318
+ print("\nTesting performance comparison...")
319
+
320
+ # Short text for standard endpoint
321
+ short_text = "This is a short text for performance comparison testing."
322
+
323
+ # Medium text that benefits from chunking
324
+ medium_text = """
325
+ This is a medium-length text designed to test the performance differences
326
+ between the standard endpoint and the enhanced full-text endpoint.
327
+
328
+ The full-text endpoint should show its advantages when processing longer
329
+ texts that require intelligent chunking and parallel processing.
330
+
331
+ This text is long enough to require multiple chunks but not so long
332
+ that it becomes unwieldy for testing purposes.
333
+ """
334
+
335
+ results = {}
336
+
337
+ try:
338
+ # Test standard endpoint with short text
339
+ import time
340
+ start_time = time.time()
341
+ response = requests.post(
342
+ ENDPOINTS["generate_audio"],
343
+ json={"text": short_text},
344
+ timeout=30
345
+ )
346
+ if response.status_code == 200:
347
+ results['standard_short'] = time.time() - start_time
348
+ print(f"βœ“ Standard endpoint (short): {results['standard_short']:.2f}s")
349
+
350
+ # Test full-text endpoint with medium text
351
+ if ENDPOINTS["generate_full_text_audio"]:
352
+ start_time = time.time()
353
+ response = requests.post(
354
+ ENDPOINTS["generate_full_text_audio"],
355
+ json={
356
+ "text": medium_text.strip(),
357
+ "max_chunk_size": 300
358
+ },
359
+ timeout=60
360
+ )
361
+ if response.status_code == 200:
362
+ results['fulltext_medium'] = time.time() - start_time
363
+ chunks = response.headers.get('X-Chunks-Processed', 'unknown')
364
+ print(f"βœ“ Full-text endpoint (medium, {chunks} chunks): {results['fulltext_medium']:.2f}s")
365
+
366
+ # Summary
367
+ if results:
368
+ print(" Performance comparison complete!")
369
+ return True
370
+ else:
371
+ print(" Could not complete performance comparison")
372
+ return False
373
+
374
+ except Exception as e:
375
+ print(f"βœ— Performance comparison error: {e}")
376
+ return False
377
+
378
+ def main():
379
+ """Run all tests"""
380
+ print("Enhanced Chatterbox TTS API Test Suite")
381
+ print("=" * 50)
382
+
383
+ # Check if required endpoints are configured
384
+ missing_endpoints = [name for name, url in ENDPOINTS.items() if not url]
385
+ if missing_endpoints:
386
+ print("⚠ Warning: Some endpoints not configured:")
387
+ for endpoint in missing_endpoints:
388
+ print(f" {endpoint}")
389
+ print(" Set environment variables in .env file")
390
+ print()
391
+
392
+ tests = [
393
+ test_health_check,
394
+ test_basic_generation,
395
+ test_json_generation,
396
+ test_voice_cloning,
397
+ test_file_upload,
398
+ test_legacy_endpoint,
399
+ test_full_text_generation,
400
+ test_performance_comparison
401
+ ]
402
+
403
+ results = []
404
+ for test in tests:
405
+ results.append(test())
406
+
407
+ print("\n" + "=" * 50)
408
+ print("Test Results:")
409
+ passed = sum(results)
410
+ total = len(results)
411
+ print(f"βœ“ {passed}/{total} tests passed")
412
+
413
+ if passed == total:
414
+ print("πŸŽ‰ All tests passed!")
415
+ print("\nGenerated files in output/ directory:")
416
+ output_dir = Path("output")
417
+ if output_dir.exists():
418
+ for file in output_dir.glob("*.wav"):
419
+ size_kb = file.stat().st_size / 1024
420
+ print(f" {file.name} ({size_kb:.1f} KB)")
421
+ else:
422
+ print("⚠ Some tests failed - check your Modal deployment")
423
+
424
+ print(f"\nAPI Endpoints tested:")
425
+ for name, url in ENDPOINTS.items():
426
+ status = "βœ“" if url else "βœ—"
427
+ print(f" {status} {name}: {url or 'Not configured'}")
428
+
429
+
430
+ def create_sample_env_file():
431
+ """Create a sample .env file with endpoint placeholders"""
432
+ env_content = """# Enhanced Chatterbox TTS API Endpoints
433
+ # Replace YOUR-MODAL-ENDPOINT with your actual Modal deployment URL
434
+
435
+ HEALTH_ENDPOINT=https://YOUR-MODAL-ENDPOINT.modal.run/health
436
+ GENERATE_AUDIO_ENDPOINT=https://YOUR-MODAL-ENDPOINT.modal.run/generate_audio
437
+ GENERATE_JSON_ENDPOINT=https://YOUR-MODAL-ENDPOINT.modal.run/generate_json
438
+ GENERATE_WITH_FILE_ENDPOINT=https://YOUR-MODAL-ENDPOINT.modal.run/generate_with_file
439
+ GENERATE_ENDPOINT=https://YOUR-MODAL-ENDPOINT.modal.run/generate
440
+
441
+ # New enhanced endpoints
442
+ FULL_TEXT_TTS_ENDPOINT=https://YOUR-MODAL-ENDPOINT.modal.run/generate_full_text_audio
443
+ FULL_TEXT_JSON_ENDPOINT=https://YOUR-MODAL-ENDPOINT.modal.run/generate_full_text_json
444
+ """
445
+
446
+ if not Path(".env").exists():
447
+ with open(".env", "w") as f:
448
+ f.write(env_content)
449
+ print("Created sample .env file - please update with your actual endpoints")
450
+
451
+
452
+ if __name__ == "__main__":
453
+ # Create sample .env if it doesn't exist
454
+ create_sample_env_file()
455
+ main()
api/text_processing.py ADDED
@@ -0,0 +1,174 @@
1
+ """Text processing utilities for the TTS API."""
2
+
3
+ import re
4
+ from typing import List
5
+
6
+
7
+ class TextChunker:
8
+ """Server-side text chunking for optimal GPU processing."""
9
+
10
+ def __init__(self, max_chunk_size: int = 800, overlap_sentences: int = 0):
11
+ """
12
+ Initialize the text chunker.
13
+
14
+ Args:
15
+ max_chunk_size: Maximum number of characters per chunk
16
+ overlap_sentences: Number of sentences to overlap between chunks for continuity
17
+ """
18
+ self.max_chunk_size = max_chunk_size
19
+ self.overlap_sentences = overlap_sentences
20
+
21
+ def chunk_text(self, text: str) -> List[str]:
22
+ """
23
+ Break text into smaller chunks based on paragraphs and sentence boundaries.
24
+
25
+ Args:
26
+ text: The input text to chunk
27
+
28
+ Returns:
29
+ List of text chunks
30
+ """
31
+ if not text or not text.strip():
32
+ return []
33
+
34
+ # Clean the text
35
+ text = text.strip()
36
+
37
+ # If text is within the limit, return as single chunk
38
+ if len(text) <= self.max_chunk_size:
39
+ return [text]
40
+
41
+ chunks = []
42
+
43
+ # First, try to split by paragraphs
44
+ paragraphs = self._split_into_paragraphs(text)
45
+
46
+ current_chunk = ""
47
+
48
+ for paragraph in paragraphs:
49
+ # If adding this paragraph would exceed the limit
50
+ if len(current_chunk) + len(paragraph) + 1 > self.max_chunk_size:
51
+ # If we have content in current chunk, save it
52
+ if current_chunk.strip():
53
+ chunks.append(current_chunk.strip())
54
+ current_chunk = ""
55
+
56
+ # If the paragraph itself is too long, split it by sentences
57
+ if len(paragraph) > self.max_chunk_size:
58
+ sentence_chunks = self._split_paragraph_into_sentences(paragraph)
59
+ for sentence_chunk in sentence_chunks:
60
+ if len(current_chunk) + len(sentence_chunk) + 1 > self.max_chunk_size:
61
+ if current_chunk.strip():
62
+ chunks.append(current_chunk.strip())
63
+ current_chunk = sentence_chunk
64
+ else:
65
+ if current_chunk:
66
+ current_chunk += " " + sentence_chunk
67
+ else:
68
+ current_chunk = sentence_chunk
69
+ else:
70
+ current_chunk = paragraph
71
+ else:
72
+ # Add paragraph to current chunk
73
+ if current_chunk:
74
+ current_chunk += "\n\n" + paragraph
75
+ else:
76
+ current_chunk = paragraph
77
+
78
+ # Add any remaining content
79
+ if current_chunk.strip():
80
+ chunks.append(current_chunk.strip())
81
+
82
+ # Apply overlap if specified
83
+ if self.overlap_sentences > 0 and len(chunks) > 1:
84
+ chunks = self._add_overlap(chunks)
85
+
86
+ return chunks
87
+
88
+ def _split_into_paragraphs(self, text: str) -> List[str]:
89
+ """Split text into paragraphs."""
90
+ # Split by double newlines or multiple spaces
91
+ paragraphs = re.split(r'\n\s*\n|(?:\n\s*){2,}', text)
92
+ # Filter out empty paragraphs and strip whitespace
93
+ return [p.strip() for p in paragraphs if p.strip()]
94
+
95
+ def _split_paragraph_into_sentences(self, paragraph: str) -> List[str]:
96
+ """Split a long paragraph into sentence-based chunks."""
97
+ # Split by sentence boundaries
98
+ sentences = re.split(r'(?<=[.!?])\s+', paragraph)
99
+
100
+ chunks = []
101
+ current_chunk = ""
102
+
103
+ for sentence in sentences:
104
+ # If a single sentence is longer than max_chunk_size, we need to force-split it
105
+ if len(sentence) > self.max_chunk_size:
106
+ # Save current chunk if it has content
107
+ if current_chunk.strip():
108
+ chunks.append(current_chunk.strip())
109
+ current_chunk = ""
110
+
111
+ # Force-split the long sentence into smaller pieces
112
+ while len(sentence) > self.max_chunk_size:
113
+ # Find a good breaking point (prefer spaces)
114
+ break_point = self.max_chunk_size
115
+ if ' ' in sentence[:self.max_chunk_size]:
116
+ # Find the last space within the limit
117
+ break_point = sentence[:self.max_chunk_size].rfind(' ')
118
+
119
+ chunk_part = sentence[:break_point]
120
+ chunks.append(chunk_part)
121
+ sentence = sentence[break_point:].strip()
122
+
123
+ # Add the remaining part of the sentence
124
+ if sentence:
125
+ current_chunk = sentence
126
+
127
+ elif len(current_chunk) + len(sentence) + 1 > self.max_chunk_size:
128
+ if current_chunk.strip():
129
+ chunks.append(current_chunk.strip())
130
+ current_chunk = sentence
131
+ else:
132
+ if current_chunk:
133
+ current_chunk += " " + sentence
134
+ else:
135
+ current_chunk = sentence
136
+
137
+ if current_chunk.strip():
138
+ chunks.append(current_chunk.strip())
139
+
140
+ return chunks
141
+
142
+ def _add_overlap(self, chunks: List[str]) -> List[str]:
143
+ """Add sentence overlap between chunks for better continuity."""
144
+ if len(chunks) <= 1:
145
+ return chunks
146
+
147
+ overlapped_chunks = [chunks[0]] # First chunk stays the same
148
+
149
+ for i in range(1, len(chunks)):
150
+ # Get last few sentences from previous chunk
151
+ prev_chunk = chunks[i - 1]
152
+ current_chunk = chunks[i]
153
+
154
+ prev_sentences = re.split(r'(?<=[.!?])\s+', prev_chunk)
155
+ overlap_text = " ".join(prev_sentences[-self.overlap_sentences:]) if len(prev_sentences) > self.overlap_sentences else ""
156
+
157
+ if overlap_text:
158
+ overlapped_chunk = overlap_text + " " + current_chunk
159
+ else:
160
+ overlapped_chunk = current_chunk
161
+
162
+ overlapped_chunks.append(overlapped_chunk)
163
+
164
+ return overlapped_chunks
165
+
166
+ def get_chunk_info(self, chunks: List[str]) -> dict:
167
+ """Get information about the chunks."""
168
+ return {
169
+ "total_chunks": len(chunks),
170
+ "total_characters": sum(len(chunk) for chunk in chunks),
171
+ "avg_chunk_size": sum(len(chunk) for chunk in chunks) / len(chunks) if chunks else 0,
172
+ "max_chunk_size": max(len(chunk) for chunk in chunks) if chunks else 0,
173
+ "min_chunk_size": min(len(chunk) for chunk in chunks) if chunks else 0
174
+ }
api/tts_service.py CHANGED
@@ -12,11 +12,14 @@ from fastapi.responses import StreamingResponse, Response
12
  from fastapi import HTTPException, File, UploadFile, Form
13
 
14
  from .config import app, image
15
- from .models import TTSRequest, TTSResponse, HealthResponse
16
  from .audio_utils import AudioUtils
 
 
17
 
18
  with image.imports():
19
  from chatterbox.tts import ChatterboxTTS
 
20
  # Suppress specific transformers deprecation warnings
21
  warnings.filterwarnings("ignore", message=".*past_key_values.*", category=FutureWarning)
22
 
@@ -276,3 +279,153 @@ class ChatterboxTTSService:
276
  except Exception as e:
277
  print(f"Error generating audio: {str(e)}")
278
  raise HTTPException(status_code=500, detail=f"Audio generation failed: {str(e)}")
12
  from fastapi import HTTPException, File, UploadFile, Form
13
 
14
  from .config import app, image
15
+ from .models import TTSRequest, TTSResponse, HealthResponse, FullTextTTSRequest, FullTextTTSResponse
16
  from .audio_utils import AudioUtils
17
+ from .text_processing import TextChunker
18
+ from .audio_concatenator import AudioConcatenator
19
 
20
  with image.imports():
21
  from chatterbox.tts import ChatterboxTTS
22
+ import torch # Add torch import here
23
  # Suppress specific transformers deprecation warnings
24
  warnings.filterwarnings("ignore", message=".*past_key_values.*", category=FutureWarning)
25
 
 
279
  except Exception as e:
280
  print(f"Error generating audio: {str(e)}")
281
  raise HTTPException(status_code=500, detail=f"Audio generation failed: {str(e)}")
282
+
283
+ @modal.fastapi_endpoint(docs=True, method="POST")
284
+ def generate_full_text_audio(self, request: FullTextTTSRequest) -> StreamingResponse:
285
+ """
286
+ Generate speech audio from full text with server-side chunking and parallel processing.
287
+
288
+ This endpoint handles texts of any length by:
289
+ 1. Chunking the text intelligently (respecting sentence/paragraph boundaries)
290
+ 2. Processing chunks in parallel using GPU resources
291
+ 3. Concatenating audio chunks with proper transitions
292
+ 4. Returning the final audio file
293
+
294
+ Args:
295
+ request: FullTextTTSRequest containing text and processing parameters
296
+
297
+ Returns:
298
+ StreamingResponse with final concatenated audio as WAV file
299
+ """
300
+ try:
301
+ self._validate_text_input(request.text)
302
+ audio_prompt_path = self._process_voice_prompt(request.voice_prompt_base64)
303
+
304
+ print(f"Processing full text ({len(request.text)} chars) with server-side chunking...")
305
+
306
+ # Initialize text chunker with request parameters
307
+ chunker = TextChunker(
308
+ max_chunk_size=request.max_chunk_size,
309
+ overlap_sentences=request.overlap_sentences
310
+ )
311
+
312
+ # Chunk the text
313
+ text_chunks = chunker.chunk_text(request.text)
314
+ chunk_info = chunker.get_chunk_info(text_chunks)
315
+ print(f"Split text into {len(text_chunks)} chunks for processing")
316
+
317
+ # Initialize audio_chunks variable for processing info
318
+ audio_chunks = []
319
+ # If only one chunk, process directly
320
+ if len(text_chunks) == 1:
321
+ wav = self._generate_audio(text_chunks[0], audio_prompt_path)
322
+ # For single chunk, pass the full wav object to maintain consistency
323
+ final_audio = wav
324
+ audio_chunks = [wav] # For consistent processing info
325
+ else:
326
+ # Process chunks in parallel
327
+ import concurrent.futures
328
+ import numpy as np
329
+
330
+ def process_chunk(chunk_text: str):
331
+ """Process a single chunk."""
332
+ wav_result = self._generate_audio(chunk_text, audio_prompt_path)
333
+ # Return the full wav result, not just wav[0]
334
+ return wav_result
335
+
336
+ # Use ThreadPoolExecutor for parallel processing
337
+ with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
338
+ # Submit all chunks for processing
339
+ future_to_chunk = {
340
+ executor.submit(process_chunk, chunk): i
341
+ for i, chunk in enumerate(text_chunks)
342
+ }
343
+
344
+ # Collect results in order
345
+ results = [None] * len(text_chunks)
346
+ for future in concurrent.futures.as_completed(future_to_chunk):
347
+ chunk_index = future_to_chunk[future]
348
+ try:
349
+ audio_result = future.result()
350
+ results[chunk_index] = audio_result
351
+ except Exception as exc:
352
+ print(f'Chunk {chunk_index} generated an exception: {exc}')
353
+ raise HTTPException(status_code=500, detail=f"Failed to process chunk {chunk_index}: {str(exc)}")
354
+
355
+ # Filter out None results
356
+ audio_chunks = [result for result in results if result is not None]
357
+
358
+ if len(audio_chunks) != len(text_chunks):
359
+ raise HTTPException(status_code=500, detail=f"Only {len(audio_chunks)} out of {len(text_chunks)} chunks processed successfully")
360
+
361
+ # Concatenate audio chunks
362
+ print("Concatenating audio chunks...")
363
+ concatenator = AudioConcatenator(
364
+ silence_duration=request.silence_duration,
365
+ fade_duration=request.fade_duration
366
+ )
367
+
368
+ final_audio = concatenator.concatenate_audio_chunks(audio_chunks, self.model.sr)
369
+
370
+ # --- Start of new audio processing logic ---
371
+ import torch
372
+ import numpy as np
373
+
374
+ processed_tensor = final_audio
375
+ # Unwrap if it's a single-element tuple repeatedly
376
+ while isinstance(processed_tensor, tuple) and len(processed_tensor) == 1:
377
+ processed_tensor = processed_tensor[0]
378
+
379
+ # Convert to PyTorch tensor if it's a NumPy array
380
+ if isinstance(processed_tensor, np.ndarray):
381
+ processed_tensor = torch.from_numpy(processed_tensor.astype(np.float32))
382
+
383
+ if not isinstance(processed_tensor, torch.Tensor): # Check if it's a tensor now
384
+ raise TypeError(f"Audio data after concatenation is not a tensor. Got type: {type(processed_tensor)}")
385
+
386
+ # Ensure correct shape (C, L) for torchaudio.save
387
+ if processed_tensor.ndim == 1: # Shape (L,)
388
+ audio_to_save = processed_tensor.unsqueeze(0) # Convert to (1, L)
389
+ elif processed_tensor.ndim == 2: # Shape (C, L)
390
+ if processed_tensor.shape[0] == 0:
391
+ raise ValueError(f"Audio tensor has 0 channels: {processed_tensor.shape}")
392
+ if processed_tensor.shape[0] > 1: # If C > 1 (stereo/multi-channel)
393
+ print(f"Multi-channel audio (shape {processed_tensor.shape}) detected. Taking the first channel.")
394
+ audio_to_save = processed_tensor[0, :].unsqueeze(0) # Result is (1, L)
395
+ else: # Already (1, L)
396
+ audio_to_save = processed_tensor
397
+ else:
398
+ raise ValueError(f"Unexpected audio tensor dimensions: {processed_tensor.ndim}, shape: {processed_tensor.shape}")
399
+ buffer = AudioUtils.save_audio_to_buffer(audio_to_save, self.model.sr)
400
+ duration = audio_to_save.shape[1] / self.model.sr # Use shape[1] for length
401
+
402
+ # Reset buffer position for reading
403
+ buffer.seek(0)
404
+ # --- End of new audio processing logic ---
+
+ # Prepare processing info
405
+ processing_info = {
406
+ "total_chunks": len(text_chunks),
407
+ "processed_chunks": len(audio_chunks),
408
+ "failed_chunks": len(text_chunks) - len(audio_chunks),
409
+ "sample_rate": self.model.sr,
410
+ "duration": duration
411
+ }
412
+
413
+ print(f"Full text processing complete! Final audio duration: {duration:.2f} seconds")
414
+ return StreamingResponse(
415
+ buffer,
416
+ media_type="audio/wav",
417
+ headers={
418
+ "Content-Disposition": "attachment; filename=generated_full_text_speech.wav",
419
+ "X-Audio-Duration": str(duration),
420
+ "X-Chunks-Processed": str(len(audio_chunks)),
421
+ "X-Total-Characters": str(len(request.text))
422
+ }
423
+ )
424
+
425
+ except HTTPException as http_exc:
426
+ print(f"HTTP exception in full text generation: {http_exc.detail}")
427
+ raise http_exc
428
+ except Exception as e:
429
+ error_msg = f"Full text audio generation failed: {str(e)}"
430
+ print(f"Exception in full text generation: {error_msg}")
431
+ raise HTTPException(status_code=500, detail=error_msg)