spagestic committed on
Commit
06a06a0
·
1 Parent(s): e37b0d2

api docs and code added

Browse files
Files changed (7) hide show
  1. api/README.md +61 -0
  2. api/__init__.py +22 -0
  3. api/audio_utils.py +62 -0
  4. api/config.py +23 -0
  5. api/models.py +27 -0
  6. api/tts_service.py +278 -0
  7. requirements.txt +1 -0
api/README.md ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # API Package
2
+
3
+ This package contains the modular components of the Chatterbox TTS API.
4
+
5
+ ## Structure
6
+
7
+ ```
8
+ api/
9
+ ├── __init__.py # Package initialization and exports
10
+ ├── config.py # Modal app configuration and container image setup
11
+ ├── models.py # Pydantic request/response models
12
+ ├── audio_utils.py # Audio processing utilities and helper functions
13
+ ├── tts_service.py # Main TTS service class with all API endpoints
14
+ └── README.md # This file
15
+ ```
16
+
17
+ ## Components
18
+
19
+ ### config.py
20
+
21
+ - Modal app configuration
22
+ - Container image setup with required dependencies
23
+ - Centralized configuration management
24
+
25
+ ### models.py
26
+
27
+ - `TTSRequest`: Request model for TTS generation
28
+ - `TTSResponse`: Response model for JSON endpoints
29
+ - `HealthResponse`: Response model for health checks
30
+ - All models include proper type hints and documentation
31
+
32
+ ### audio_utils.py
33
+
34
+ - `AudioUtils`: Static utility class for audio operations
35
+ - Buffer management for audio data
36
+ - Temporary file handling with automatic cleanup
37
+ - Reusable audio processing functions
38
+
39
+ ### tts_service.py
40
+
41
+ - `ChatterboxTTSService`: Main service class with all endpoints
42
+ - GPU-accelerated TTS model loading and inference
43
+ - Multiple API endpoints for different use cases
44
+ - Comprehensive error handling and validation
45
+
46
+ ## Usage
47
+
48
+ ```python
49
+ from api import app, ChatterboxTTSService
50
+
51
+ # The app is automatically configured and ready to deploy
52
+ # The service class contains all the endpoints
53
+ ```
54
+
55
+ ## Benefits of Modular Architecture
56
+
57
+ 1. **Separation of Concerns**: Each file has a specific responsibility
58
+ 2. **Maintainability**: Easier to update and modify individual components
59
+ 3. **Testability**: Components can be tested in isolation
60
+ 4. **Reusability**: Components can be imported and used in other projects
61
+ 5. **Readability**: Smaller files are easier to understand and navigate
api/__init__.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ """
3
+ Chatterbox TTS API package.
4
+
5
+ This package provides a modular text-to-speech API using the Chatterbox TTS model
6
+ deployed on Modal with GPU acceleration.
7
+ """
8
+
9
+ from .config import app, image
10
+ from .models import TTSRequest, TTSResponse, HealthResponse
11
+ from .audio_utils import AudioUtils
12
+ from .tts_service import ChatterboxTTSService
13
+
14
+ __all__ = [
15
+ "app",
16
+ "image",
17
+ "TTSRequest",
18
+ "TTSResponse",
19
+ "HealthResponse",
20
+ "AudioUtils",
21
+ "ChatterboxTTSService"
22
+ ]
api/audio_utils.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ """
3
+ Audio processing utilities for TTS service.
4
+ """
5
+
6
+ import io
7
+ import tempfile
8
+ import os
9
+ from .config import image
10
+
11
+ with image.imports():
12
+ import torchaudio as ta
13
+
14
+
15
class AudioUtils:
    """Static helpers for audio buffering and temp-file management."""

    @staticmethod
    def save_audio_to_buffer(wav_tensor, sample_rate: int) -> io.BytesIO:
        """Serialize an audio tensor to an in-memory WAV buffer.

        Args:
            wav_tensor: Audio tensor to serialize.
            sample_rate: Sample rate (Hz) of the audio.

        Returns:
            A BytesIO rewound to offset 0, containing WAV-encoded data.
        """
        out = io.BytesIO()
        ta.save(out, wav_tensor, sample_rate, format="wav")
        out.seek(0)
        return out

    @staticmethod
    def save_temp_audio_file(audio_data: bytes) -> str:
        """Persist raw audio bytes to a new temporary ``.wav`` file.

        Args:
            audio_data: Raw audio bytes to write out.

        Returns:
            Filesystem path of the temporary file. The caller owns the
            file and should remove it via :meth:`cleanup_temp_file`.
        """
        temp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
        with temp:
            temp.write(audio_data)
        return temp.name

    @staticmethod
    def cleanup_temp_file(file_path: str) -> None:
        """Best-effort removal of a temporary file; never raises.

        Args:
            file_path: Path to delete. Falsy values and already-missing
                paths are silently ignored.
        """
        try:
            if file_path and os.path.exists(file_path):
                os.unlink(file_path)
        except Exception as e:
            # Deliberate best-effort: a cleanup failure must not mask
            # the response that triggered it.
            print(f"Warning: Failed to cleanup temp file {file_path}: {e}")
api/config.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ """
3
+ Modal app configuration and container image setup.
4
+ """
5
+
6
+ import modal
7
+
8
+ # Define a container image with required dependencies
9
# Base image: Debian slim with Python 3.12.
_base_image = modal.Image.debian_slim(python_version="3.12")

# Container image with the full TTS runtime stack installed.
image = _base_image.pip_install(
    "chatterbox-tts==0.1.1",
    "fastapi[standard]",
    "pydantic",
    "numpy",
    "transformers>=4.45.0,<4.47.0",  # Pin to avoid deprecation warnings
    "torch>=2.0.0",
    "torchaudio>=2.0.0",
).env(
    # Suppress the specific transformers deprecation warning
    {"PYTHONWARNINGS": "ignore::FutureWarning:transformers"}
)

# Modal application object shared by every module in this package.
app = modal.App("chatterbox-api-example", image=image)
api/models.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ """
3
+ Pydantic models for request/response validation and API documentation.
4
+ """
5
+
6
+ from typing import Optional
7
+ from pydantic import BaseModel
8
+
9
+
10
class TTSRequest(BaseModel):
    """Payload for TTS generation; voice cloning is opt-in via a prompt."""

    # Text to synthesize into speech.
    text: str
    # Optional base64-encoded audio sample used as the voice-cloning prompt.
    voice_prompt_base64: Optional[str] = None
14
+
15
+
16
class TTSResponse(BaseModel):
    """JSON result of a TTS generation call."""

    # Whether synthesis completed without error.
    success: bool
    # Human-readable status or error description.
    message: str
    # Base64-encoded WAV data; populated only on success.
    audio_base64: Optional[str] = None
    # Length of the generated clip in seconds; populated only on success.
    duration_seconds: Optional[float] = None
22
+
23
+
24
class HealthResponse(BaseModel):
    """Result of the health-check endpoint."""

    # Service status string (e.g. "healthy").
    status: str
    # True once the TTS model has been loaded in the container.
    model_loaded: bool
api/tts_service.py ADDED
@@ -0,0 +1,278 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Main TTS service class with all API endpoints.
3
+ """
4
+
5
+ import io
6
+ import base64
7
+ import warnings
8
+ from typing import Optional
9
+
10
+ import modal
11
+ from fastapi.responses import StreamingResponse, Response
12
+ from fastapi import HTTPException, File, UploadFile, Form
13
+
14
+ from .config import app, image
15
+ from .models import TTSRequest, TTSResponse, HealthResponse
16
+ from .audio_utils import AudioUtils
17
+
18
+ with image.imports():
19
+ from chatterbox.tts import ChatterboxTTS
20
+ # Suppress specific transformers deprecation warnings
21
+ warnings.filterwarnings("ignore", message=".*past_key_values.*", category=FutureWarning)
22
+
23
+
24
@app.cls(
    gpu="a10g",
    scaledown_window=60 * 5,
    enable_memory_snapshot=True
)
@modal.concurrent(
    max_inputs=10
)
class ChatterboxTTSService:
    """
    Advanced text-to-speech service using Chatterbox TTS model.

    Provides multiple endpoints for different use cases including
    voice cloning, file uploads, and JSON responses.
    """

    @modal.enter()
    def load(self):
        """Load the Chatterbox TTS model on container startup."""
        print("Loading Chatterbox TTS model...")

        # Suppress transformers deprecation warnings
        warnings.filterwarnings("ignore", message=".*past_key_values.*", category=FutureWarning)
        warnings.filterwarnings("ignore", message=".*tuple of tuples.*", category=FutureWarning)

        self.model = ChatterboxTTS.from_pretrained(device="cuda")
        print(f"Model loaded successfully! Sample rate: {self.model.sr}")

    def _validate_text_input(self, text: str) -> None:
        """Raise HTTP 400 when *text* is missing or whitespace-only."""
        if not text or len(text.strip()) == 0:
            raise HTTPException(status_code=400, detail="Text cannot be empty")

    def _process_voice_prompt(self, voice_prompt_base64: Optional[str]) -> Optional[str]:
        """Decode a base64 voice prompt to a temp WAV file.

        Args:
            voice_prompt_base64: Base64-encoded audio, or None/empty.

        Returns:
            Path of the temp file, or None when no prompt was supplied.

        Raises:
            HTTPException: 400 when the payload cannot be decoded/saved.
        """
        if not voice_prompt_base64:
            return None

        try:
            audio_data = base64.b64decode(voice_prompt_base64)
            return AudioUtils.save_temp_audio_file(audio_data)
        except Exception as e:
            raise HTTPException(status_code=400, detail=f"Invalid voice prompt audio: {str(e)}")

    def _generate_audio(self, text: str, audio_prompt_path: Optional[str] = None):
        """Run model inference with an optional voice prompt.

        The temp voice-prompt file (if any) is always removed, on both
        the success and the error path.
        """
        print(f"Generating audio for text: {text[:50]}...")

        try:
            if audio_prompt_path:
                return self.model.generate(text, audio_prompt_path=audio_prompt_path)
            return self.model.generate(text)
        finally:
            # try/finally replaces the original's duplicated cleanup calls
            # on the success and exception branches.
            if audio_prompt_path:
                AudioUtils.cleanup_temp_file(audio_prompt_path)

    def _duration_seconds(self, wav) -> float:
        """Clip length in seconds; assumes wav is (channels, samples) — TODO confirm."""
        return len(wav[0]) / self.model.sr

    def _wav_streaming_response(self, wav) -> StreamingResponse:
        """Shared helper: wrap a generated tensor as a WAV attachment response."""
        buffer = AudioUtils.save_audio_to_buffer(wav, self.model.sr)
        # Stream the existing buffer directly; the original copied it
        # through a second BytesIO (io.BytesIO(buffer.read())) for no gain.
        return StreamingResponse(
            buffer,
            media_type="audio/wav",
            headers={
                "Content-Disposition": "attachment; filename=generated_speech.wav",
                "X-Audio-Duration": str(self._duration_seconds(wav))
            }
        )

    @modal.fastapi_endpoint(docs=True, method="GET")
    def health(self) -> HealthResponse:
        """Health check endpoint to verify model status."""
        return HealthResponse(
            status="healthy",
            # getattr avoids the hasattr-then-access double lookup.
            model_loaded=getattr(self, 'model', None) is not None
        )

    @modal.fastapi_endpoint(docs=True, method="POST")
    def generate_audio(self, request: TTSRequest) -> StreamingResponse:
        """
        Generate speech audio from text with optional voice prompt.

        Args:
            request: TTSRequest containing text and optional voice prompt

        Returns:
            StreamingResponse with generated audio as WAV file

        Raises:
            HTTPException: 400 on invalid input, 500 on generation failure.
        """
        try:
            self._validate_text_input(request.text)
            audio_prompt_path = self._process_voice_prompt(request.voice_prompt_base64)

            wav = self._generate_audio(request.text, audio_prompt_path)
            return self._wav_streaming_response(wav)

        except HTTPException:
            raise
        except Exception as e:
            print(f"Error generating audio: {str(e)}")
            raise HTTPException(status_code=500, detail=f"Audio generation failed: {str(e)}")

    @modal.fastapi_endpoint(docs=True, method="POST")
    def generate_with_file(
        self,
        text: str = Form(..., description="Text to convert to speech"),
        voice_prompt: Optional[UploadFile] = File(None, description="Optional voice prompt audio file")
    ) -> StreamingResponse:
        """
        Generate speech audio from text with optional voice prompt file upload.

        Args:
            text: Text to convert to speech
            voice_prompt: Optional audio file for voice cloning

        Returns:
            StreamingResponse with generated audio as WAV file

        Raises:
            HTTPException: 400 on invalid input/file type, 500 on failure.
        """
        try:
            self._validate_text_input(text)

            # Handle voice prompt file if provided
            audio_prompt_path = None
            if voice_prompt:
                if voice_prompt.content_type not in ["audio/wav", "audio/mpeg", "audio/mp3"]:
                    raise HTTPException(
                        status_code=400,
                        detail="Voice prompt must be WAV, MP3, or MPEG audio file"
                    )

                # Read and save the uploaded file
                audio_data = voice_prompt.file.read()
                audio_prompt_path = AudioUtils.save_temp_audio_file(audio_data)

            wav = self._generate_audio(text, audio_prompt_path)
            return self._wav_streaming_response(wav)

        except HTTPException:
            raise
        except Exception as e:
            print(f"Error generating audio: {str(e)}")
            raise HTTPException(status_code=500, detail=f"Audio generation failed: {str(e)}")

    @modal.fastapi_endpoint(docs=True, method="POST")
    def generate_json(self, request: TTSRequest) -> TTSResponse:
        """
        Generate speech audio and return as JSON with base64 encoded audio.

        Args:
            request: TTSRequest containing text and optional voice prompt

        Returns:
            TTSResponse with base64 encoded audio data; on error the
            response carries success=False instead of an HTTP error.
        """
        try:
            self._validate_text_input(request.text)
            audio_prompt_path = self._process_voice_prompt(request.voice_prompt_base64)

            wav = self._generate_audio(request.text, audio_prompt_path)

            # Convert to base64
            buffer = AudioUtils.save_audio_to_buffer(wav, self.model.sr)
            audio_base64 = base64.b64encode(buffer.read()).decode('utf-8')

            return TTSResponse(
                success=True,
                message="Audio generated successfully",
                audio_base64=audio_base64,
                duration_seconds=self._duration_seconds(wav)
            )

        except HTTPException as http_exc:
            return TTSResponse(success=False, message=str(http_exc.detail))
        except Exception as e:
            print(f"Error generating audio: {str(e)}")
            return TTSResponse(success=False, message=f"Audio generation failed: {str(e)}")

    @modal.fastapi_endpoint(docs=True, method="POST")
    def generate(self, prompt: str):
        """
        Legacy endpoint for backward compatibility.
        Generate audio waveform from the input text.
        """
        try:
            # Generate audio waveform from the input text
            wav = self.model.generate(prompt)
            buffer = AudioUtils.save_audio_to_buffer(wav, self.model.sr)

            # Stream the buffer directly; no attachment headers, to keep
            # the legacy response shape unchanged.
            return StreamingResponse(
                buffer,
                media_type="audio/wav",
            )
        except Exception as e:
            print(f"Error in legacy endpoint: {str(e)}")
            raise HTTPException(status_code=500, detail=f"Audio generation failed: {str(e)}")

    @modal.fastapi_endpoint(docs=True, method="POST")
    def generate_audio_file(self, request: TTSRequest) -> Response:
        """
        Generate speech audio from text with optional voice prompt and return as a complete file.

        Unlike the streaming endpoint, this returns the entire file at once.

        Args:
            request: TTSRequest containing text and optional voice prompt

        Returns:
            Response with complete audio file data

        Raises:
            HTTPException: 400 on invalid input, 500 on generation failure.
        """
        try:
            self._validate_text_input(request.text)
            audio_prompt_path = self._process_voice_prompt(request.voice_prompt_base64)

            wav = self._generate_audio(request.text, audio_prompt_path)

            buffer = AudioUtils.save_audio_to_buffer(wav, self.model.sr)

            # Return the complete audio file
            return Response(
                content=buffer.read(),
                media_type="audio/wav",
                headers={
                    "Content-Disposition": "attachment; filename=generated_speech.wav",
                    "X-Audio-Duration": str(self._duration_seconds(wav))
                }
            )

        except HTTPException:
            raise
        except Exception as e:
            print(f"Error generating audio: {str(e)}")
            raise HTTPException(status_code=500, detail=f"Audio generation failed: {str(e)}")
requirements.txt CHANGED
@@ -24,6 +24,7 @@ Jinja2==3.1.6
24
  markdown-it-py==3.0.0
25
  mdurl==0.1.2
26
  mistralai==1.8.1
 
27
  numpy==2.2.6
28
  orjson==3.10.18
29
  packaging==25.0
 
24
  markdown-it-py==3.0.0
25
  mdurl==0.1.2
26
  mistralai==1.8.1
27
+ modal==1.0.3
28
  numpy==2.2.6
29
  orjson==3.10.18
30
  packaging==25.0