spagestic committed
Commit 8f3c067 · 1 Parent(s): d1c4aa1

feat: implement simplified audio processing with enhanced TTS API integration

src/processors/generate_simple_tts_audio.py ADDED
@@ -0,0 +1,174 @@
+ """Simplified TTS audio generation that uses the enhanced API endpoints."""
+ import os
+ import requests
+ import tempfile
+ import soundfile as sf
+ import numpy as np
+ import gradio as gr
+
+
+ def generate_simple_tts_audio(text_input: str, audio_prompt_input=None, progress=None):
+     """
+     Generate TTS audio using the enhanced API that handles chunking and concatenation server-side.
+
+     Args:
+         text_input: The text to convert to speech (any length)
+         audio_prompt_input: Optional audio prompt for voice cloning
+         progress: Optional progress callback
+
+     Returns:
+         Tuple of (sample_rate, audio_data)
+     """
+     # Use the new full-text endpoint that handles everything server-side
+     FULL_TEXT_ENDPOINT = os.getenv("FULL_TEXT_TTS_ENDPOINT", "YOUR-MODAL-ENDPOINT-URL/generate_full_text_audio")
+     GENERATE_WITH_FILE_ENDPOINT = os.getenv("GENERATE_WITH_FILE_ENDPOINT", "YOUR-MODAL-ENDPOINT-URL/generate_with_file")
+
+     if not text_input or len(text_input.strip()) == 0:
+         raise gr.Error("Please enter some text to synthesize.")
+
+     if progress:
+         progress(0.1, desc="Preparing request for full-text processing...")
+
+     try:
+         if audio_prompt_input is None:
+             # Use the new full-text endpoint for enhanced processing
+             if progress:
+                 progress(0.3, desc="Sending full text to enhanced TTS API...")
+
+             payload = {
+                 "text": text_input,
+                 "max_chunk_size": 800,
+                 "silence_duration": 0.5,
+                 "fade_duration": 0.1,
+                 "overlap_sentences": 0
+             }
+
+             response = requests.post(
+                 FULL_TEXT_ENDPOINT,
+                 json=payload,
+                 headers={"Content-Type": "application/json"},
+                 timeout=300,  # Longer timeout for full-text processing
+                 stream=True
+             )
+
+             if response.status_code != 200:
+                 raise gr.Error(f"API Error: {response.status_code} - {response.text}")
+
+             if progress:
+                 progress(0.6, desc="Server processing text chunks in parallel...")
+
+             # Get content length if available for progress tracking
+             content_length = response.headers.get('content-length')
+             chunks_processed = response.headers.get('X-Chunks-Processed', 'unknown')
+             total_chars = response.headers.get('X-Total-Characters', len(text_input))
+
+             if progress:
+                 progress(0.7, desc=f"Processing {chunks_processed} chunks ({total_chars} characters)...")
+
+             bytes_downloaded = 0
+             with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
+                 for chunk in response.iter_content(chunk_size=8192):
+                     if chunk:
+                         temp_file.write(chunk)
+                         bytes_downloaded += len(chunk)
+
+                         # Update progress based on bytes downloaded
+                         if progress:
+                             progress(0.7, desc=f"Downloading processed audio... ({bytes_downloaded // 1024}KB)")
+
+                 temp_path = temp_file.name
+
+             if progress:
+                 progress(0.9, desc="Loading final audio...")
+
+             audio_data, sample_rate = sf.read(temp_path)
+             os.unlink(temp_path)
+
+             if progress:
+                 progress(1.0, desc=f"Complete! Processed {chunks_processed} chunks into final audio.")
+
+             return (sample_rate, audio_data)
+
+         else:
+             # For voice cloning, still use the original endpoint
+             if progress:
+                 progress(0.3, desc="Preparing voice cloning request...")
+
+             files = {'text': (None, text_input)}
+             with open(audio_prompt_input, 'rb') as f:
+                 audio_content = f.read()
+             files['voice_prompt'] = ('voice_prompt.wav', audio_content, 'audio/wav')
+
+             if progress:
+                 progress(0.5, desc="Sending request with voice cloning...")
+
+             response = requests.post(
+                 GENERATE_WITH_FILE_ENDPOINT,
+                 files=files,
+                 timeout=180,
+                 stream=True
+             )
+
+             if response.status_code != 200:
+                 raise gr.Error(f"API Error: {response.status_code} - {response.text}")
+
+             if progress:
+                 progress(0.8, desc="Processing cloned voice...")
+
+             bytes_downloaded = 0
+             with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
+                 for chunk in response.iter_content(chunk_size=8192):
+                     if chunk:
+                         temp_file.write(chunk)
+                         bytes_downloaded += len(chunk)
+
+                         if progress:
+                             progress(0.8, desc=f"Downloading cloned audio... ({bytes_downloaded // 1024}KB)")
+
+                 temp_path = temp_file.name
+
+             audio_data, sample_rate = sf.read(temp_path)
+             os.unlink(temp_path)
+
+             if progress:
+                 progress(1.0, desc="Voice cloning complete!")
+
+             return (sample_rate, audio_data)
+
+     except requests.exceptions.Timeout:
+         raise gr.Error("Request timed out. The API might be processing a large text. Please try again.")
+     except requests.exceptions.ConnectionError:
+         raise gr.Error("Unable to connect to the API. Please check if the endpoint URL is correct.")
+     except Exception as e:
+         raise gr.Error(f"Error generating audio: {str(e)}")
+
+
+ def get_api_processing_info(text: str) -> dict:
+     """
+     Get processing information from the API without generating audio.
+
+     Args:
+         text: The text to analyze
+
+     Returns:
+         Dictionary with processing information
+     """
+     try:
+         # This could be enhanced to call an API info endpoint
+         text_length = len(text.strip()) if text else 0
+         estimated_chunks = max(1, text_length // 800)
+
+         return {
+             "text_length": text_length,
+             "estimated_chunks": estimated_chunks,
+             "processing_mode": "server_side_parallel_gpu",
+             "benefits": [
+                 "Server-side GPU acceleration",
+                 "Parallel chunk processing",
+                 "Automatic audio concatenation",
+                 "Optimized for large texts",
+                 "No client-side resource usage"
+             ]
+         }
+     except Exception as e:
+         return {"error": f"Failed to analyze text: {str(e)}"}
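
A minimal usage sketch of the new helper (illustrative, not part of the commit). It assumes the package is importable as src.processors and that FULL_TEXT_TTS_ENDPOINT points at a deployed Modal endpoint; the URL, output filename, and progress printer below are placeholders.

# Illustrative only: drive generate_simple_tts_audio directly and save the result.
import os
import soundfile as sf

os.environ["FULL_TEXT_TTS_ENDPOINT"] = "https://example--tts.modal.run/generate_full_text_audio"  # placeholder URL

from src.processors.generate_simple_tts_audio import generate_simple_tts_audio  # import path assumes src/ is a package

def print_progress(fraction, desc=""):
    # Matches the callback signature the function uses: progress(0.3, desc="...")
    print(f"[{fraction:.0%}] {desc}")

sample_rate, audio = generate_simple_tts_audio(
    "Hello from the simplified TTS pipeline.",
    progress=print_progress,
)
sf.write("tts_output.wav", audio, sample_rate)
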
src/processors/pdf_processor.py CHANGED
@@ -34,24 +34,18 @@ class PDFProcessor:
          explanations = self.extractor.generate_explanations(extracted_text)
 
          # Show explanations immediately, update status for audio loading
-         yield extracted_text, gr.update(value="Generating audio..."), explanations, None, gr.update(visible=False)
-         # Step 3: Generate audio
+         yield extracted_text, gr.update(value="Generating audio..."), explanations, None, gr.update(visible=False) # Step 3: Generate audio
          try:
-             from .audio_processor import AudioProcessor
+             from .simple_audio_processor import SimpleAudioProcessor
 
-             # Create audio processor with parallel processing enabled
-             audio_processor = AudioProcessor(
-                 max_chunk_size=800,
-                 max_workers=4,
-                 silence_duration=0.5,
-                 enable_parallel=True
-             )
+             # Create simplified audio processor
+             audio_processor = SimpleAudioProcessor()
 
              # Generate progress callback for audio processing
              def audio_progress(progress, desc=""):
                  yield extracted_text, gr.update(value=f"Generating audio: {desc}"), explanations, None, gr.update(visible=False)
 
-             # Generate audio using the new parallel processor
+             # Generate audio using the simplified processor
              audio_result, _ = audio_processor.generate_audio(explanations, progress=audio_progress)
 
              # Show everything, update status to complete
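
A brief sketch of the design shift in the hunk above (illustrative, not part of the commit): the chunking knobs that were previously passed to AudioProcessor at this call site are no longer configured in pdf_processor.py; the comparable values now travel in the JSON payload built by generate_simple_tts_audio. The values shown mirror those visible in the diff.

# Before this commit: tuning lived client-side at the PDFProcessor call site.
# audio_processor = AudioProcessor(max_chunk_size=800, max_workers=4,
#                                  silence_duration=0.5, enable_parallel=True)

# After: SimpleAudioProcessor() takes no arguments; the equivalent knobs are the
# defaults in the payload sent to the TTS API by generate_simple_tts_audio.
payload = {
    "text": "...",            # full explanation text
    "max_chunk_size": 800,    # same chunk size the old client-side processor used
    "silence_duration": 0.5,  # same inter-chunk silence
    "fade_duration": 0.1,
    "overlap_sentences": 0,
}
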
src/processors/simple_audio_processor.py ADDED
@@ -0,0 +1,69 @@
+ """Simplified audio generation functionality that delegates complex processing to the TTS API."""
+ from typing import Tuple, Optional
+ import gradio as gr
+ import numpy as np
+
+ class SimpleAudioProcessor:
+     """Simplified audio processor that uses the enhanced TTS API for complex processing."""
+
+     def __init__(self):
+         """Initialize the simple audio processor."""
+         pass
+
+     def generate_audio(self, explanation_text: str, progress=None) -> Tuple[Tuple[int, np.ndarray], dict]:
+         """
+         Generate TTS audio for explanations using the enhanced TTS API.
+
+         This method sends the full text to the TTS API which handles:
+         - Text chunking
+         - Parallel processing
+         - Audio concatenation
+         - All on the server side with GPU acceleration
+
+         Args:
+             explanation_text: The text to convert to audio
+             progress: Optional progress callback
+
+         Returns:
+             Tuple of (audio_result, update_dict) where audio_result is (sample_rate, audio_data)
+         """
+         if not explanation_text or explanation_text.strip() == "":
+             raise gr.Error("No explanations available to convert to audio. Please generate explanations first.")
+
+         try:
+             clean_text = explanation_text.strip()
+
+             if progress:
+                 progress(0.1, desc="Sending text to TTS API for processing...")
+
+             # Import the simplified audio generation function
+             from .generate_simple_tts_audio import generate_simple_tts_audio
+
+             # Generate audio using the new simplified API call
+             audio_result = generate_simple_tts_audio(clean_text, progress=progress)
+
+             if progress:
+                 progress(1.0, desc="Audio generation complete!")
+
+             return audio_result, gr.update(visible=True)
+
+         except Exception as e:
+             raise gr.Error(f"Error generating audio: {str(e)}")
+
+     def get_processing_info(self, text: str) -> dict:
+         """Get basic information about the text to be processed."""
+         if not text or not text.strip():
+             return {"error": "No text provided"}
+
+         text_length = len(text.strip())
+         estimated_chunks = max(1, text_length // 800)  # Rough estimate
+         estimated_time = text_length * 0.05  # Rough estimate: 0.05 seconds per character
+
+         return {
+             "processing_mode": "server_side_parallel",
+             "text_length": text_length,
+             "estimated_chunks": estimated_chunks,
+             "estimated_time_seconds": estimated_time,
+             "estimated_time_readable": f"{estimated_time:.1f} seconds" if estimated_time < 60 else f"{estimated_time/60:.1f} minutes",
+             "note": "Processing handled by TTS API with GPU acceleration"
+         }
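
A minimal usage sketch of the new processor (illustrative, not part of the commit), assuming the package imports as src.processors and the TTS endpoints read by generate_simple_tts_audio.py are configured and reachable; the sample text and callback are placeholders.

# Illustrative only: how a caller such as PDFProcessor uses SimpleAudioProcessor.
from src.processors.simple_audio_processor import SimpleAudioProcessor

processor = SimpleAudioProcessor()

# Cheap pre-flight estimate; no network call involved.
info = processor.get_processing_info("Some explanation text to narrate.")
print(info["estimated_chunks"], info["estimated_time_readable"])

def on_progress(fraction, desc=""):
    print(f"{fraction:.0%} {desc}")

# generate_audio returns ((sample_rate, audio_data), gr.update(visible=True)).
(sample_rate, audio_data), visibility_update = processor.generate_audio(
    "Some explanation text to narrate.",
    progress=on_progress,
)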