spagestic commited on
Commit
91e586d
·
1 Parent(s): 94d063a

feat: Update audio processing to support parallel chunking and enhance text chunking logic

Browse files
requirements.txt CHANGED
@@ -36,7 +36,7 @@ pydantic_core==2.33.2
36
  pydub==0.25.1
37
  Pygments==2.19.1
38
  python-dateutil==2.9.0.post0
39
- python-dotenv==1.1.0
40
  python-multipart==0.0.20
41
  pytz==2025.2
42
  PyYAML==6.0.2
@@ -45,6 +45,7 @@ requests==2.32.3
45
  rich==14.0.0
46
  ruff==0.11.13
47
  safehttpx==0.1.6
 
48
  semantic-version==2.10.0
49
  shellingham==1.5.4
50
  six==1.17.0
 
36
  pydub==0.25.1
37
  Pygments==2.19.1
38
  python-dateutil==2.9.0.post0
39
+ python-dotenv==1.1.1
40
  python-multipart==0.0.20
41
  pytz==2025.2
42
  PyYAML==6.0.2
 
45
  rich==14.0.0
46
  ruff==0.11.13
47
  safehttpx==0.1.6
48
+ scipy==1.15.3
49
  semantic-version==2.10.0
50
  shellingham==1.5.4
51
  six==1.17.0
src/processors/audio_concatenator.py ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Audio concatenation utility for combining multiple audio chunks into a single audio file."""
2
+
3
+ import numpy as np
4
+ from typing import List, Tuple, Optional
5
+ import gradio as gr
6
+
7
+
8
class AudioConcatenator:
    """Join several audio chunks into one mono float32 track.

    Each chunk is down-mixed to mono, peak-normalised and faded in/out. The
    chunks are then joined with ``silence_duration`` seconds of silence
    between them, normalised once more and high-pass filtered.
    """

    # Cut-off of the final high-pass filter, in Hz.
    _HIGHPASS_CUTOFF_HZ = 80
    # Peak level after normalisation; leaves 5% headroom against clipping.
    _TARGET_PEAK = 0.95

    def __init__(self, silence_duration: float = 0.5, fade_duration: float = 0.1):
        """
        Initialize the audio concatenator.

        Args:
            silence_duration: Duration of silence between chunks (seconds)
            fade_duration: Duration of fade in/out effects (seconds)
        """
        self.silence_duration = silence_duration
        self.fade_duration = fade_duration

    def concatenate_audio_chunks(
        self,
        audio_chunks: List[Tuple[int, np.ndarray]],
        progress_callback: Optional[callable] = None
    ) -> Tuple[int, np.ndarray]:
        """
        Concatenate multiple audio chunks into a single audio track.

        Args:
            audio_chunks: List of (sample_rate, audio_data) tuples
            progress_callback: Optional callable invoked as
                ``progress_callback(fraction, desc=...)`` with fraction in 0-1

        Returns:
            Tuple of (sample_rate, concatenated_audio_data)

        Raises:
            gr.Error: If there are no chunks, the sample rates differ, or a
                chunk has more than two dimensions.
        """
        if not audio_chunks:
            raise gr.Error("No audio chunks to concatenate")

        # A single chunk is handed back untouched (no normalisation/filtering).
        if len(audio_chunks) == 1:
            return audio_chunks[0]

        def report(fraction, desc):
            if progress_callback:
                progress_callback(fraction, desc=desc)

        report(0.1, "Preparing audio concatenation...")

        # All chunks must share one sample rate; resampling is not attempted.
        sample_rates = [chunk[0] for chunk in audio_chunks]
        if len(set(sample_rates)) > 1:
            raise gr.Error(f"Inconsistent sample rates found: {set(sample_rates)}. All chunks must have the same sample rate.")
        sample_rate = sample_rates[0]

        report(0.2, "Normalizing audio chunks...")

        total = len(audio_chunks)
        prepared = []
        for i, (_, audio_data) in enumerate(audio_chunks):
            audio_data = np.asarray(audio_data)
            if audio_data.ndim == 1:
                mono = audio_data
            elif audio_data.ndim == 2:
                # Down-mix by averaging along axis 1
                # (assumes a (samples, channels) layout - TODO confirm).
                mono = np.mean(audio_data, axis=1)
            else:
                raise gr.Error(f"Unsupported audio format in chunk {i + 1}: {audio_data.shape}")

            mono = self._normalize_audio(mono)
            mono = self._apply_fade_effects(mono, sample_rate)
            prepared.append(mono)

            report(0.2 + (0.5 * (i + 1) / total), f"Processed chunk {i + 1}/{total}")

        report(0.7, "Creating silence segments...")
        silence = np.zeros(int(self.silence_duration * sample_rate), dtype=np.float32)

        report(0.8, "Concatenating audio segments...")

        # Interleave chunks with silence (none after the last chunk).
        segments = []
        last = len(prepared) - 1
        for i, chunk in enumerate(prepared):
            segments.append(chunk)
            if i < last:
                segments.append(silence)
            report(0.8 + (0.15 * (i + 1) / len(prepared)), f"Concatenated {i + 1}/{len(prepared)} chunks")

        final_audio = np.concatenate(segments)

        report(0.95, "Finalizing audio...")

        final_audio = self._normalize_audio(final_audio)
        # Design the filter for the real sample rate of the chunks.
        final_audio = self._remove_clicks_and_pops(final_audio, sample_rate)

        report(1.0, "Audio concatenation complete!")

        return sample_rate, final_audio

    def _normalize_audio(self, audio_data: np.ndarray) -> np.ndarray:
        """Scale audio so its peak is 95% of full scale; always returns float32.

        Empty and all-zero input is returned unscaled (but still as float32,
        so later in-place float operations and concatenation are safe).
        """
        samples = np.asarray(audio_data, dtype=np.float32)
        if samples.size == 0:
            return samples

        peak = float(np.max(np.abs(samples)))
        if peak == 0:
            return samples

        return (samples * (self._TARGET_PEAK / peak)).astype(np.float32)

    def _apply_fade_effects(self, audio_data: np.ndarray, sample_rate: int) -> np.ndarray:
        """Apply a linear fade-in and fade-out of ``fade_duration`` seconds.

        The input is returned unchanged when the fade is zero-length or the
        audio is shorter than the two fades combined.
        """
        fade_samples = int(self.fade_duration * sample_rate)

        # fade_samples == 0 must bail out: ``x[-0:]`` is the whole array and
        # multiplying it by an empty ramp raises a broadcasting error.
        if fade_samples <= 0 or len(audio_data) < 2 * fade_samples:
            return audio_data

        source = np.asarray(audio_data)
        # Work on a floating-point copy so integer input can be scaled in place.
        faded = source.astype(np.result_type(source.dtype, np.float32), copy=True)

        faded[:fade_samples] *= np.linspace(0, 1, fade_samples)
        faded[-fade_samples:] *= np.linspace(1, 0, fade_samples)

        return faded

    def _remove_clicks_and_pops(self, audio_data: np.ndarray, sample_rate: int = 22050) -> np.ndarray:
        """High-pass filter the audio at 80 Hz to remove DC offset and rumble.

        Args:
            audio_data: Mono audio samples
            sample_rate: Sample rate of ``audio_data`` in Hz; defaults to
                22050, the value that was previously hard-coded

        Returns:
            The filtered audio as float32. The audio is returned unfiltered
            (as float32) when scipy is unavailable, the input is empty, or
            the sample rate is too low for an 80 Hz cut-off.
        """
        samples = np.asarray(audio_data)
        try:
            from scipy import signal
        except ImportError:
            # If scipy is not available, return audio as-is
            return samples.astype(np.float32)

        # butter() requires the cut-off to lie strictly below Nyquist.
        if samples.size == 0 or self._HIGHPASS_CUTOFF_HZ >= sample_rate / 2:
            return samples.astype(np.float32)

        sos = signal.butter(
            2, self._HIGHPASS_CUTOFF_HZ, btype='highpass', fs=sample_rate, output='sos'
        )
        return signal.sosfilt(sos, samples).astype(np.float32)

    def get_concatenation_info(self, audio_chunks: List[Tuple[int, np.ndarray]]) -> dict:
        """Summarise what concatenating ``audio_chunks`` would produce.

        Durations are computed with the first chunk's sample rate. Returns an
        empty dict when there are no chunks.
        """
        if not audio_chunks:
            return {}

        sample_rate = audio_chunks[0][0]
        chunk_durations = [len(audio_data) / sample_rate for _, audio_data in audio_chunks]

        # Silence is inserted between chunks only, hence n - 1 gaps.
        total_silence_duration = 0
        if len(audio_chunks) > 1:
            total_silence_duration = (len(audio_chunks) - 1) * self.silence_duration

        return {
            "num_chunks": len(audio_chunks),
            "total_duration": sum(chunk_durations) + total_silence_duration,
            "total_silence_duration": total_silence_duration,
            "chunk_durations": chunk_durations,
            "average_chunk_duration": float(np.mean(chunk_durations)),
            "sample_rate": sample_rate
        }
src/processors/audio_processor.py CHANGED
@@ -1,17 +1,171 @@
1
  """Audio generation functionality."""
2
 
3
  import gradio as gr
 
 
 
 
 
 
4
 
5
  class AudioProcessor:
6
- """Handles audio generation operations."""
7
- def generate_audio(self, explanation_text):
8
- """Generate TTS audio for explanations."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  if not explanation_text or explanation_text.strip() == "":
10
  raise gr.Error("No explanations available to convert to audio. Please generate explanations first.")
11
  try:
12
  from .generate_tts_audio import generate_tts_audio
13
  clean_text = explanation_text.strip()
 
 
 
 
 
 
 
 
 
 
14
  audio_result = generate_tts_audio(clean_text, None)
15
  return audio_result, gr.update(visible=True)
16
  except Exception as e:
17
  raise gr.Error(f"Error generating audio: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  """Audio generation functionality."""
2
 
3
  import gradio as gr
4
+ from typing import Tuple, Optional
5
+ import numpy as np
6
+ from .text_chunker import TextChunker
7
+ from .parallel_processor import ParallelAudioProcessor
8
+ from .audio_concatenator import AudioConcatenator
9
+
10
 
11
class AudioProcessor:
    """Handles audio generation operations with parallel processing and chunking."""

    def __init__(self,
                 max_chunk_size: int = 800,
                 max_workers: int = 4,
                 silence_duration: float = 0.5,
                 enable_parallel: bool = True):
        """
        Initialize the audio processor.

        Args:
            max_chunk_size: Maximum characters per chunk
            max_workers: Maximum parallel workers
            silence_duration: Silence between chunks (seconds)
            enable_parallel: Whether to use parallel processing
        """
        self.text_chunker = TextChunker(max_chunk_size=max_chunk_size)
        self.parallel_processor = ParallelAudioProcessor(max_workers=max_workers)
        self.audio_concatenator = AudioConcatenator(silence_duration=silence_duration)
        self.enable_parallel = enable_parallel

    @staticmethod
    def _scaled_progress(progress, start: float, span: float):
        """Return a callback that maps a 0-1 fraction onto ``[start, start + span]`` of *progress*."""
        def report(fraction, desc=""):
            if progress:
                progress(start + (fraction * span), desc)
        return report

    def generate_audio(self, explanation_text: str, progress=None) -> Tuple[Tuple[int, np.ndarray], dict]:
        """
        Generate TTS audio for explanations with chunking and parallel processing.

        The text is split into chunks. A single chunk of at most 1000
        characters is synthesised in one call; otherwise the chunks are
        synthesised (in parallel when enabled) and concatenated.

        Args:
            explanation_text: The text to convert to audio
            progress: Optional callable invoked as ``progress(fraction, desc)``

        Returns:
            Tuple of (audio_result, update_dict) where audio_result is
            whatever the TTS/concatenation step returned and update_dict is
            ``gr.update(visible=True)``

        Raises:
            gr.Error: If the text is empty or any processing step fails.
        """
        if not explanation_text or explanation_text.strip() == "":
            raise gr.Error("No explanations available to convert to audio. Please generate explanations first.")

        try:
            # Imported lazily, inside the try, so an import failure is also
            # reported to the user as a gr.Error.
            from .generate_tts_audio import generate_tts_audio

            clean_text = explanation_text.strip()

            if progress:
                progress(0.05, desc="Analyzing text for chunking...")

            # Step 1: Chunk the text
            text_chunks = self.text_chunker.chunk_text(clean_text)

            if progress:
                progress(0.1, desc=f"Split text into {len(text_chunks)} chunks")

            # If only one chunk and it's small enough, use simple processing
            if len(text_chunks) == 1 and len(text_chunks[0]) <= 1000:
                if progress:
                    progress(0.2, desc="Processing single chunk...")

                audio_result = generate_tts_audio(text_chunks[0], None, progress=progress)

                if progress:
                    progress(1.0, desc="Audio generation complete!")

                return audio_result, gr.update(visible=True)

            # Step 2: Process chunks in parallel (or sequentially if disabled)
            if self.enable_parallel and len(text_chunks) > 1:
                if progress:
                    progress(0.15, desc="Starting parallel audio processing...")

                # Parallel progress occupies 15-80% of the overall bar.
                audio_chunks = self.parallel_processor.process_chunks_parallel(
                    text_chunks,
                    generate_tts_audio,
                    progress_callback=self._scaled_progress(progress, 0.15, 0.65)
                )
            else:
                # Sequential processing for single chunk or when parallel is disabled
                if progress:
                    progress(0.15, desc="Processing chunks sequentially...")

                audio_chunks = []
                for i, chunk in enumerate(text_chunks):
                    if progress:
                        chunk_progress = 0.15 + (0.65 * i / len(text_chunks))
                        progress(chunk_progress, desc=f"Processing chunk {i + 1}/{len(text_chunks)}")

                    audio_chunks.append(generate_tts_audio(chunk, None))

            # Step 3: Concatenate audio chunks
            if progress:
                progress(0.8, desc="Concatenating audio chunks...")

            # Concatenation progress occupies 80-100% of the overall bar.
            final_audio = self.audio_concatenator.concatenate_audio_chunks(
                audio_chunks,
                progress_callback=self._scaled_progress(progress, 0.8, 0.2)
            )

            if progress:
                progress(1.0, desc=f"Generated audio from {len(text_chunks)} chunks!")

            return final_audio, gr.update(visible=True)

        except gr.Error:
            # Already a user-facing error from a lower layer; re-wrapping it
            # would only stack "Error generating audio:" prefixes.
            raise
        except Exception as e:
            raise gr.Error(f"Error generating audio: {str(e)}") from e

    def generate_audio_legacy(self, explanation_text: str) -> Tuple[Tuple[int, np.ndarray], dict]:
        """
        Legacy audio generation method (for backward compatibility).

        Synthesises the text in a single TTS call. Text longer than 1000
        characters is cut at the last full stop within the first 950
        characters (or at 950 characters if there is none) and a truncation
        notice is appended.

        Raises:
            gr.Error: If the text is empty or audio generation fails.
        """
        if not explanation_text or explanation_text.strip() == "":
            raise gr.Error("No explanations available to convert to audio. Please generate explanations first.")
        try:
            from .generate_tts_audio import generate_tts_audio
            clean_text = explanation_text.strip()

            # Use the original truncation logic for legacy mode
            if len(clean_text) > 1000:
                sentences = clean_text[:950].split('.')
                if len(sentences) > 1:
                    # Drop the trailing partial sentence.
                    clean_text = '.'.join(sentences[:-1]) + '.'
                else:
                    clean_text = clean_text[:950]
                clean_text += " [Text has been truncated for audio generation]"

            audio_result = generate_tts_audio(clean_text, None)
            return audio_result, gr.update(visible=True)
        except gr.Error:
            raise
        except Exception as e:
            raise gr.Error(f"Error generating audio: {str(e)}") from e

    def get_processing_info(self, text: str) -> dict:
        """Describe how ``text`` would be chunked and how long it may take.

        Returns ``{"error": "No text provided"}`` for empty or blank text.
        """
        if not text or not text.strip():
            return {"error": "No text provided"}

        chunks = self.text_chunker.chunk_text(text.strip())
        chunk_info = self.text_chunker.get_chunk_info(chunks)

        estimated_time = self.parallel_processor.estimate_processing_time(chunks)

        if estimated_time < 60:
            readable = f"{estimated_time:.1f} seconds"
        else:
            readable = f"{estimated_time/60:.1f} minutes"

        return {
            "processing_mode": "parallel" if self.enable_parallel and len(chunks) > 1 else "sequential",
            "chunk_info": chunk_info,
            "estimated_time_seconds": estimated_time,
            "estimated_time_readable": readable
        }
src/processors/parallel_processor.py ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Parallel audio processing for generating multiple audio chunks concurrently."""
2
+
3
+ import asyncio
4
+ import concurrent.futures
5
+ from typing import List, Tuple, Optional, Callable
6
+ import numpy as np
7
+ import gradio as gr
8
+
9
+
10
class ParallelAudioProcessor:
    """Handles parallel processing of multiple audio chunks."""

    def __init__(self, max_workers: int = 4):
        """
        Initialize the parallel processor.

        Args:
            max_workers: Maximum number of concurrent workers for audio generation
        """
        self.max_workers = max_workers

    def process_chunks_parallel(
        self,
        text_chunks: List[str],
        audio_generator_func: Callable,
        progress_callback: Optional[Callable] = None
    ) -> List[Tuple[int, np.ndarray]]:
        """
        Process multiple text chunks in a thread pool to generate audio.

        Args:
            text_chunks: List of text chunks to process
            audio_generator_func: Called as
                ``audio_generator_func(text, None, progress=callback)``
            progress_callback: Optional callable invoked as
                ``progress_callback(fraction, desc=...)``

        Returns:
            One result per chunk, in the same order as ``text_chunks``

        Raises:
            gr.Error: If any chunk fails or yields ``None``.
        """
        if not text_chunks:
            return []

        total_chunks = len(text_chunks)
        completed_chunks = 0
        results = [None] * total_chunks

        def update_progress(desc: str = ""):
            # Called from worker threads; only reads the completed counter.
            if progress_callback:
                suffix = f": {desc}" if desc else ""
                progress_callback(
                    completed_chunks / total_chunks,
                    desc=f"Processing chunk {completed_chunks + 1}/{total_chunks}{suffix}"
                )

        def process_single_chunk(chunk_index: int, text_chunk: str):
            """Generate audio for one chunk, forwarding its progress messages."""
            def chunk_progress(progress: float, desc: str = ""):
                update_progress(f"Chunk {chunk_index + 1}: {desc}")

            return audio_generator_func(text_chunk, None, progress=chunk_progress)

        with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            future_to_index = {
                executor.submit(process_single_chunk, i, chunk): i
                for i, chunk in enumerate(text_chunks)
            }

            # Collect results as they complete; the index restores input order.
            for future in concurrent.futures.as_completed(future_to_index):
                chunk_index = future_to_index[future]
                try:
                    results[chunk_index] = future.result()
                except Exception as e:
                    # Don't start chunks that are still queued: the result is
                    # unusable once one chunk has failed.
                    for pending in future_to_index:
                        pending.cancel()
                    raise gr.Error(f"Failed to process chunk {chunk_index + 1}: {str(e)}") from e

                completed_chunks += 1
                if progress_callback:
                    progress_callback(
                        completed_chunks / total_chunks,
                        desc=f"Completed {completed_chunks}/{total_chunks} audio chunks"
                    )

        # A generator that returned None leaves a hole; treat it as a failure.
        valid_results = [result for result in results if result is not None]
        if len(valid_results) != total_chunks:
            raise gr.Error(f"Only {len(valid_results)} out of {total_chunks} chunks processed successfully")

        return valid_results

    async def process_chunks_async(
        self,
        text_chunks: List[str],
        audio_generator_func: Callable,
        progress_callback: Optional[Callable] = None
    ) -> List[Tuple[int, np.ndarray]]:
        """
        Async version of parallel chunk processing.

        Each chunk runs in the event loop's default executor.

        Args:
            text_chunks: List of text chunks to process
            audio_generator_func: Called as
                ``audio_generator_func(text, None, progress=callback)``
            progress_callback: Optional callable invoked as
                ``progress_callback(fraction, desc=...)``

        Returns:
            One result per chunk, in the same order as ``text_chunks``

        Raises:
            gr.Error: If any chunk fails.
        """
        if not text_chunks:
            return []

        total_chunks = len(text_chunks)
        loop = asyncio.get_running_loop()

        async def process_chunk_async(chunk_index: int, text_chunk: str):
            """Process a single chunk in the default executor."""
            def chunk_progress(progress: float, desc: str = ""):
                if progress_callback:
                    progress_callback(
                        (chunk_index + progress) / total_chunks,
                        desc=f"Chunk {chunk_index + 1}: {desc}"
                    )

            return await loop.run_in_executor(
                None,
                lambda: audio_generator_func(text_chunk, None, progress=chunk_progress)
            )

        try:
            # gather() returns results in the order the awaitables were given,
            # so no re-sorting by chunk index is needed.
            results = await asyncio.gather(
                *(process_chunk_async(i, chunk) for i, chunk in enumerate(text_chunks))
            )
        except Exception as e:
            raise gr.Error(f"Error in async processing: {str(e)}") from e

        return list(results)

    def estimate_processing_time(self, text_chunks: List[str], avg_time_per_char: float = 0.1) -> float:
        """
        Estimate total processing time for all chunks.

        The sequential estimate (characters x time per character) is divided
        by the number of chunks that can run at once, i.e.
        ``min(len(text_chunks), max_workers)``.

        Args:
            text_chunks: List of text chunks
            avg_time_per_char: Average processing time per character (seconds)

        Returns:
            Estimated processing time in seconds
        """
        total_chars = sum(len(chunk) for chunk in text_chunks)
        sequential_time = total_chars * avg_time_per_char

        # At least 1, so an empty list or max_workers <= 0 cannot divide by zero.
        concurrency = max(1, min(len(text_chunks), self.max_workers))

        return sequential_time / concurrency
src/processors/pdf_processor.py CHANGED
@@ -35,24 +35,24 @@ class PDFProcessor:
35
 
36
  # Show explanations immediately, update status for audio loading
37
  yield extracted_text, gr.update(value="Generating audio..."), explanations, None, gr.update(visible=False)
38
-
39
- # Step 3: Generate audio
40
  try:
41
- from .generate_tts_audio import generate_tts_audio
42
 
43
- # Clean up the text for better TTS
44
- clean_text = explanations.strip()
 
 
 
 
 
45
 
46
- # Limit text length for TTS (assuming 1000 character limit)
47
- if len(clean_text) > 1000:
48
- sentences = clean_text[:950].split('.')
49
- if len(sentences) > 1:
50
- clean_text = '.'.join(sentences[:-1]) + '.'
51
- else:
52
- clean_text = clean_text[:950]
53
- clean_text += " [Text has been truncated for audio generation]"
54
 
55
- audio_result = generate_tts_audio(clean_text, None)
 
56
 
57
  # Show everything, update status to complete
58
  yield extracted_text, gr.update(value="All steps complete!"), explanations, audio_result, gr.update(visible=True)
 
35
 
36
  # Show explanations immediately, update status for audio loading
37
  yield extracted_text, gr.update(value="Generating audio..."), explanations, None, gr.update(visible=False)
38
+ # Step 3: Generate audio
 
39
  try:
40
+ from .audio_processor import AudioProcessor
41
 
42
+ # Create audio processor with parallel processing enabled
43
+ audio_processor = AudioProcessor(
44
+ max_chunk_size=800,
45
+ max_workers=4,
46
+ silence_duration=0.5,
47
+ enable_parallel=True
48
+ )
49
 
50
+ # Generate progress callback for audio processing
51
+ def audio_progress(progress, desc=""):
52
+ yield extracted_text, gr.update(value=f"Generating audio: {desc}"), explanations, None, gr.update(visible=False)
 
 
 
 
 
53
 
54
+ # Generate audio using the new parallel processor
55
+ audio_result, _ = audio_processor.generate_audio(explanations, progress=audio_progress)
56
 
57
  # Show everything, update status to complete
58
  yield extracted_text, gr.update(value="All steps complete!"), explanations, audio_result, gr.update(visible=True)
src/processors/text_chunker.py ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Text chunking utility for breaking down large text into smaller chunks for audio processing."""
2
+
3
+ import re
4
+ from typing import List
5
+
6
+
7
class TextChunker:
    """Split long text into chunks of at most ``max_chunk_size`` characters.

    Paragraph boundaries are preferred, then sentence boundaries, then word
    boundaries; a run with no spaces is cut at the size limit.
    """

    # Blank-line paragraph separator (pattern kept exactly as before).
    _PARAGRAPH_BREAK = re.compile(r'\n\s*\n|(?:\n\s*){2,}')
    # Whitespace that follows sentence-ending punctuation.
    _SENTENCE_BREAK = re.compile(r'(?<=[.!?])\s+')
    # Separator used when merging paragraphs into one chunk.
    _PARAGRAPH_JOIN = "\n\n"

    def __init__(self, max_chunk_size: int = 800, overlap_sentences: int = 0):
        """
        Initialize the text chunker.

        Args:
            max_chunk_size: Maximum number of characters per chunk
            overlap_sentences: Number of sentences to overlap between chunks for continuity

        Raises:
            ValueError: If ``max_chunk_size`` is not positive (force-splitting
                a long sentence could otherwise never make progress).
        """
        if max_chunk_size <= 0:
            raise ValueError("max_chunk_size must be a positive integer")
        self.max_chunk_size = max_chunk_size
        self.overlap_sentences = overlap_sentences

    def chunk_text(self, text: str) -> List[str]:
        """
        Break text into smaller chunks based on paragraphs and sentence boundaries.

        Args:
            text: The input text to chunk

        Returns:
            List of text chunks. Before overlap is applied, every chunk is at
            most ``max_chunk_size`` characters; overlap (if enabled) is
            prepended on top of that.
        """
        if not text or not text.strip():
            return []

        text = text.strip()
        limit = self.max_chunk_size

        if len(text) <= limit:
            return [text]

        chunks = []
        current_chunk = ""

        for paragraph in self._split_into_paragraphs(text):
            # Budget the real separator length: "\n\n" is two characters and
            # is only inserted when the chunk already has content.
            joiner = self._PARAGRAPH_JOIN if current_chunk else ""
            if len(current_chunk) + len(joiner) + len(paragraph) <= limit:
                current_chunk += joiner + paragraph
                continue

            # The paragraph does not fit: close the chunk in progress.
            if current_chunk.strip():
                chunks.append(current_chunk.strip())
                current_chunk = ""

            if len(paragraph) <= limit:
                current_chunk = paragraph
                continue

            # The paragraph alone is too long: fall back to sentence pieces,
            # merging neighbours with a single space while they fit.
            for piece in self._split_paragraph_into_sentences(paragraph):
                space = 1 if current_chunk else 0
                if len(current_chunk) + space + len(piece) > limit:
                    if current_chunk.strip():
                        chunks.append(current_chunk.strip())
                    current_chunk = piece
                elif current_chunk:
                    current_chunk += " " + piece
                else:
                    current_chunk = piece

        if current_chunk.strip():
            chunks.append(current_chunk.strip())

        if self.overlap_sentences > 0 and len(chunks) > 1:
            chunks = self._add_overlap(chunks)

        return chunks

    def _split_into_paragraphs(self, text: str) -> List[str]:
        """Split text on blank lines and return the non-empty, stripped paragraphs."""
        return [p.strip() for p in self._PARAGRAPH_BREAK.split(text) if p.strip()]

    def _split_paragraph_into_sentences(self, paragraph: str) -> List[str]:
        """Split a long paragraph into sentence-based chunks within the size limit.

        Sentences are packed together while they fit. A single sentence longer
        than the limit is cut at the last space inside the limit, or at the
        limit itself when it contains no usable space.
        """
        limit = self.max_chunk_size
        chunks = []
        current_chunk = ""

        for sentence in self._SENTENCE_BREAK.split(paragraph):
            if len(sentence) > limit:
                if current_chunk.strip():
                    chunks.append(current_chunk.strip())
                    current_chunk = ""

                while len(sentence) > limit:
                    cut = sentence.rfind(' ', 0, limit)
                    # No space (-1) or only a leading space (0): hard cut at
                    # the limit so the loop always makes progress.
                    if cut <= 0:
                        cut = limit
                    chunks.append(sentence[:cut].rstrip())
                    sentence = sentence[cut:].strip()

                # The remainder starts the next chunk.
                if sentence:
                    current_chunk = sentence

            elif len(current_chunk) + len(sentence) + 1 > limit:
                if current_chunk.strip():
                    chunks.append(current_chunk.strip())
                current_chunk = sentence
            elif current_chunk:
                current_chunk += " " + sentence
            else:
                current_chunk = sentence

        if current_chunk.strip():
            chunks.append(current_chunk.strip())

        return chunks

    def _add_overlap(self, chunks: List[str]) -> List[str]:
        """Prefix each chunk after the first with the last sentences of its predecessor.

        No overlap is added when the previous chunk has ``overlap_sentences``
        sentences or fewer. Overlapped chunks may exceed ``max_chunk_size``.
        """
        # With zero overlap, ``sentences[-0:]`` would be the whole chunk.
        if len(chunks) <= 1 or self.overlap_sentences <= 0:
            return chunks

        overlapped_chunks = [chunks[0]]  # First chunk stays the same

        for prev_chunk, current_chunk in zip(chunks, chunks[1:]):
            prev_sentences = self._SENTENCE_BREAK.split(prev_chunk)
            if len(prev_sentences) > self.overlap_sentences:
                overlap_text = " ".join(prev_sentences[-self.overlap_sentences:])
            else:
                overlap_text = ""

            if overlap_text:
                overlapped_chunks.append(overlap_text + " " + current_chunk)
            else:
                overlapped_chunks.append(current_chunk)

        return overlapped_chunks

    def get_chunk_info(self, chunks: List[str]) -> dict:
        """Return count and size statistics for ``chunks`` (all zeros when empty)."""
        sizes = [len(chunk) for chunk in chunks]
        total = sum(sizes)
        return {
            "total_chunks": len(chunks),
            "total_characters": total,
            "avg_chunk_size": total / len(chunks) if chunks else 0,
            "max_chunk_size": max(sizes) if chunks else 0,
            "min_chunk_size": min(sizes) if chunks else 0
        }
src/ui_components/interface.py CHANGED
@@ -41,14 +41,14 @@ def build_interface(process_pdf_fn):
41
  lines=15,
42
  placeholder="Explanations will be automatically generated after text extraction...",
43
  show_copy_button=True,
44
- interactive=False
45
- )
46
  gr.Markdown("### 🔊 Audio Generation")
47
  audio_output = gr.Audio(
48
  label="Generated Explanation Audio",
49
  interactive=False,
50
  visible=False
51
  )
 
52
  pdf_input.upload(
53
  fn=process_pdf_fn,
54
  inputs=[pdf_input],
 
41
  lines=15,
42
  placeholder="Explanations will be automatically generated after text extraction...",
43
  show_copy_button=True,
44
+ interactive=False )
 
45
  gr.Markdown("### 🔊 Audio Generation")
46
  audio_output = gr.Audio(
47
  label="Generated Explanation Audio",
48
  interactive=False,
49
  visible=False
50
  )
51
+
52
  pdf_input.upload(
53
  fn=process_pdf_fn,
54
  inputs=[pdf_input],