Spaces: Running on Zero
Added sentence-wise chunking
#4 by SherryT997 - opened
app.py
CHANGED
@@ -6,7 +6,7 @@ from threading import Thread
|
|
6 |
from typing import Optional
|
7 |
|
8 |
import numpy as np
|
9 |
-
import spaces
|
10 |
import gradio as gr
|
11 |
import torch
|
12 |
|
@@ -142,6 +142,21 @@ jenny_examples = [
|
|
142 |
]
|
143 |
]
|
144 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
145 |
|
146 |
def numpy_to_mp3(audio_array, sampling_rate):
|
147 |
# Normalize audio_array if it's floating-point
|
@@ -168,6 +183,22 @@ def numpy_to_mp3(audio_array, sampling_rate):
|
|
168 |
|
169 |
return mp3_bytes
|
170 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
171 |
sampling_rate = model.audio_encoder.config.sampling_rate
|
172 |
frame_rate = model.audio_encoder.config.frame_rate
|
173 |
|
@@ -200,14 +231,13 @@ frame_rate = model.audio_encoder.config.frame_rate
|
|
200 |
def generate_base(text, description, play_steps_in_s=2.0):
|
201 |
# Initialize variables
|
202 |
play_steps = int(frame_rate * play_steps_in_s)
|
203 |
-
chunk_size =
|
204 |
|
205 |
# Tokenize the full text and description
|
206 |
inputs = description_tokenizer(description, return_tensors="pt").to(device)
|
207 |
|
208 |
# Split text into chunks of approximately 10 words
|
209 |
-
|
210 |
-
chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
|
211 |
|
212 |
all_audio = []
|
213 |
|
@@ -272,14 +302,13 @@ def generate_base(text, description, play_steps_in_s=2.0):
|
|
272 |
def generate_jenny(text, description, play_steps_in_s=2.0):
|
273 |
# Initialize variables
|
274 |
play_steps = int(frame_rate * play_steps_in_s)
|
275 |
-
chunk_size =
|
276 |
|
277 |
# Tokenize the full text and description
|
278 |
inputs = description_tokenizer(description, return_tensors="pt").to(device)
|
279 |
|
280 |
# Split text into chunks of approximately 10 words
|
281 |
-
|
282 |
-
chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
|
283 |
|
284 |
all_audio = []
|
285 |
|
|
|
6 |
from typing import Optional
|
7 |
|
8 |
import numpy as np
|
9 |
+
import spaces, re
|
10 |
import gradio as gr
|
11 |
import torch
|
12 |
|
|
|
142 |
]
|
143 |
]
|
144 |
|
145 |
+
def split_with_punctuation(sentence):
|
146 |
+
"""
|
147 |
+
Splits a text into sentences at '.', '!', '?', and language-specific full stops (e.g., '।') while preserving sentence boundaries and
|
148 |
+
accounting for punctuation inside quotes or other paired delimiters.
|
149 |
+
Supports multiple Indian languages by accounting for non-Latin scripts.
|
150 |
+
"""
|
151 |
+
# Regular expression to capture sentences ending with sentence stop characters.
|
152 |
+
# Handles the standard punctuation marks (., !, ?, ।) and checks for sentence-ending whitespace or end of text.
|
153 |
+
pattern = r'([^.?!।“”‘’\(\)]*[.!?।]+(?=\s|$))'
|
154 |
+
|
155 |
+
# Find all sentences in the text
|
156 |
+
parts = re.findall(pattern, sentence, re.DOTALL)
|
157 |
+
|
158 |
+
# Clean up the results and return them
|
159 |
+
return [part.strip() for part in parts if part.strip()]
|
160 |
|
161 |
def numpy_to_mp3(audio_array, sampling_rate):
|
162 |
# Normalize audio_array if it's floating-point
|
|
|
183 |
|
184 |
return mp3_bytes
|
185 |
|
186 |
+
def process_text_in_chunks(text, chunk_size=20):
|
187 |
+
sentences = split_with_punctuation(text)
|
188 |
+
|
189 |
+
all_chunks = []
|
190 |
+
|
191 |
+
# Step 2: For each sentence, split into smaller chunks based on chunk_size (in words)
|
192 |
+
for sentence in sentences:
|
193 |
+
words = sentence.split()
|
194 |
+
sentence_chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
|
195 |
+
|
196 |
+
# Tokenize each chunk using the tokenizer
|
197 |
+
for chunk in sentence_chunks:
|
198 |
+
all_chunks.append(chunk)
|
199 |
+
|
200 |
+
return all_chunks
|
201 |
+
|
202 |
sampling_rate = model.audio_encoder.config.sampling_rate
|
203 |
frame_rate = model.audio_encoder.config.frame_rate
|
204 |
|
|
|
231 |
def generate_base(text, description, play_steps_in_s=2.0):
|
232 |
# Initialize variables
|
233 |
play_steps = int(frame_rate * play_steps_in_s)
|
234 |
+
chunk_size = 20  # Process up to 20 words at a time
|
235 |
|
236 |
# Tokenize the full text and description
|
237 |
inputs = description_tokenizer(description, return_tensors="pt").to(device)
|
238 |
|
239 |
# Split text into sentence-aligned chunks of up to chunk_size words
|
240 |
+
chunks = process_text_in_chunks(text, chunk_size=chunk_size)
|
|
|
241 |
|
242 |
all_audio = []
|
243 |
|
|
|
302 |
def generate_jenny(text, description, play_steps_in_s=2.0):
|
303 |
# Initialize variables
|
304 |
play_steps = int(frame_rate * play_steps_in_s)
|
305 |
+
chunk_size = 20  # Process up to 20 words at a time
|
306 |
|
307 |
# Tokenize the full text and description
|
308 |
inputs = description_tokenizer(description, return_tensors="pt").to(device)
|
309 |
|
310 |
# Split text into sentence-aligned chunks of up to chunk_size words
|
311 |
+
chunks = process_text_in_chunks(text, chunk_size=chunk_size)
|
|
|
312 |
|
313 |
all_audio = []
|
314 |
|