Added sentence-wise chunking

#4
by SherryT997 - opened
Files changed (1)
  1. app.py +36 -7
app.py CHANGED
@@ -6,7 +6,7 @@ from threading import Thread
 from typing import Optional
 
 import numpy as np
-import spaces
+import spaces, re
 import gradio as gr
 import torch
 
@@ -142,6 +142,21 @@ jenny_examples = [
 ]
 ]
 
+def split_with_punctuation(sentence):
+    """
+    Splits a text into sentences at '.', '!', '?', and language-specific full stops (e.g., '।') while preserving sentence boundaries and
+    accounting for punctuation inside quotes or other paired delimiters.
+    Supports multiple Indian languages by accounting for non-Latin scripts.
+    """
+    # Regular expression to capture sentences ending with sentence stop characters.
+    # Handles the standard punctuation marks (., !, ?, ।) and checks for sentence-ending whitespace or end of text.
+    pattern = r'([^.?!।“”‘’\(\)]*[.!?।]+(?=\s|$))'
+
+    # Find all sentences in the text
+    parts = re.findall(pattern, sentence, re.DOTALL)
+
+    # Clean up the results and return them
+    return [part.strip() for part in parts if part.strip()]
 
 def numpy_to_mp3(audio_array, sampling_rate):
     # Normalize audio_array if it's floating-point
@@ -168,6 +183,22 @@ def numpy_to_mp3(audio_array, sampling_rate):
 
     return mp3_bytes
 
+def process_text_in_chunks(text, chunk_size=20):
+    sentences = split_with_punctuation(text)
+
+    all_chunks = []
+
+    # Step 2: For each sentence, split into smaller chunks based on chunk_size (in words)
+    for sentence in sentences:
+        words = sentence.split()
+        sentence_chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
+
+        # Tokenize each chunk using the tokenizer
+        for chunk in sentence_chunks:
+            all_chunks.append(chunk)
+
+    return all_chunks
+
 sampling_rate = model.audio_encoder.config.sampling_rate
 frame_rate = model.audio_encoder.config.frame_rate
 
@@ -200,14 +231,13 @@ frame_rate = model.audio_encoder.config.frame_rate
 def generate_base(text, description, play_steps_in_s=2.0):
     # Initialize variables
     play_steps = int(frame_rate * play_steps_in_s)
-    chunk_size = 15 # Process 10 words at a time
+    chunk_size = 20 # Process 10 words at a time
 
     # Tokenize the full text and description
     inputs = description_tokenizer(description, return_tensors="pt").to(device)
 
     # Split text into chunks of approximately 10 words
-    words = text.split()
-    chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
+    chunks = process_text_in_chunks(text, chunk_size=chunk_size)
 
     all_audio = []
 
@@ -272,14 +302,13 @@ def generate_base(text, description, play_steps_in_s=2.0):
 def generate_jenny(text, description, play_steps_in_s=2.0):
     # Initialize variables
     play_steps = int(frame_rate * play_steps_in_s)
-    chunk_size = 15 # Process 10 words at a time
+    chunk_size = 20 # Process 10 words at a time
 
     # Tokenize the full text and description
     inputs = description_tokenizer(description, return_tensors="pt").to(device)
 
     # Split text into chunks of approximately 10 words
-    words = text.split()
-    chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
+    chunks = process_text_in_chunks(text, chunk_size=chunk_size)
 
     all_audio = []
 
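
For reference, a minimal standalone sketch of how the two helpers added in this PR behave together. The functions below are condensed but behaviorally equivalent versions of split_with_punctuation and process_text_in_chunks; the sample text, the chunk_size of 10, and the print call are illustrative only (the Space itself calls process_text_in_chunks with chunk_size=20 inside generate_base and generate_jenny):

import re

def split_with_punctuation(sentence):
    # Same pattern as in the PR: split on '.', '!', '?' and the Devanagari danda '।'
    pattern = r'([^.?!।“”‘’\(\)]*[.!?।]+(?=\s|$))'
    parts = re.findall(pattern, sentence, re.DOTALL)
    return [part.strip() for part in parts if part.strip()]

def process_text_in_chunks(text, chunk_size=20):
    # Split into sentences first, then cap each piece at chunk_size words
    all_chunks = []
    for sentence in split_with_punctuation(text):
        words = sentence.split()
        all_chunks += [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    return all_chunks

sample = "नमस्ते। How are you today? This is a long sentence that will be split into smaller word-level chunks once it exceeds the configured size."
print(process_text_in_chunks(sample, chunk_size=10))
# ['नमस्ते।', 'How are you today?',
#  'This is a long sentence that will be split into',
#  'smaller word-level chunks once it exceeds the configured size.']

One behavior worth keeping in mind: because the regular expression only captures runs that end in '.', '!', '?' or '।', any trailing text without a terminal punctuation mark is not matched by re.findall and is therefore left out of the returned chunks.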