Spaces: Running on Zero
Added sentence-wise chunking
#4 by SherryT997 - opened
app.py
CHANGED
@@ -6,7 +6,7 @@ from threading import Thread
|
|
6 |
from typing import Optional
|
7 |
|
8 |
import numpy as np
|
9 |
-
import spaces
|
10 |
import gradio as gr
|
11 |
import torch
|
12 |
|
@@ -142,6 +142,21 @@ jenny_examples = [
|
|
142 |
]
|
143 |
]
|
144 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
145 |
|
146 |
def numpy_to_mp3(audio_array, sampling_rate):
|
147 |
# Normalize audio_array if it's floating-point
|
@@ -168,6 +183,22 @@ def numpy_to_mp3(audio_array, sampling_rate):
|
|
168 |
|
169 |
return mp3_bytes
|
170 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
171 |
sampling_rate = model.audio_encoder.config.sampling_rate
|
172 |
frame_rate = model.audio_encoder.config.frame_rate
|
173 |
|
@@ -200,14 +231,13 @@ frame_rate = model.audio_encoder.config.frame_rate
|
|
200 |
def generate_base(text, description, play_steps_in_s=2.0):
|
201 |
# Initialize variables
|
202 |
play_steps = int(frame_rate * play_steps_in_s)
|
203 |
-
chunk_size =
|
204 |
|
205 |
# Tokenize the full text and description
|
206 |
inputs = description_tokenizer(description, return_tensors="pt").to(device)
|
207 |
|
208 |
# Split text into chunks of approximately 10 words
|
209 |
-
|
210 |
-
chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
|
211 |
|
212 |
all_audio = []
|
213 |
|
@@ -272,14 +302,13 @@ def generate_base(text, description, play_steps_in_s=2.0):
|
|
272 |
def generate_jenny(text, description, play_steps_in_s=2.0):
|
273 |
# Initialize variables
|
274 |
play_steps = int(frame_rate * play_steps_in_s)
|
275 |
-
chunk_size =
|
276 |
|
277 |
# Tokenize the full text and description
|
278 |
inputs = description_tokenizer(description, return_tensors="pt").to(device)
|
279 |
|
280 |
# Split text into chunks of approximately 10 words
|
281 |
-
|
282 |
-
chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
|
283 |
|
284 |
all_audio = []
|
285 |
|
|
|
6 |
from typing import Optional
|
7 |
|
8 |
import numpy as np
|
9 |
+
import spaces, re
|
10 |
import gradio as gr
|
11 |
import torch
|
12 |
|
|
|
142 |
]
|
143 |
]
|
144 |
|
145 |
+
def split_with_punctuation(sentence):
|
146 |
+
"""
|
147 |
+
Splits a text into sentences at '.', '!', '?', and language-specific full stops (e.g., '।') while preserving sentence boundaries and
|
148 |
+
accounting for punctuation inside quotes or other paired delimiters.
|
149 |
+
Supports multiple Indian languages by accounting for non-Latin scripts.
|
150 |
+
"""
|
151 |
+
# Regular expression to capture sentences ending with sentence stop characters.
|
152 |
+
# Handles the standard punctuation marks (., !, ?, ।) and checks for sentence-ending whitespace or end of text.
|
153 |
+
pattern = r'([^.?!।“”‘’\(\)]*[.!?।]+(?=\s|$))'
|
154 |
+
|
155 |
+
# Find all sentences in the text
|
156 |
+
parts = re.findall(pattern, sentence, re.DOTALL)
|
157 |
+
|
158 |
+
# Clean up the results and return them
|
159 |
+
return [part.strip() for part in parts if part.strip()]
|
160 |
|
161 |
def numpy_to_mp3(audio_array, sampling_rate):
|
162 |
# Normalize audio_array if it's floating-point
|
|
|
183 |
|
184 |
return mp3_bytes
|
185 |
|
186 |
+
def process_text_in_chunks(text, chunk_size=20):
|
187 |
+
sentences = split_with_punctuation(text)
|
188 |
+
|
189 |
+
all_chunks = []
|
190 |
+
|
191 |
+
# Step 2: For each sentence, split into smaller chunks based on chunk_size (in words)
|
192 |
+
for sentence in sentences:
|
193 |
+
words = sentence.split()
|
194 |
+
sentence_chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
|
195 |
+
|
196 |
+
# Tokenize each chunk using the tokenizer
|
197 |
+
for chunk in sentence_chunks:
|
198 |
+
all_chunks.append(chunk)
|
199 |
+
|
200 |
+
return all_chunks
|
201 |
+
|
202 |
sampling_rate = model.audio_encoder.config.sampling_rate
|
203 |
frame_rate = model.audio_encoder.config.frame_rate
|
204 |
|
|
|
231 |
def generate_base(text, description, play_steps_in_s=2.0):
|
232 |
# Initialize variables
|
233 |
play_steps = int(frame_rate * play_steps_in_s)
|
234 |
+
chunk_size = 20  # Process up to 20 words at a time
|
235 |
|
236 |
# Tokenize the full text and description
|
237 |
inputs = description_tokenizer(description, return_tensors="pt").to(device)
|
238 |
|
239 |
# Split text into sentence-aligned chunks of up to chunk_size words
|
240 |
+
chunks = process_text_in_chunks(text, chunk_size=chunk_size)
|
|
|
241 |
|
242 |
all_audio = []
|
243 |
|
|
|
302 |
def generate_jenny(text, description, play_steps_in_s=2.0):
|
303 |
# Initialize variables
|
304 |
play_steps = int(frame_rate * play_steps_in_s)
|
305 |
+
chunk_size = 20  # Process up to 20 words at a time
|
306 |
|
307 |
# Tokenize the full text and description
|
308 |
inputs = description_tokenizer(description, return_tensors="pt").to(device)
|
309 |
|
310 |
# Split text into sentence-aligned chunks of up to chunk_size words
|
311 |
+
chunks = process_text_in_chunks(text, chunk_size=chunk_size)
|
|
|
312 |
|
313 |
all_audio = []
|
314 |
|