Spaces:

akshathmangudi
/

indic-tts

Running

App Files Files Community

akshathmangudi committed on 3 days ago

Commit

9470ace

verified ·

1 Parent(s): c8a00e4

Upload 4 files

Browse files

Files changed (4) hide show

app.py +64 -0
indian_voices.json +38 -0
requirements.txt +133 -0
tts_utils.py +34 -0

app.py ADDED Viewed

	@@ -0,0 +1,64 @@

+# app.py
+import streamlit as st
+import json
+import soundfile as sf
+import io
+from tts_utils import load_model, generate_speech
+import torch
+# Turn on PyTorch's cuDNN benchmark flag so cuDNN may auto-select convolution algorithms.
+torch.backends.cudnn.benchmark = True  # Enable cuDNN benchmark for performance
+# Ask PyTorch to release cached GPU memory it is not currently using.
+# NOTE(review): this is a module-level statement, so it runs every time this script is executed — confirm that is intended.
+torch.cuda.empty_cache()  # Clear CUDA cache to prevent memory issues
+# Cache model loading for performance
+@st.cache_resource
+def cached_load_model():
+    return load_model()
+# Load voice database from JSON
+def load_voice_database():
+    with open('indian_voices.json') as f:
+        voices = json.load(f)
+    return {voice['name']: voice for voice in voices}
+INDIAN_FEMALE_VOICES = load_voice_database()
+# Streamlit UI
+st.title("🎙️ Indian English TTS Voice Selector")
+text_input = st.text_area("Enter your text:", "Hey, how are you doing today?")
+# Voice selection with warnings
+selected_voice = st.selectbox(
+    "Choose voice:",
+    options=list(INDIAN_FEMALE_VOICES.keys()),
+    format_func=lambda x: f"{x} 🔊"  # Add speaker icon
+)
+# Show voice description and warning
+voice_info = INDIAN_FEMALE_VOICES[selected_voice]
+st.caption(f"**Description:** {voice_info['description']}")
+if voice_info['warning']:
+    st.warning(f"⚠️ Note: {voice_info['warning']}")
+if st.button("Generate Speech", type="primary"):
+    model, tokenizer, desc_tokenizer = cached_load_model()
+    with st.spinner(f"Generating {selected_voice}'s voice..."):
+        try:
+            audio_array = generate_speech(
+                text_input,
+                voice_info['prompt'],
+                model,
+                tokenizer,
+                desc_tokenizer
+            )
+            # Audio playback
+            buffer = io.BytesIO()
+            sf.write(buffer, audio_array, 16000, format="WAV")
+            buffer.seek(0)
+            st.audio(buffer, format="audio/wav")
+            st.success("Audio generated successfully!")
+        except RuntimeError as e:
+            st.error(f"GPU Error: {str(e)}. Try a different voice or shorter text.")

indian_voices.json ADDED Viewed

	@@ -0,0 +1,38 @@

+[
+    {
+        "name": "Swapna",
+        "description": "Clear and expressive Indian English accent",
+        "warning": null,
+        "prompt": "Swapna's voice with a female Indian English accent, clear and expressive."
+    },
+    {
+        "name": "Meera",
+        "description": "Warm tone with moderate pace",
+        "warning": "Might be slower on older GPUs",
+        "prompt": "Meera's voice with a warm Indian English tone, moderate pace and natural pitch."
+    },
+    {
+        "name": "Sneha",
+        "description": "High-pitched with clear articulation",
+        "warning": "Best for short sentences",
+        "prompt": "Sneha's voice with a slightly high-pitched Indian English accent, clear articulation."
+    },
+    {
+        "name": "Priya",
+        "description": "Neutral expressivity",
+        "warning": null,
+        "prompt": "Priya's voice in Indian English, balanced tone with neutral expressivity."
+    },
+    {
+        "name": "Gauri",
+        "description": "Deep voice with calm delivery",
+        "warning": "Requires more VRAM",
+        "prompt": "Gauri's voice with a deep Indian English accent, calm and measured delivery."
+    },
+    {
+        "name": "Nisha",
+        "description": "Animated and expressive",
+        "warning": "May take longer to generate",
+        "prompt": "Nisha's voice in Indian English, animated and expressive with moderate speed."
+    }
+]

requirements.txt ADDED Viewed

	@@ -0,0 +1,133 @@

+absl-py==2.3.0
+altair==5.5.0
+argbind==0.3.9
+asttokens==3.0.0
+attrs==25.3.0
+audioread==3.0.1
+blinker==1.9.0
+cachetools==5.5.2
+certifi==2025.4.26
+cffi==1.17.1
+charset-normalizer==3.4.2
+click==8.2.1
+contourpy==1.3.2
+cycler==0.12.1
+decorator==5.2.1
+descript-audio-codec==1.0.0
+descript-audiotools @ git+https://github.com/descriptinc/audiotools@348ebf2034ce24e2a91a553e3171cb00c0c71678
+docstring_parser==0.16
+einops==0.8.1
+exceptiongroup==1.3.0
+executing==2.2.0
+ffmpy==0.6.0
+filelock==3.18.0
+fire==0.7.0
+flatten-dict==0.4.2
+fonttools==4.58.2
+fsspec==2025.5.1
+future==1.0.0
+gitdb==4.0.12
+GitPython==3.1.44
+grpcio==1.73.0
+hf-xet==1.1.3
+huggingface-hub==0.33.0
+idna==3.10
+importlib_resources==6.5.2
+ipython==8.37.0
+jedi==0.19.2
+Jinja2==3.1.6
+joblib==1.5.1
+jsonschema==4.24.0
+jsonschema-specifications==2025.4.1
+julius==0.2.7
+kiwisolver==1.4.8
+lazy_loader==0.4
+librosa==0.11.0
+llvmlite==0.44.0
+Markdown==3.8
+markdown-it-py==3.0.0
+markdown2==2.5.3
+MarkupSafe==3.0.2
+matplotlib==3.10.3
+matplotlib-inline==0.1.7
+mdurl==0.1.2
+mpmath==1.3.0
+msgpack==1.1.0
+narwhals==1.42.0
+networkx==3.4.2
+numba==0.61.2
+numpy==2.2.6
+nvidia-cublas-cu12==12.6.4.1
+nvidia-cuda-cupti-cu12==12.6.80
+nvidia-cuda-nvrtc-cu12==12.6.77
+nvidia-cuda-runtime-cu12==12.6.77
+nvidia-cudnn-cu12==9.5.1.17
+nvidia-cufft-cu12==11.3.0.4
+nvidia-cufile-cu12==1.11.1.6
+nvidia-curand-cu12==10.3.7.77
+nvidia-cusolver-cu12==11.7.1.2
+nvidia-cusparse-cu12==12.5.4.2
+nvidia-cusparselt-cu12==0.6.3
+nvidia-nccl-cu12==2.26.2
+nvidia-nvjitlink-cu12==12.6.85
+nvidia-nvtx-cu12==12.6.77
+packaging==24.2
+pandas==2.3.0
+parler_tts @ git+https://github.com/huggingface/parler-tts.git@d108732cd57788ec86bc857d99a6cabd66663d68
+parso==0.8.4
+pexpect==4.9.0
+pillow==11.2.1
+platformdirs==4.3.8
+pooch==1.8.2
+prompt_toolkit==3.0.51
+protobuf==4.25.8
+ptyprocess==0.7.0
+pure_eval==0.2.3
+pyarrow==20.0.0
+pycparser==2.22
+pydeck==0.9.1
+Pygments==2.19.1
+pyloudnorm==0.1.1
+pyparsing==3.2.3
+pystoi==0.4.1
+python-dateutil==2.9.0.post0
+pytz==2025.2
+PyYAML==6.0.2
+randomname==0.2.1
+referencing==0.36.2
+regex==2024.11.6
+requests==2.32.4
+rich==14.0.0
+rpds-py==0.25.1
+safetensors==0.5.3
+scikit-learn==1.7.0
+scipy==1.15.3
+sentencepiece==0.2.0
+six==1.17.0
+smmap==5.0.2
+soundfile==0.13.1
+soxr==0.5.0.post1
+stack-data==0.6.3
+streamlit==1.45.1
+sympy==1.14.0
+tenacity==9.1.2
+tensorboard==2.19.0
+tensorboard-data-server==0.7.2
+termcolor==3.1.0
+threadpoolctl==3.6.0
+tokenizers==0.20.3
+toml==0.10.2
+torch==2.7.1
+torch-stoi==0.2.3
+torchaudio==2.7.1
+tornado==6.5.1
+tqdm==4.67.1
+traitlets==5.14.3
+transformers==4.46.1
+triton==3.3.1
+typing_extensions==4.14.0
+tzdata==2025.2
+urllib3==2.4.0
+watchdog==6.0.0
+wcwidth==0.2.13
+Werkzeug==3.1.3

tts_utils.py ADDED Viewed

	@@ -0,0 +1,34 @@

+# tts_utils.py
+import torch
+from parler_tts import ParlerTTSForConditionalGeneration
+from transformers import AutoTokenizer
+def load_model():
+    model = ParlerTTSForConditionalGeneration.from_pretrained(
+        "ai4bharat/indic-parler-tts",
+        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
+    )
+    tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-parler-tts")
+    description_tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-parler-tts")
+    return model, tokenizer, description_tokenizer
+def generate_speech(text, voice_prompt, model, tokenizer, description_tokenizer):
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    model = model.to(device)
+    description_input_ids = description_tokenizer(
+        voice_prompt,
+        return_tensors="pt"
+    ).to(device)
+    prompt_input_ids = tokenizer(text, return_tensors="pt").to(device)
+    generation = model.generate(
+        input_ids=description_input_ids.input_ids,
+        attention_mask=description_input_ids.attention_mask,
+        prompt_input_ids=prompt_input_ids.input_ids,
+        prompt_attention_mask=prompt_input_ids.attention_mask,
+        max_new_tokens=1024
+    )
+    return generation.cpu().numpy().squeeze()