akshathmangudi committed
Commit 9470ace · verified · 1 Parent(s): c8a00e4

Upload 4 files

Files changed (4)
  1. app.py +64 -0
  2. indian_voices.json +38 -0
  3. requirements.txt +133 -0
  4. tts_utils.py +34 -0
app.py ADDED
@@ -0,0 +1,64 @@
+ # app.py
+ import streamlit as st
+ import json
+ import soundfile as sf
+ import io
+ from tts_utils import load_model, generate_speech
+
+ import torch
+ torch.backends.cudnn.benchmark = True  # Enable cuDNN autotuning for performance
+ torch.cuda.empty_cache()  # Clear CUDA cache to prevent memory issues
+
+ # Cache model loading for performance
+ @st.cache_resource
+ def cached_load_model():
+     return load_model()
+
+ # Load voice database from JSON
+ def load_voice_database():
+     with open('indian_voices.json') as f:
+         voices = json.load(f)
+     return {voice['name']: voice for voice in voices}
+
+ INDIAN_FEMALE_VOICES = load_voice_database()
+
+ # Streamlit UI
+ st.title("🎙️ Indian English TTS Voice Selector")
+ text_input = st.text_area("Enter your text:", "Hey, how are you doing today?")
+
+ # Voice selection with warnings
+ selected_voice = st.selectbox(
+     "Choose voice:",
+     options=list(INDIAN_FEMALE_VOICES.keys()),
+     format_func=lambda x: f"{x} 🔊"  # Add speaker icon
+ )
+
+ # Show voice description and warning
+ voice_info = INDIAN_FEMALE_VOICES[selected_voice]
+ st.caption(f"**Description:** {voice_info['description']}")
+ if voice_info['warning']:
+     st.warning(f"⚠️ Note: {voice_info['warning']}")
+
+ if st.button("Generate Speech", type="primary"):
+     model, tokenizer, desc_tokenizer = cached_load_model()
+
+     with st.spinner(f"Generating {selected_voice}'s voice..."):
+         try:
+             audio_array = generate_speech(
+                 text_input,
+                 voice_info['prompt'],
+                 model,
+                 tokenizer,
+                 desc_tokenizer
+             )
+
+             # Audio playback at the model's native sampling rate
+             buffer = io.BytesIO()
+             sf.write(buffer, audio_array, model.config.sampling_rate, format="WAV")
+             buffer.seek(0)
+
+             st.audio(buffer, format="audio/wav")
+             st.success("Audio generated successfully!")
+
+         except RuntimeError as e:
+             st.error(f"GPU Error: {str(e)}. Try a different voice or shorter text.")
indian_voices.json ADDED
@@ -0,0 +1,38 @@
+ [
+   {
+     "name": "Swapna",
+     "description": "Clear and expressive Indian English accent",
+     "warning": null,
+     "prompt": "Swapna's voice with a female Indian English accent, clear and expressive."
+   },
+   {
+     "name": "Meera",
+     "description": "Warm tone with moderate pace",
+     "warning": "Might be slower on older GPUs",
+     "prompt": "Meera's voice with a warm Indian English tone, moderate pace and natural pitch."
+   },
+   {
+     "name": "Sneha",
+     "description": "High-pitched with clear articulation",
+     "warning": "Best for short sentences",
+     "prompt": "Sneha's voice with a slightly high-pitched Indian English accent, clear articulation."
+   },
+   {
+     "name": "Priya",
+     "description": "Neutral expressivity",
+     "warning": null,
+     "prompt": "Priya's voice in Indian English, balanced tone with neutral expressivity."
+   },
+   {
+     "name": "Gauri",
+     "description": "Deep voice with calm delivery",
+     "warning": "Requires more VRAM",
+     "prompt": "Gauri's voice with a deep Indian English accent, calm and measured delivery."
+   },
+   {
+     "name": "Nisha",
+     "description": "Animated and expressive",
+     "warning": "May take longer to generate",
+     "prompt": "Nisha's voice in Indian English, animated and expressive with moderate speed."
+   }
+ ]
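app.py reads four keys from every entry in this file (name, description, warning, and prompt), so any new voice added here needs all four, with warning set to null when there is nothing to flag. A minimal standalone check of that contract is sketched below; the script name and messages are illustrative and not part of this commit.

# check_voices.py (illustrative schema check, not part of the commit)
import json

REQUIRED_KEYS = {"name", "description", "warning", "prompt"}

with open("indian_voices.json") as f:
    voices = json.load(f)

for i, voice in enumerate(voices):
    missing = REQUIRED_KEYS - voice.keys()
    if missing:
        raise ValueError(f"Voice entry {i} ({voice.get('name', '?')}) is missing: {sorted(missing)}")

print(f"OK: {len(voices)} voice entries all carry the required keys")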
requirements.txt ADDED
@@ -0,0 +1,133 @@
+ absl-py==2.3.0
+ altair==5.5.0
+ argbind==0.3.9
+ asttokens==3.0.0
+ attrs==25.3.0
+ audioread==3.0.1
+ blinker==1.9.0
+ cachetools==5.5.2
+ certifi==2025.4.26
+ cffi==1.17.1
+ charset-normalizer==3.4.2
+ click==8.2.1
+ contourpy==1.3.2
+ cycler==0.12.1
+ decorator==5.2.1
+ descript-audio-codec==1.0.0
+ descript-audiotools @ git+https://github.com/descriptinc/audiotools@348ebf2034ce24e2a91a553e3171cb00c0c71678
+ docstring_parser==0.16
+ einops==0.8.1
+ exceptiongroup==1.3.0
+ executing==2.2.0
+ ffmpy==0.6.0
+ filelock==3.18.0
+ fire==0.7.0
+ flatten-dict==0.4.2
+ fonttools==4.58.2
+ fsspec==2025.5.1
+ future==1.0.0
+ gitdb==4.0.12
+ GitPython==3.1.44
+ grpcio==1.73.0
+ hf-xet==1.1.3
+ huggingface-hub==0.33.0
+ idna==3.10
+ importlib_resources==6.5.2
+ ipython==8.37.0
+ jedi==0.19.2
+ Jinja2==3.1.6
+ joblib==1.5.1
+ jsonschema==4.24.0
+ jsonschema-specifications==2025.4.1
+ julius==0.2.7
+ kiwisolver==1.4.8
+ lazy_loader==0.4
+ librosa==0.11.0
+ llvmlite==0.44.0
+ Markdown==3.8
+ markdown-it-py==3.0.0
+ markdown2==2.5.3
+ MarkupSafe==3.0.2
+ matplotlib==3.10.3
+ matplotlib-inline==0.1.7
+ mdurl==0.1.2
+ mpmath==1.3.0
+ msgpack==1.1.0
+ narwhals==1.42.0
+ networkx==3.4.2
+ numba==0.61.2
+ numpy==2.2.6
+ nvidia-cublas-cu12==12.6.4.1
+ nvidia-cuda-cupti-cu12==12.6.80
+ nvidia-cuda-nvrtc-cu12==12.6.77
+ nvidia-cuda-runtime-cu12==12.6.77
+ nvidia-cudnn-cu12==9.5.1.17
+ nvidia-cufft-cu12==11.3.0.4
+ nvidia-cufile-cu12==1.11.1.6
+ nvidia-curand-cu12==10.3.7.77
+ nvidia-cusolver-cu12==11.7.1.2
+ nvidia-cusparse-cu12==12.5.4.2
+ nvidia-cusparselt-cu12==0.6.3
+ nvidia-nccl-cu12==2.26.2
+ nvidia-nvjitlink-cu12==12.6.85
+ nvidia-nvtx-cu12==12.6.77
+ packaging==24.2
+ pandas==2.3.0
+ parler_tts @ git+https://github.com/huggingface/parler-tts.git@d108732cd57788ec86bc857d99a6cabd66663d68
+ parso==0.8.4
+ pexpect==4.9.0
+ pillow==11.2.1
+ platformdirs==4.3.8
+ pooch==1.8.2
+ prompt_toolkit==3.0.51
+ protobuf==4.25.8
+ ptyprocess==0.7.0
+ pure_eval==0.2.3
+ pyarrow==20.0.0
+ pycparser==2.22
+ pydeck==0.9.1
+ Pygments==2.19.1
+ pyloudnorm==0.1.1
+ pyparsing==3.2.3
+ pystoi==0.4.1
+ python-dateutil==2.9.0.post0
+ pytz==2025.2
+ PyYAML==6.0.2
+ randomname==0.2.1
+ referencing==0.36.2
+ regex==2024.11.6
+ requests==2.32.4
+ rich==14.0.0
+ rpds-py==0.25.1
+ safetensors==0.5.3
+ scikit-learn==1.7.0
+ scipy==1.15.3
+ sentencepiece==0.2.0
+ six==1.17.0
+ smmap==5.0.2
+ soundfile==0.13.1
+ soxr==0.5.0.post1
+ stack-data==0.6.3
+ streamlit==1.45.1
+ sympy==1.14.0
+ tenacity==9.1.2
+ tensorboard==2.19.0
+ tensorboard-data-server==0.7.2
+ termcolor==3.1.0
+ threadpoolctl==3.6.0
+ tokenizers==0.20.3
+ toml==0.10.2
+ torch==2.7.1
+ torch-stoi==0.2.3
+ torchaudio==2.7.1
+ tornado==6.5.1
+ tqdm==4.67.1
+ traitlets==5.14.3
+ transformers==4.46.1
+ triton==3.3.1
+ typing_extensions==4.14.0
+ tzdata==2025.2
+ urllib3==2.4.0
+ watchdog==6.0.0
+ wcwidth==0.2.13
+ Werkzeug==3.1.3
tts_utils.py ADDED
@@ -0,0 +1,34 @@
+ # tts_utils.py
+ import torch
+ from parler_tts import ParlerTTSForConditionalGeneration
+ from transformers import AutoTokenizer
+
+ def load_model():
+     model = ParlerTTSForConditionalGeneration.from_pretrained(
+         "ai4bharat/indic-parler-tts",
+         torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32  # half precision on GPU, full precision on CPU
+     )
+     tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-parler-tts")  # tokenizer for the text to be spoken
+     description_tokenizer = AutoTokenizer.from_pretrained(model.config.text_encoder._name_or_path)  # description tokenizer follows the text encoder, per the model card
+     return model, tokenizer, description_tokenizer
+
+ def generate_speech(text, voice_prompt, model, tokenizer, description_tokenizer):
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+     model = model.to(device)
+
+     description_input_ids = description_tokenizer(
+         voice_prompt,
+         return_tensors="pt"
+     ).to(device)
+
+     prompt_input_ids = tokenizer(text, return_tensors="pt").to(device)
+
+     generation = model.generate(
+         input_ids=description_input_ids.input_ids,
+         attention_mask=description_input_ids.attention_mask,
+         prompt_input_ids=prompt_input_ids.input_ids,
+         prompt_attention_mask=prompt_input_ids.attention_mask,
+         max_new_tokens=1024
+     )
+
+     return generation.to(torch.float32).cpu().numpy().squeeze()  # cast to float32 so audio writers such as soundfile accept the array
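The two helpers above can be exercised without the Streamlit front end, which is handy when debugging GPU or tokenizer issues. The snippet below is an assumed smoke test rather than part of the commit: the spoken text and output filename are arbitrary, the voice prompt is taken from indian_voices.json, and the WAV is written at model.config.sampling_rate, the checkpoint's native rate.

# smoke_test.py (illustrative, not part of the commit)
import json
import soundfile as sf
from tts_utils import load_model, generate_speech

# Same loading path that app.py caches with st.cache_resource
model, tokenizer, description_tokenizer = load_model()

# Use the first voice prompt from the shipped voice database
with open("indian_voices.json") as f:
    voice = json.load(f)[0]

audio = generate_speech(
    "Hello, this is a quick smoke test.",
    voice["prompt"],
    model,
    tokenizer,
    description_tokenizer,
)

# Write the result at the model's native sampling rate
sf.write("smoke_test.wav", audio, model.config.sampling_rate)
print(f"Wrote smoke_test.wav, {audio.shape[0] / model.config.sampling_rate:.1f} seconds of audio")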