ruslanmv committed on
Commit
0a6371e
·
1 Parent(s): 2ee6bbe

First commit

Browse files
app.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import numpy as np
3
+ import torch
4
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
5
+ from io import BytesIO
6
+ import soundfile as sf
7
+
8
+ # Load models outside of function calls for efficiency
9
def load_models():
    """Fetch the pretrained SpeechT5 checkpoints used by the app.

    Returns:
        A ``(model, processor, vocoder)`` tuple: the
        ``SpeechT5ForTextToSpeech`` model and ``SpeechT5Processor`` loaded
        from ``microsoft/speecht5_tts``, followed by the ``SpeechT5HifiGan``
        vocoder loaded from ``microsoft/speecht5_hifigan``.
    """
    # The model and its processor come from the same checkpoint.
    tts_checkpoint = "microsoft/speecht5_tts"
    tts_model = SpeechT5ForTextToSpeech.from_pretrained(tts_checkpoint)
    text_processor = SpeechT5Processor.from_pretrained(tts_checkpoint)
    hifigan = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    return tts_model, text_processor, hifigan
14
+
15
+ model, processor, vocoder = load_models()
16
+
17
+ # Load speaker embeddings
18
def get_speaker_embeddings(path="cmu_us_clb_arctic-wav-arctic_a0144.npy"):
    """Load a speaker embedding from a ``.npy`` file as a batched tensor.

    Args:
        path: Location of the NumPy file holding the embedding. Defaults to
            the file this function has always loaded, so existing
            zero-argument calls behave exactly as before. Passing another
            path selects a different voice without editing the function.

    Returns:
        A ``torch.Tensor`` built from the loaded array with one extra
        leading dimension of size 1.

    Raises:
        FileNotFoundError: If ``path`` does not exist (raised by ``np.load``).
    """
    embedding = np.load(path)
    # unsqueeze(0) prepends the size-1 leading axis.
    return torch.tensor(embedding).unsqueeze(0)
21
+
22
+ speaker_embeddings = get_speaker_embeddings()
23
+
24
+ # Function to convert text to speech
25
def text_to_speech(text, max_length=100):
    """Synthesize speech for ``text`` with the module-level SpeechT5 objects.

    The text is split into segments of at most ``max_length`` characters,
    each segment is synthesized separately, and the resulting waveforms are
    joined end to end.

    Args:
        text: The text to speak.
        max_length: Maximum number of characters per segment. Defaults to
            100, the limit the function previously hard-coded.

    Returns:
        ``(16000, waveform)`` on success, where ``waveform`` is a NumPy
        array, or ``(None, message)`` where ``message`` is a string that
        describes the failure. The function never raises; callers
        distinguish the two cases by the first element.
    """
    import textwrap  # stdlib; imported locally so the file's import block is untouched

    try:
        # Split on whitespace instead of at fixed character offsets so a word
        # is never cut in half between two segments (words longer than
        # max_length are still hard-split by textwrap).
        segments = textwrap.wrap(text, width=max_length)
        if not segments:
            # Empty or whitespace-only input: report it instead of returning
            # a zero-length waveform as if synthesis had succeeded.
            return None, "Error in text-to-speech conversion: no text to synthesize"

        waveforms = []
        with torch.no_grad():
            for segment in segments:
                inputs = processor(text=segment, return_tensors="pt")
                spectrogram = model.generate_speech(inputs["input_ids"], speaker_embeddings)
                waveforms.append(vocoder(spectrogram).numpy())

        # Join the per-segment arrays directly rather than going through a
        # Python list of individual samples.
        return 16000, np.concatenate(waveforms)  # sample rate, combined audio data
    except Exception as e:
        # UI boundary: convert any failure into the error tuple callers expect.
        return None, f"Error in text-to-speech conversion: {e}"
45
+
46
+ # Gradio Interface
47
def gradio_interface(text):
    """Run ``text_to_speech`` and shape its result for the Gradio callback.

    Args:
        text: The text entered by the user.

    Returns:
        ``(sample_rate, audio_data)`` when the sample rate is truthy and the
        audio is a NumPy array; ``None`` otherwise (for example when
        ``text_to_speech`` reported an error).
    """
    rate, audio = text_to_speech(text)
    succeeded = bool(rate) and isinstance(audio, np.ndarray)
    if not succeeded:
        return None  # the error message is intentionally not forwarded
    return rate, audio
53
+
54
# Build the UI object at module level so it is available to anything that
# imports this module.
interface = gr.Interface(
    fn=gradio_interface,
    title="Text to Voice",
    description=(
        "High Fidelity TTS. Visit <a href='https://ruslanmv.com/' "
        "target='_blank'>ruslanmv.com</a> for more information."
    ),
    inputs=gr.Textbox(lines=10, label="Enter text to convert to speech"),
    outputs=gr.Audio(label="Generated audio"),
)

# Start the server only when the file is executed as a script, so importing
# the module (tests, tooling) does not launch a web server as a side effect.
if __name__ == "__main__":
    interface.launch()
cmu_us_awb_arctic-wav-arctic_a0002.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5db7a684ab490f21cec1628e00d461a184e369fe4eafb1ee441a796faf4ab6ae
3
+ size 2176
cmu_us_bdl_arctic-wav-arctic_a0009.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:215326eae3a428af8934c385fbe043b36c72849ca17d1d013adeb189e6bd6962
3
+ size 2176
cmu_us_clb_arctic-wav-arctic_a0144.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf67b36c47edfb1851466a1dff081b436bc6809b5ebc12811d9df0c0d0f28d0e
3
+ size 2176
cmu_us_ksp_arctic-wav-arctic_b0087.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f6c5c2a38c2e400179019c560a74c4322f4ee13beda22ee601807545edee283e
3
+ size 2176
cmu_us_rms_arctic-wav-arctic_b0353.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a49dac3e9c3a71a4dbca4c364233c7915ae6e0cb71b2ceaed97296231b95cb50
3
+ size 2176
cmu_us_slt_arctic-wav-arctic_a0508.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f71ffadda3f3a4de079740a0b34963824dc644d9d5442283bd0a2b0d4f44ff0b
3
+ size 2176
notebook.ipynb ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 5,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stdout",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "Running on local URL: http://127.0.0.1:7868\n",
13
+ "\n",
14
+ "To create a public link, set `share=True` in `launch()`.\n"
15
+ ]
16
+ },
17
+ {
18
+ "data": {
19
+ "text/html": [
20
+ "<div><iframe src=\"http://127.0.0.1:7868/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
21
+ ],
22
+ "text/plain": [
23
+ "<IPython.core.display.HTML object>"
24
+ ]
25
+ },
26
+ "metadata": {},
27
+ "output_type": "display_data"
28
+ },
29
+ {
30
+ "data": {
31
+ "text/plain": []
32
+ },
33
+ "execution_count": 5,
34
+ "metadata": {},
35
+ "output_type": "execute_result"
36
+ },
37
+ {
38
+ "name": "stderr",
39
+ "output_type": "stream",
40
+ "text": [
41
+ "c:\\Users\\066226758\\Blog\\Virtual-Webcam-Chatbot\\.venv\\lib\\site-packages\\gradio\\processing_utils.py:583: UserWarning: Trying to convert audio automatically from float32 to 16-bit int format.\n",
42
+ " warnings.warn(warning.format(data.dtype))\n"
43
+ ]
44
+ }
45
+ ],
46
+ "source": [
47
+ "import gradio as gr\n",
48
+ "import numpy as np\n",
49
+ "import torch\n",
50
+ "from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan\n",
51
+ "from io import BytesIO\n",
52
+ "import soundfile as sf\n",
53
+ "\n",
54
+ "# Load models outside of function calls for efficiency\n",
55
+ "def load_models():\n",
56
+ " model = SpeechT5ForTextToSpeech.from_pretrained(\"microsoft/speecht5_tts\")\n",
57
+ " processor = SpeechT5Processor.from_pretrained(\"microsoft/speecht5_tts\")\n",
58
+ " vocoder = SpeechT5HifiGan.from_pretrained(\"microsoft/speecht5_hifigan\")\n",
59
+ " return model, processor, vocoder\n",
60
+ "\n",
61
+ "model, processor, vocoder = load_models()\n",
62
+ "\n",
63
+ "# Load speaker embeddings\n",
64
+ "def get_speaker_embeddings():\n",
65
+ " speaker_embeddings = np.load(\"cmu_us_clb_arctic-wav-arctic_a0144.npy\")\n",
66
+ " return torch.tensor(speaker_embeddings).unsqueeze(0)\n",
67
+ "\n",
68
+ "speaker_embeddings = get_speaker_embeddings()\n",
69
+ "\n",
70
+ "# Function to convert text to speech\n",
71
+ "def text_to_speech(text):\n",
72
+ " try:\n",
73
+ " # Segment the text if it's too long\n",
74
+ " max_length = 100 # Set a max length as per model's capability\n",
75
+ " segments = [text[i:i+max_length] for i in range(0, len(text), max_length)]\n",
76
+ " combined_speech = []\n",
77
+ "\n",
78
+ " for segment in segments:\n",
79
+ " inputs = processor(text=segment, return_tensors=\"pt\")\n",
80
+ " spectrogram = model.generate_speech(inputs[\"input_ids\"], speaker_embeddings)\n",
81
+ " with torch.no_grad():\n",
82
+ " speech = vocoder(spectrogram)\n",
83
+ " combined_speech.extend(speech.numpy())\n",
84
+ "\n",
85
+ " # Combine audio data into a single numpy array\n",
86
+ " combined_speech = np.array(combined_speech)\n",
87
+ "\n",
88
+ " return 16000, combined_speech # Return sample rate and combined audio data\n",
89
+ " except Exception as e:\n",
90
+ " return None, f\"Error in text-to-speech conversion: {e}\"\n",
91
+ "\n",
92
+ "# Gradio Interface\n",
93
+ "def gradio_interface(text):\n",
94
+ " sample_rate, audio_data = text_to_speech(text)\n",
95
+ " if sample_rate and isinstance(audio_data, np.ndarray):\n",
96
+ " return sample_rate, audio_data\n",
97
+ " else:\n",
98
+ " return None # Return None if there's an error\n",
99
+ "\n",
100
+ "interface = gr.Interface(\n",
101
+ " fn=gradio_interface,\n",
102
+ " title=\"Text to Voice T5\", # Add a title to the interface\n",
103
+ " description=\"Developed by Ruslan Magana, visit <a href='https://ruslanmv.com/' target='_blank'>ruslanmv.com</a> for more information.\",\n",
104
+ " inputs=gr.Textbox(lines=10, label=\"Enter text to convert to speech\"),\n",
105
+ " outputs=gr.Audio(label=\"Generated audio\")\n",
106
+ ")\n",
107
+ "\n",
108
+ "interface.launch()\n"
109
+ ]
110
+ }
111
+ ],
112
+ "metadata": {
113
+ "kernelspec": {
114
+ "display_name": "Python (watson)",
115
+ "language": "python",
116
+ "name": "watson"
117
+ },
118
+ "language_info": {
119
+ "codemirror_mode": {
120
+ "name": "ipython",
121
+ "version": 3
122
+ },
123
+ "file_extension": ".py",
124
+ "mimetype": "text/x-python",
125
+ "name": "python",
126
+ "nbconvert_exporter": "python",
127
+ "pygments_lexer": "ipython3",
128
+ "version": "3.10.11"
129
+ }
130
+ },
131
+ "nbformat": 4,
132
+ "nbformat_minor": 2
133
+ }
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit_option_menu == 0.3.2
2
+ requests==2.28.1
3
+ times==0.7
4
+ htbuilder==0.6.1
5
+ transformers==4.29.2
6
+ torch==2.0.1
7
+ soundfile==0.12.1
8
+ torchaudio == 2.0.2
9
+ sentencepiece==0.1.99
10
+ soundfile
speech.wav ADDED
Binary file (10.3 kB). View file