Spaces:

ruslanmv
/

Text-to-Voice-Transformers

Sleeping

App Files Files Community

ruslanmv committed on Jun 14, 2024

Commit

0a6371e

1 Parent(s): 2ee6bbe

First commit

Browse files

Files changed (10) hide show

app.py +62 -0
cmu_us_awb_arctic-wav-arctic_a0002.npy +3 -0
cmu_us_bdl_arctic-wav-arctic_a0009.npy +3 -0
cmu_us_clb_arctic-wav-arctic_a0144.npy +3 -0
cmu_us_ksp_arctic-wav-arctic_b0087.npy +3 -0
cmu_us_rms_arctic-wav-arctic_b0353.npy +3 -0
cmu_us_slt_arctic-wav-arctic_a0508.npy +3 -0
notebook.ipynb +133 -0
requirements.txt +10 -0
speech.wav +0 -0

app.py ADDED Viewed

	@@ -0,0 +1,62 @@

+import gradio as gr
+import numpy as np
+import torch
+from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+from io import BytesIO
+import soundfile as sf
+# Load models outside of function calls for efficiency
+def load_models():
+    model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
+    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+    return model, processor, vocoder
+model, processor, vocoder = load_models()
+# Load speaker embeddings
+def get_speaker_embeddings():
+    speaker_embeddings = np.load("cmu_us_clb_arctic-wav-arctic_a0144.npy")
+    return torch.tensor(speaker_embeddings).unsqueeze(0)
+speaker_embeddings = get_speaker_embeddings()
+# Function to convert text to speech
+def text_to_speech(text):
+    try:
+        # Segment the text if it's too long
+        max_length = 100  # Set a max length as per model's capability
+        segments = [text[i:i+max_length] for i in range(0, len(text), max_length)]
+        combined_speech = []
+        for segment in segments:
+            inputs = processor(text=segment, return_tensors="pt")
+            spectrogram = model.generate_speech(inputs["input_ids"], speaker_embeddings)
+            with torch.no_grad():
+                speech = vocoder(spectrogram)
+                combined_speech.extend(speech.numpy())
+        # Combine audio data into a single numpy array
+        combined_speech = np.array(combined_speech)
+        return 16000, combined_speech  # Return sample rate and combined audio data
+    except Exception as e:
+        return None, f"Error in text-to-speech conversion: {e}"
+# Gradio Interface
+def gradio_interface(text):
+    sample_rate, audio_data = text_to_speech(text)
+    if sample_rate and isinstance(audio_data, np.ndarray):
+        return sample_rate, audio_data
+    else:
+        return None  # Return None if there's an error
+interface = gr.Interface(
+    fn=gradio_interface,
+    title="Text to Voice",  # Add a title to the interface
+    description="Hight Fidelity TTS. Visit <a href='https://ruslanmv.com/' target='_blank'>ruslanmv.com</a> for more information.",
+    inputs=gr.Textbox(lines=10, label="Enter text to convert to speech"),
+    outputs=gr.Audio(label="Generated audio")
+)
+interface.launch()

cmu_us_awb_arctic-wav-arctic_a0002.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5db7a684ab490f21cec1628e00d461a184e369fe4eafb1ee441a796faf4ab6ae
+size 2176

cmu_us_bdl_arctic-wav-arctic_a0009.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:215326eae3a428af8934c385fbe043b36c72849ca17d1d013adeb189e6bd6962
+size 2176

cmu_us_clb_arctic-wav-arctic_a0144.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cf67b36c47edfb1851466a1dff081b436bc6809b5ebc12811d9df0c0d0f28d0e
+size 2176

cmu_us_ksp_arctic-wav-arctic_b0087.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f6c5c2a38c2e400179019c560a74c4322f4ee13beda22ee601807545edee283e
+size 2176

cmu_us_rms_arctic-wav-arctic_b0353.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a49dac3e9c3a71a4dbca4c364233c7915ae6e0cb71b2ceaed97296231b95cb50
+size 2176

cmu_us_slt_arctic-wav-arctic_a0508.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f71ffadda3f3a4de079740a0b34963824dc644d9d5442283bd0a2b0d4f44ff0b
+size 2176

notebook.ipynb ADDED Viewed

	@@ -0,0 +1,133 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Running on local URL:  http://127.0.0.1:7868\n",
+      "\n",
+      "To create a public link, set `share=True` in `launch()`.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div><iframe src=\"http://127.0.0.1:7868/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": []
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "c:\\Users\\066226758\\Blog\\Virtual-Webcam-Chatbot\\.venv\\lib\\site-packages\\gradio\\processing_utils.py:583: UserWarning: Trying to convert audio automatically from float32 to 16-bit int format.\n",
+      "  warnings.warn(warning.format(data.dtype))\n"
+     ]
+    }
+   ],
+   "source": [
+    "import gradio as gr\n",
+    "import numpy as np\n",
+    "import torch\n",
+    "from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan\n",
+    "from io import BytesIO\n",
+    "import soundfile as sf\n",
+    "\n",
+    "# Load models outside of function calls for efficiency\n",
+    "def load_models():\n",
+    "    model = SpeechT5ForTextToSpeech.from_pretrained(\"microsoft/speecht5_tts\")\n",
+    "    processor = SpeechT5Processor.from_pretrained(\"microsoft/speecht5_tts\")\n",
+    "    vocoder = SpeechT5HifiGan.from_pretrained(\"microsoft/speecht5_hifigan\")\n",
+    "    return model, processor, vocoder\n",
+    "\n",
+    "model, processor, vocoder = load_models()\n",
+    "\n",
+    "# Load speaker embeddings\n",
+    "def get_speaker_embeddings():\n",
+    "    speaker_embeddings = np.load(\"cmu_us_clb_arctic-wav-arctic_a0144.npy\")\n",
+    "    return torch.tensor(speaker_embeddings).unsqueeze(0)\n",
+    "\n",
+    "speaker_embeddings = get_speaker_embeddings()\n",
+    "\n",
+    "# Function to convert text to speech\n",
+    "def text_to_speech(text):\n",
+    "    try:\n",
+    "        # Segment the text if it's too long\n",
+    "        max_length = 100  # Set a max length as per model's capability\n",
+    "        segments = [text[i:i+max_length] for i in range(0, len(text), max_length)]\n",
+    "        combined_speech = []\n",
+    "\n",
+    "        for segment in segments:\n",
+    "            inputs = processor(text=segment, return_tensors=\"pt\")\n",
+    "            spectrogram = model.generate_speech(inputs[\"input_ids\"], speaker_embeddings)\n",
+    "            with torch.no_grad():\n",
+    "                speech = vocoder(spectrogram)\n",
+    "                combined_speech.extend(speech.numpy())\n",
+    "\n",
+    "        # Combine audio data into a single numpy array\n",
+    "        combined_speech = np.array(combined_speech)\n",
+    "\n",
+    "        return 16000, combined_speech  # Return sample rate and combined audio data\n",
+    "    except Exception as e:\n",
+    "        return None, f\"Error in text-to-speech conversion: {e}\"\n",
+    "\n",
+    "# Gradio Interface\n",
+    "def gradio_interface(text):\n",
+    "    sample_rate, audio_data = text_to_speech(text)\n",
+    "    if sample_rate and isinstance(audio_data, np.ndarray):\n",
+    "        return sample_rate, audio_data\n",
+    "    else:\n",
+    "        return None  # Return None if there's an error\n",
+    "\n",
+    "interface = gr.Interface(\n",
+    "    fn=gradio_interface,\n",
+    "    title=\"Text to Voice T5\",  # Add a title to the interface\n",
+    "    description=\"Developed by Ruslan Magana, visit <a href='https://ruslanmv.com/' target='_blank'>ruslanmv.com</a> for more information.\",\n",
+    "    inputs=gr.Textbox(lines=10, label=\"Enter text to convert to speech\"),\n",
+    "    outputs=gr.Audio(label=\"Generated audio\")\n",
+    ")\n",
+    "\n",
+    "interface.launch()\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python (watson)",
+   "language": "python",
+   "name": "watson"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

requirements.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+streamlit_option_menu == 0.3.2
+requests==2.28.1
+times==0.7
+htbuilder==0.6.1
+transformers==4.29.2
+torch==2.0.1
+soundfile==0.12.1
+torchaudio == 2.0.2
+sentencepiece==0.1.99
+soundfile

speech.wav ADDED Viewed

Binary file (10.3 kB). View file