{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Text to Voice T5\n",
    "\n",
    "Converts text typed into a Gradio text box into speech with the `microsoft/speecht5_tts` model and the `microsoft/speecht5_hifigan` vocoder.\n",
    "\n",
    "- The speaker-embedding file `cmu_us_clb_arctic-wav-arctic_a0144.npy` is read from the working directory.\n",
    "- Running all cells loads the models and launches the Gradio app."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import textwrap\n",
    "from io import BytesIO\n",
    "\n",
    "import gradio as gr\n",
    "import numpy as np\n",
    "import soundfile as sf\n",
    "import torch\n",
    "from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Configuration: everything a reader might want to tune lives here.\n",
    "TTS_MODEL_ID = \"microsoft/speecht5_tts\"\n",
    "VOCODER_MODEL_ID = \"microsoft/speecht5_hifigan\"\n",
    "SPEAKER_EMBEDDINGS_PATH = \"cmu_us_clb_arctic-wav-arctic_a0144.npy\"\n",
    "SAMPLE_RATE = 16000  # sample rate returned to Gradio together with the audio\n",
    "MAX_SEGMENT_CHARS = 100  # upper bound on the characters sent to the model per call"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load models and speaker embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_models():\n",
    "    \"\"\"Load the SpeechT5 model, its processor and the HiFi-GAN vocoder.\n",
    "\n",
    "    Returns:\n",
    "        tuple: ``(model, processor, vocoder)``, each created with ``from_pretrained``.\n",
    "    \"\"\"\n",
    "    model = SpeechT5ForTextToSpeech.from_pretrained(TTS_MODEL_ID)\n",
    "    processor = SpeechT5Processor.from_pretrained(TTS_MODEL_ID)\n",
    "    vocoder = SpeechT5HifiGan.from_pretrained(VOCODER_MODEL_ID)\n",
    "    return model, processor, vocoder\n",
    "\n",
    "\n",
    "def get_speaker_embeddings(path=SPEAKER_EMBEDDINGS_PATH):\n",
    "    \"\"\"Load speaker embeddings from a ``.npy`` file.\n",
    "\n",
    "    Args:\n",
    "        path: Location of the ``.npy`` file to read.\n",
    "\n",
    "    Returns:\n",
    "        torch.Tensor: The stored array with a leading dimension of size 1 added.\n",
    "    \"\"\"\n",
    "    embeddings = np.load(path)\n",
    "    return torch.tensor(embeddings).unsqueeze(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load once, outside of the request handlers, so every call reuses the same objects.\n",
    "model, processor, vocoder = load_models()\n",
    "speaker_embeddings = get_speaker_embeddings()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Text to speech\n",
    "\n",
    "Long text is synthesized in segments of at most `MAX_SEGMENT_CHARS` characters. Segments are cut between words, because cutting in the middle of a word makes the model pronounce two fragments instead of the word."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _split_text(text, max_length):\n",
    "    \"\"\"Split ``text`` into chunks of at most ``max_length`` characters.\n",
    "\n",
    "    Uses ``textwrap.wrap``, so chunks end between words; a single word longer than\n",
    "    ``max_length`` is still split. Empty or whitespace-only text yields ``[]``.\n",
    "    \"\"\"\n",
    "    return textwrap.wrap(text, width=max_length)\n",
    "\n",
    "\n",
    "def text_to_speech(text, max_length=MAX_SEGMENT_CHARS):\n",
    "    \"\"\"Convert text to a speech waveform.\n",
    "\n",
    "    Args:\n",
    "        text: Text to synthesize.\n",
    "        max_length: Maximum number of characters per segment passed to the model.\n",
    "\n",
    "    Returns:\n",
    "        tuple: ``(SAMPLE_RATE, waveform)`` with ``waveform`` a NumPy array on success,\n",
    "        or ``(None, error_message)`` when there is no text or synthesis fails.\n",
    "    \"\"\"\n",
    "    try:\n",
    "        segments = _split_text(text or \"\", max_length)\n",
    "        if not segments:\n",
    "            # Nothing to synthesize: report it through the error channel instead of\n",
    "            # handing a zero-length array to the caller as if it were audio.\n",
    "            return None, \"Error in text-to-speech conversion: no text to convert\"\n",
    "\n",
    "        waveforms = []\n",
    "        with torch.no_grad():\n",
    "            for segment in segments:\n",
    "                inputs = processor(text=segment, return_tensors=\"pt\")\n",
    "                spectrogram = model.generate_speech(inputs[\"input_ids\"], speaker_embeddings)\n",
    "                speech = vocoder(spectrogram)\n",
    "                waveforms.append(speech.cpu().numpy())\n",
    "\n",
    "        # Join the per-segment arrays end to end, in the order they were generated.\n",
    "        return SAMPLE_RATE, np.concatenate(waveforms)\n",
    "    except Exception as e:\n",
    "        return None, f\"Error in text-to-speech conversion: {e}\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Gradio interface"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def gradio_interface(text):\n",
    "    \"\"\"Gradio callback: synthesize ``text`` and return audio for ``gr.Audio``.\n",
    "\n",
    "    Args:\n",
    "        text: Contents of the input text box.\n",
    "\n",
    "    Returns:\n",
    "        tuple | None: ``(sample_rate, waveform)`` on success. On failure the error\n",
    "        message is shown to the user as a warning and ``None`` is returned.\n",
    "    \"\"\"\n",
    "    sample_rate, audio_or_error = text_to_speech(text)\n",
    "    if sample_rate and isinstance(audio_or_error, np.ndarray):\n",
    "        return sample_rate, audio_or_error\n",
    "\n",
    "    # Surface the reason instead of silently returning an empty result.\n",
    "    gr.Warning(str(audio_or_error))\n",
    "    return None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "interface = gr.Interface(\n",
    "    fn=gradio_interface,\n",
    "    title=\"Text to Voice T5\",\n",
    "    description=\"Developed by Ruslan Magana, visit ruslanmv.com for more information.\",\n",
    "    inputs=gr.Textbox(lines=10, label=\"Enter text to convert to speech\"),\n",
    "    outputs=gr.Audio(label=\"Generated audio\"),\n",
    ")\n",
    "\n",
    "interface.launch()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python (watson)",
   "language": "python",
   "name": "watson"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}