How to change voice character [ANSWERED]
Hey y'all, Micah here. A lot of people have been asking this question, so I figured I'd share the answer I came up with. To change the voice character of this TTS AI, you have to swap out the speaker embeddings. In my work I used SpeechBrain's spkrec-xvect-voxceleb model to calculate the embeddings, after first running the source audio through the AudioDenoiser package.
The code for the above looks like this:
from transformers import pipeline
import soundfile as sf
import torch
import torchaudio
from audio_denoiser.AudioDenoiser import AudioDenoiser

device = "cuda" if torch.cuda.is_available() else "cpu"
denoiser = AudioDenoiser(device=torch.device(device))

def embed(source, target):
    # First, denoise the audio. (Optional, but improves quality.)
    signal, fs = torchaudio.load(source)
    auto_scale = True  # Recommended for low-volume input audio
    signal = denoiser.process_waveform(waveform=signal, sample_rate=16000, auto_scale=auto_scale)
    # Calculate the speaker embeddings.
    from speechbrain.inference.speaker import EncoderClassifier
    classifier = EncoderClassifier.from_hparams(source="speechbrain/spkrec-xvect-voxceleb", savedir="pretrained_models/spkrec-xvect-voxceleb", run_opts={"device": device})
    embeddings = classifier.encode_batch(signal)
    # encode_batch returns shape [1, 1, 512]; normalize and squeeze so SpeechT5 gets a [1, 512] tensor.
    embeddings = torch.nn.functional.normalize(embeddings[:, :512], dim=-1).squeeze(1)
    print(embeddings.size())
    # Resolve the output path *before* opening the file, then write the embeddings.
    if "Voice Embeddings" not in target:
        target = "./Voice Embeddings/" + target
    with open(target, "wb") as f:
        torch.save(embeddings, f)
    return target
def GetEmbedding(location):
    with open(location, "rb") as f:
        return torch.load(f).squeeze(1)
# Making an embedding:
embed("whatever.wav", "whatever.bin")  # target can end with any extension, but I just use .bin
# Note: input audio must be 16 kHz mono WAV. I use FFmpeg in my bot to convert the audio (see the sketch below).

# Then, when you go to generate speech:
synthesiser = pipeline("text-to-speech", "microsoft/speecht5_tts")
speech = synthesiser(Text, forward_params={"speaker_embeddings": GetEmbedding(PATH_TO_EMBEDDING)})
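The FFmpeg conversion step isn't shown above, so here's a minimal sketch of how you might call it from Python. This assumes ffmpeg is on your PATH; the function name and file paths are just placeholders, not part of my bot's code:

import subprocess

def to_16khz_mono_wav(src, dst="converted.wav"):
    # -ac 1 = mono, -ar 16000 = 16 kHz sample rate, -y = overwrite the output file
    subprocess.run(["ffmpeg", "-y", "-i", src, "-ac", "1", "-ar", "16000", dst], check=True)
    return dst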
You can find the relevant source code for my bot's TTS component here. Keep in mind that it's a toy project, so the code isn't super organized or easy to read :)
Here's a demo:
I used this sample audio of Joe Biden speaking:
And generated these:
I include the second sample so you can see that the model still suffers when generating longer passages of voiced text. This is especially apparent with slower speakers, like Joe Biden.
Here's the same text read by a faster speaker:
If you need help, feel free to drop me a line on my Discord, micahb.dev!
@MicahB you REALLY shouldn't be publishing your HF token on GitHub...
DW if there's a token up there, it's dead already :3
Were you able to get the thing working?
@MicahB, not perfectly, but it sounds a little more natural. It could likely be improved further by making the pipeline more nuanced, but a hyper-specific pipeline would probably be a pain to maintain. Below is a basic NLTK-powered, split-based pipeline to help compensate:
import os
import io
import torch
import torchaudio
import soundfile as sf
from transformers import pipeline
from speechbrain.inference.speaker import EncoderClassifier
import sounddevice as sd # For audio playback without saving to file, easier for testing.
import nltk
import numpy as np
nltk.download('punkt', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import pos_tag
device = "cuda" if torch.cuda.is_available() else "cpu"
def embed(source, target):
    # Load audio
    signal, fs = torchaudio.load(source)
    # Resample to 16 kHz if necessary
    if fs != 16000:
        signal = torchaudio.functional.resample(signal, fs, 16000)
    # Use the SpeechBrain x-vector model to get speaker embeddings
    classifier = EncoderClassifier.from_hparams(
        source="speechbrain/spkrec-xvect-voxceleb",
        savedir="pretrained_models/spkrec-xvect-voxceleb",
        run_opts={"device": device}
    )
    # Encode and normalize embeddings
    embeddings = classifier.encode_batch(signal)  # shape: [1, 1, 512]
    embeddings = torch.nn.functional.normalize(embeddings[:, :512], dim=-1).squeeze(1)  # [1, 512]
    # Create the output directory if it doesn't exist
    if not os.path.exists("./Voice Embeddings"):
        os.makedirs("./Voice Embeddings")
    if "Voice Embeddings" not in target:
        target = "./Voice Embeddings/" + target
    torch.save(embeddings, target)
    return target
def GetEmbedding(location):
    emb = torch.load(location)  # shape: [1, 512]
    # If the tensor is [1, 1, 512], squeeze it to [1, 512]
    if emb.dim() == 3:
        emb = emb.squeeze(1)
    return emb
def add_commas(text):
    """
    A naive approach to adding commas to text using NLTK.
    This approach uses POS tagging to add commas before coordinating conjunctions (CC).
    It's simplistic and might not always produce perfect results.
    """
    sentences = sent_tokenize(text)
    processed_sentences = []
    for sent in sentences:
        words = word_tokenize(sent)
        tagged = pos_tag(words)
        updated_words = []
        for i, (word, tag) in enumerate(tagged):
            if tag == 'CC' and i > 0:
                # Insert a comma before the conjunction if the previous token isn't already punctuation
                if updated_words and updated_words[-1] not in [',', '.', '!', '?']:
                    updated_words.append(',')
            updated_words.append(word)
        processed_sentence = ' '.join(updated_words)
        processed_sentence = processed_sentence.replace(' ,', ',')
        processed_sentences.append(processed_sentence)
    final_text = ' '.join(processed_sentences)
    return final_text
def generate_tts_audio(text, synthesiser, speaker_embeddings):
    # Pass the text through our comma-adding function
    refined_text = add_commas(text)
    # Split the text at commas. You may also want to split at other sentence boundaries
    # like '?', '!', and '.' (mind decimal points vs. periods; see the sketch after this code)
    segments = refined_text.split(',')
    all_audio_segments = []
    sample_rate = None
    for i, segment in enumerate(segments):
        segment = segment.strip()
        if not segment:
            continue
        # Generate TTS for each segment
        speech = synthesiser(segment, forward_params={"speaker_embeddings": speaker_embeddings})
        sound = speech["audio"]  # numpy array: [samples]
        if sample_rate is None:
            sample_rate = speech["sampling_rate"]
        # Optionally append a small silence between segments to simulate a natural pause:
        # sound = np.concatenate([sound, np.zeros(int(sample_rate * 0.2))])  # 0.2s silence
        all_audio_segments.append(sound)
    # Concatenate all segments into one audio array
    if len(all_audio_segments) > 1:
        final_audio = np.concatenate(all_audio_segments)
    elif len(all_audio_segments) == 1:
        final_audio = all_audio_segments[0]
    else:
        final_audio = np.array([])
    return final_audio, sample_rate
def play_tts_audio(sound, sample_rate):
    # Play the audio from memory using sounddevice
    sd.play(sound, sample_rate)
    sd.wait()
if __name__ == "__main__":
    # Compute speaker embeddings once
    embedding_path = embed("whatever.wav", "whatever.bin")
    # Initialize the TTS pipeline and load embeddings
    synthesiser = pipeline("text-to-speech", model="microsoft/speecht5_tts")
    embeddings = GetEmbedding(embedding_path).to(device)
    # Example usage:
    text_to_speak = "Hello world this is a really long sentence to see how well the model handles long content text to speech generation and to ensure that pauses are reduced."
    sound, sample_rate = generate_tts_audio(text_to_speak, synthesiser, embeddings)
    play_tts_audio(sound, sample_rate)
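Since generate_tts_audio only splits on commas, here's a minimal sketch of a multi-boundary splitter you could drop in instead. The regex and the helper name are my own (not part of the code above); it splits after ',', '.', '!', or '?' only when whitespace follows, so decimals like 3.14 stay intact:

import re

def split_segments(text):
    # Split after ',', '.', '!', or '?' when followed by whitespace,
    # so decimal points inside numbers are left alone.
    parts = re.split(r'(?<=[,.!?])\s+', text)
    return [p.strip() for p in parts if p.strip()]

# e.g. split_segments("Hello, world. Pi is 3.14!") -> ['Hello,', 'world.', 'Pi is 3.14!']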
Edit: code block formatting
neato taquito