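"""Gradio demo for Kazakh emotional TTS: an emotion-conditioned Grad-TTS acoustic model
with two-emotion mixing via classifier guidance, vocoded with HiFi-GAN.

Checkpoint and config paths below are relative to the working directory."""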
import gradio as gr
import argparse
import json
import datetime as dt
import numpy as np
from scipy.io.wavfile import write
import torch
from pydub import AudioSegment
from model.classifier import SpecClassifier
from torch.utils.data import DataLoader
from text import text_to_sequence, cmudict, convert_text
from text.symbols import symbols
import utils_data as utils
from kaldiio import WriteHelper
import os
from tqdm import tqdm
import sys
from model import GradTTSXvector, GradTTSWithEmo
import IPython.display as ipd
device = ('cuda' if torch.cuda.is_available() else 'cpu')
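# Load hyperparameters and decoding arguments for the two-emotion mixture setup,
# then build the emotion-conditioned Grad-TTS acoustic model.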
hps, args = utils.get_hparams_decode_two_mixture()
gradtts_uncond_model = GradTTSWithEmo(**hps.model).to(device)
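# Mel-spectrogram emotion classifier used to provide classifier guidance during decoding.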
model = SpecClassifier(
    in_dim=hps.data.n_mel_channels,
    d_decoder=hps.model.d_decoder,
    h_decoder=hps.model.h_decoder,
    l_decoder=hps.model.l_decoder,
    k_decoder=hps.model.k_decoder,
    decoder_dropout=hps.model.decoder_dropout,
    n_class=hps.model.n_emos,
    cond_dim=hps.data.n_mel_channels,
    model_type=getattr(hps.model, "classifier_type", "CNN-with-time"),
)
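# Checkpoints: `ckpt` is the guidance classifier, `ckpt_tts` is the Grad-TTS acoustic model.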
# ckpt = './cnnwt_SGD_1959.pt'
# ckpt_tts = './grad_uncond_cnn_001.pt'
ckpt = './CNN_SGD_001_1885.pt'
ckpt_tts = './grad_uncond_cnn_001.pt'
utils.load_checkpoint_no_logger(ckpt_tts, gradtts_uncond_model, None)
utils.load_checkpoint_no_logger(ckpt, model, None)
_ = model.to(device).eval()
_ = gradtts_uncond_model.eval()  # inference mode for the acoustic model as well
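# HiFi-GAN vocoder: load the generator config and checkpoint, then strip weight norm for inference.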
HIFIGAN_CONFIG = './config.json'
HIFIGAN_CHECKPT = './g_01720000'
from models import Generator as HiFiGAN
from env import AttrDict
print('Initializing HiFi-GAN...')
with open(HIFIGAN_CONFIG) as f:
    h = AttrDict(json.load(f))
vocoder = HiFiGAN(h)
vocoder.load_state_dict(torch.load(HIFIGAN_CHECKPT, map_location=lambda loc, storage: loc)['generator'])
_ = vocoder.to(device).eval()
vocoder.remove_weight_norm()
emotions = sorted(["angry", "surprise", "fear", "happy", "neutral", "sad"])
speakers = ['Madi', 'Marzhan', 'Akzhol']
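# Synthesize speech for `text` as a weighted mixture of two emotions. Classifier guidance can
# occasionally yield NaNs in the decoded mel; the loop below retries with a progressively
# weaker guidance scale (300, 250, ...) until decoding succeeds or the scale reaches zero.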
def generate_audio(text, quantity, speaker, emotion_1, emotion_2):
    y_dec = torch.tensor([torch.nan])
    gui = 300
    while torch.isnan(y_dec).sum() != 0:
        x, x_lengths = convert_text(text)
        emo_1, emo_2 = emotions.index(emotion_1), emotions.index(emotion_2)
        emo1 = torch.LongTensor([emo_1]).to(device)
        emo2 = torch.LongTensor([emo_2]).to(device)
        sid = torch.LongTensor([speakers.index(speaker)]).to(device)
        intensity = quantity / 100
        y_enc, y_dec, attn = gradtts_uncond_model.classifier_guidance_decode_two_mixture(
            x, x_lengths,
            n_timesteps=100,
            temperature=2.0,
            stoc=args.stoc,
            spk=sid,
            emo1=emo1,
            emo2=emo2,
            emo1_weight=intensity,
            length_scale=1.,
            classifier_func=model.forward,
            guidance=gui,
            classifier_type=model.model_type
        )
        y_dec = y_dec.detach()
        # Vocode the decoded mel-spectrogram and convert the waveform to 16-bit PCM.
        res = y_dec.squeeze().cpu().numpy()
        x = torch.from_numpy(res).unsqueeze(0).to(device)
        y_g_hat = vocoder(x)
        audio = y_g_hat.squeeze()
        audio = audio * 32768.0
        audio = audio.detach().cpu().numpy().astype('int16')
        # Weaken the guidance and retry if the decoder produced NaNs.
        gui -= 50
        if gui <= 0:
            break
    sr = 22050
    return (sr, audio)
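# Example (offline use, no UI) -- a minimal sketch assuming the checkpoints above are in place;
# 'sample.wav' is just an illustrative output path:
#     sr, audio = generate_audio('Сәлем!', 50, speakers[1], 'happy', 'neutral')
#     write('sample.wav', sr, audio)  # scipy.io.wavfile.write, imported above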
demo = gr.Interface(
    generate_audio,
    [
        gr.Textbox(value='Батпақ соры шабындыққа және жыл бойғы жайылымға пайдаланылады.', label="Text you want to synthesize"),
        gr.Slider(0, 100, value=50, step=10, label="Emotion 1 weight", info="Weight (%) of the first emotion in the mixture, between 0 and 100"),
        # gr.Slider(0, 1000, value=100, step=10, label="Guidance", info="Choose between 0 and 1000"),
        gr.Dropdown(speakers, value=speakers[1], label="Narrator", info="Select a narrator."),
        gr.Dropdown(emotions, value=emotions[0], label="Emotion 1", info="Select the first emotion."),
        gr.Dropdown(emotions, value=emotions[3], label="Emotion 2", info="Select the second emotion."),
    ],
    "audio",
)
print('launching the app')
demo.launch()