Spaces:
Running
Running
File size: 11,197 Bytes
524d54e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 |
{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"#@title Choose English model { run: \"auto\" }\n",
"# Form cell: the trailing #@param comments declare the selectable values for each field.\n",
"# NOTE(review): `tag` is set to what looks like a local recipe directory rather than\n",
"# one of the listed pretrained-model tags - confirm this is intended.\n",
"lang = \"English\"\n",
"tag = \"training/espnet/egs2/ljspeech/tts1\" #@param [\"kan-bayashi/ljspeech_tacotron2\", \"kan-bayashi/ljspeech_fastspeech\", \"kan-bayashi/ljspeech_fastspeech2\", \"kan-bayashi/ljspeech_conformer_fastspeech2\", \"kan-bayashi/ljspeech_joint_finetune_conformer_fastspeech2_hifigan\", \"kan-bayashi/ljspeech_joint_train_conformer_fastspeech2_hifigan\", \"kan-bayashi/ljspeech_vits\"] {type:\"string\"}\n",
"vocoder_tag = \"none\" #@param [\"none\", \"parallel_wavegan/ljspeech_parallel_wavegan.v1\", \"parallel_wavegan/ljspeech_full_band_melgan.v2\", \"parallel_wavegan/ljspeech_multi_band_melgan.v2\", \"parallel_wavegan/ljspeech_hifigan.v1\", \"parallel_wavegan/ljspeech_style_melgan.v1\"] {type:\"string\"}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from pathlib import Path\n",
"\n",
"import torch\n",
"from espnet2.bin.tts_inference import Text2Speech\n",
"from espnet2.utils.types import str_or_none\n",
"\n",
"# The training config refers to auxiliary files (e.g. the feature statistics\n",
"# exp/tts_stats_*/train/feats_stats.npz) with paths relative to the recipe directory,\n",
"# so the model has to be built while that directory is the working directory.\n",
"# `tag` (set in the form cell) is used as the recipe directory when it exists relative\n",
"# to the current directory; otherwise we assume we are already inside the recipe.\n",
"recipe_dir = Path(tag).resolve() if Path(tag).is_dir() else Path.cwd()\n",
"exp_dir = recipe_dir / \"exp\" / \"tts_train_raw_phn_tacotron_g2p_en_no_space\"\n",
"train_config = exp_dir / \"config.yaml\"\n",
"model_file = exp_dir / \"checkpoint.pth\"\n",
"\n",
"# Fail early with the full path instead of a relative name deep inside the loader.\n",
"for required_file in (train_config, model_file):\n",
"    if not required_file.is_file():\n",
"        raise FileNotFoundError(f\"Expected file not found: {required_file}\")\n",
"\n",
"# Fall back to CPU so the notebook still runs on machines without a GPU.\n",
"device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
"\n",
"previous_cwd = Path.cwd()\n",
"os.chdir(recipe_dir)\n",
"try:\n",
"    text2speech = Text2Speech(\n",
"        train_config=str(train_config),\n",
"        model_file=str(model_file),\n",
"        device=device,\n",
"        # Only for Tacotron 2 & Transformer\n",
"        threshold=0.5,\n",
"        # Only for Tacotron 2\n",
"        minlenratio=0.0,\n",
"        maxlenratio=10.0,\n",
"        use_att_constraint=False,\n",
"        backward_window=1,\n",
"        forward_window=3,\n",
"        # Only for FastSpeech & FastSpeech2 & VITS\n",
"        speed_control_alpha=4,\n",
"        # Only for VITS\n",
"        noise_scale=0.333,\n",
"        noise_scale_dur=0.333,\n",
"    )\n",
"finally:\n",
"    # Restore the working directory so re-running this cell gives the same result.\n",
"    os.chdir(previous_cwd)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import time\n",
"\n",
"import torch\n",
"from IPython.display import Audio, display\n",
"\n",
"# decide the input sentence by yourself\n",
"print(f\"Input your favorite sentence in {lang}.\")\n",
"x = input()\n",
"if not x.strip():\n",
"    # Blank text would otherwise reach the model and fail with an unrelated error.\n",
"    raise ValueError(\"Please enter a non-empty sentence to synthesize.\")\n",
"\n",
"# synthesis (no gradients are needed for inference)\n",
"with torch.no_grad():\n",
"    # perf_counter is a monotonic clock meant for measuring short intervals\n",
"    start = time.perf_counter()\n",
"    wav = text2speech(x)[\"wav\"]\n",
"    elapsed = time.perf_counter() - start\n",
"\n",
"# real-time factor: processing time divided by the duration of the generated audio\n",
"rtf = elapsed / (len(wav) / text2speech.fs)\n",
"print(f\"RTF = {rtf:5f}\")\n",
"\n",
"# let us listen to generated samples\n",
"display(Audio(wav.view(-1).cpu().numpy(), rate=text2speech.fs))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.8.15 ('espnet')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.15"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "baacc56cbf39183fce53815df8d7ef29797de9f36fbce345069f80337ea8dac3"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
|