Spaces:
Running
Running
Yurii Paniv
commited on
Commit
·
524d54e
1
Parent(s):
96954c7
Test model inference
Browse files- training/esp_test.ipynb +114 -0
training/esp_test.ipynb
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 3,
|
| 6 |
+
"metadata": {},
|
| 7 |
+
"outputs": [],
|
| 8 |
+
"source": [
|
| 9 |
+
"#@title Choose English model { run: \"auto\" }\n",
|
| 10 |
+
"lang = 'English'\n",
|
| 11 |
+
"tag = 'training/espnet/egs2/ljspeech/tts1' #@param [\"kan-bayashi/ljspeech_tacotron2\", \"kan-bayashi/ljspeech_fastspeech\", \"kan-bayashi/ljspeech_fastspeech2\", \"kan-bayashi/ljspeech_conformer_fastspeech2\", \"kan-bayashi/ljspeech_joint_finetune_conformer_fastspeech2_hifigan\", \"kan-bayashi/ljspeech_joint_train_conformer_fastspeech2_hifigan\", \"kan-bayashi/ljspeech_vits\"] {type:\"string\"}\n",
|
| 12 |
+
"vocoder_tag = \"none\" #@param [\"none\", \"parallel_wavegan/ljspeech_parallel_wavegan.v1\", \"parallel_wavegan/ljspeech_full_band_melgan.v2\", \"parallel_wavegan/ljspeech_multi_band_melgan.v2\", \"parallel_wavegan/ljspeech_hifigan.v1\", \"parallel_wavegan/ljspeech_style_melgan.v1\"] {type:\"string\"}"
|
| 13 |
+
]
|
| 14 |
+
},
|
| 15 |
+
{
|
| 16 |
+
"cell_type": "code",
|
| 17 |
+
"execution_count": 7,
|
| 18 |
+
"metadata": {},
|
| 19 |
+
"outputs": [
|
| 20 |
+
{
|
| 21 |
+
"ename": "FileNotFoundError",
|
| 22 |
+
"evalue": "[Errno 2] No such file or directory: 'exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/feats_stats.npz'",
|
| 23 |
+
"output_type": "error",
|
| 24 |
+
"traceback": [
|
| 25 |
+
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
| 26 |
+
"\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
|
| 27 |
+
"Cell \u001b[0;32mIn[7], line 4\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mespnet2\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mbin\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mtts_inference\u001b[39;00m \u001b[39mimport\u001b[39;00m Text2Speech\n\u001b[1;32m 2\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mespnet2\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mutils\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mtypes\u001b[39;00m \u001b[39mimport\u001b[39;00m str_or_none\n\u001b[0;32m----> 4\u001b[0m text2speech \u001b[39m=\u001b[39m Text2Speech(\n\u001b[1;32m 5\u001b[0m train_config\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39m/home/robinhad/Projects/ukrainian-tts/training/espnet/egs2/ljspeech/tts1/exp/tts_train_raw_phn_tacotron_g2p_en_no_space/config.yaml\u001b[39;49m\u001b[39m\"\u001b[39;49m,\n\u001b[1;32m 6\u001b[0m model_file\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39m/home/robinhad/Projects/ukrainian-tts/training/espnet/egs2/ljspeech/tts1/exp/tts_train_raw_phn_tacotron_g2p_en_no_space/checkpoint.pth\u001b[39;49m\u001b[39m\"\u001b[39;49m,\n\u001b[1;32m 7\u001b[0m device\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39mcuda\u001b[39;49m\u001b[39m\"\u001b[39;49m,\n\u001b[1;32m 8\u001b[0m \u001b[39m# Only for Tacotron 2 & Transformer\u001b[39;49;00m\n\u001b[1;32m 9\u001b[0m threshold\u001b[39m=\u001b[39;49m\u001b[39m0.5\u001b[39;49m,\n\u001b[1;32m 10\u001b[0m \u001b[39m# Only for Tacotron 2\u001b[39;49;00m\n\u001b[1;32m 11\u001b[0m minlenratio\u001b[39m=\u001b[39;49m\u001b[39m0.0\u001b[39;49m,\n\u001b[1;32m 12\u001b[0m maxlenratio\u001b[39m=\u001b[39;49m\u001b[39m10.0\u001b[39;49m,\n\u001b[1;32m 13\u001b[0m use_att_constraint\u001b[39m=\u001b[39;49m\u001b[39mFalse\u001b[39;49;00m,\n\u001b[1;32m 14\u001b[0m backward_window\u001b[39m=\u001b[39;49m\u001b[39m1\u001b[39;49m,\n\u001b[1;32m 15\u001b[0m forward_window\u001b[39m=\u001b[39;49m\u001b[39m3\u001b[39;49m,\n\u001b[1;32m 16\u001b[0m \u001b[39m# Only for FastSpeech & FastSpeech2 & VITS\u001b[39;49;00m\n\u001b[1;32m 17\u001b[0m speed_control_alpha\u001b[39m=\u001b[39;49m\u001b[39m4\u001b[39;49m,\n\u001b[1;32m 18\u001b[0m \u001b[39m# Only for VITS\u001b[39;49;00m\n\u001b[1;32m 19\u001b[0m noise_scale\u001b[39m=\u001b[39;49m\u001b[39m0.333\u001b[39;49m,\n\u001b[1;32m 20\u001b[0m noise_scale_dur\u001b[39m=\u001b[39;49m\u001b[39m0.333\u001b[39;49m,\n\u001b[1;32m 21\u001b[0m )\n",
|
| 28 |
+
"File \u001b[0;32m~/Projects/ukrainian-tts/training/espnet/espnet2/bin/tts_inference.py:92\u001b[0m, in \u001b[0;36mText2Speech.__init__\u001b[0;34m(self, train_config, model_file, threshold, minlenratio, maxlenratio, use_teacher_forcing, use_att_constraint, backward_window, forward_window, speed_control_alpha, noise_scale, noise_scale_dur, vocoder_config, vocoder_file, dtype, device, seed, always_fix_seed, prefer_normalized_feats)\u001b[0m\n\u001b[1;32m 89\u001b[0m \u001b[39massert\u001b[39;00m check_argument_types()\n\u001b[1;32m 91\u001b[0m \u001b[39m# setup model\u001b[39;00m\n\u001b[0;32m---> 92\u001b[0m model, train_args \u001b[39m=\u001b[39m TTSTask\u001b[39m.\u001b[39;49mbuild_model_from_file(\n\u001b[1;32m 93\u001b[0m train_config, model_file, device\n\u001b[1;32m 94\u001b[0m )\n\u001b[1;32m 95\u001b[0m model\u001b[39m.\u001b[39mto(dtype\u001b[39m=\u001b[39m\u001b[39mgetattr\u001b[39m(torch, dtype))\u001b[39m.\u001b[39meval()\n\u001b[1;32m 96\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mdevice \u001b[39m=\u001b[39m device\n",
|
| 29 |
+
"File \u001b[0;32m~/Projects/ukrainian-tts/training/espnet/espnet2/tasks/abs_task.py:1822\u001b[0m, in \u001b[0;36mAbsTask.build_model_from_file\u001b[0;34m(cls, config_file, model_file, device)\u001b[0m\n\u001b[1;32m 1820\u001b[0m args \u001b[39m=\u001b[39m yaml\u001b[39m.\u001b[39msafe_load(f)\n\u001b[1;32m 1821\u001b[0m args \u001b[39m=\u001b[39m argparse\u001b[39m.\u001b[39mNamespace(\u001b[39m*\u001b[39m\u001b[39m*\u001b[39margs)\n\u001b[0;32m-> 1822\u001b[0m model \u001b[39m=\u001b[39m \u001b[39mcls\u001b[39;49m\u001b[39m.\u001b[39;49mbuild_model(args)\n\u001b[1;32m 1823\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39misinstance\u001b[39m(model, AbsESPnetModel):\n\u001b[1;32m 1824\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mRuntimeError\u001b[39;00m(\n\u001b[1;32m 1825\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mmodel must inherit \u001b[39m\u001b[39m{\u001b[39;00mAbsESPnetModel\u001b[39m.\u001b[39m\u001b[39m__name__\u001b[39m\u001b[39m}\u001b[39;00m\u001b[39m, but got \u001b[39m\u001b[39m{\u001b[39;00m\u001b[39mtype\u001b[39m(model)\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m\n\u001b[1;32m 1826\u001b[0m )\n",
|
| 30 |
+
"File \u001b[0;32m~/Projects/ukrainian-tts/training/espnet/espnet2/tasks/tts.py:309\u001b[0m, in \u001b[0;36mTTSTask.build_model\u001b[0;34m(cls, args)\u001b[0m\n\u001b[1;32m 307\u001b[0m \u001b[39mif\u001b[39;00m args\u001b[39m.\u001b[39mnormalize \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m 308\u001b[0m normalize_class \u001b[39m=\u001b[39m normalize_choices\u001b[39m.\u001b[39mget_class(args\u001b[39m.\u001b[39mnormalize)\n\u001b[0;32m--> 309\u001b[0m normalize \u001b[39m=\u001b[39m normalize_class(\u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49margs\u001b[39m.\u001b[39;49mnormalize_conf)\n\u001b[1;32m 310\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 311\u001b[0m normalize \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m\n",
|
| 31 |
+
"File \u001b[0;32m~/Projects/ukrainian-tts/training/espnet/espnet2/layers/global_mvn.py:40\u001b[0m, in \u001b[0;36mGlobalMVN.__init__\u001b[0;34m(self, stats_file, norm_means, norm_vars, eps)\u001b[0m\n\u001b[1;32m 37\u001b[0m stats_file \u001b[39m=\u001b[39m Path(stats_file)\n\u001b[1;32m 39\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mstats_file \u001b[39m=\u001b[39m stats_file\n\u001b[0;32m---> 40\u001b[0m stats \u001b[39m=\u001b[39m np\u001b[39m.\u001b[39;49mload(stats_file)\n\u001b[1;32m 41\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(stats, np\u001b[39m.\u001b[39mndarray):\n\u001b[1;32m 42\u001b[0m \u001b[39m# Kaldi like stats\u001b[39;00m\n\u001b[1;32m 43\u001b[0m count \u001b[39m=\u001b[39m stats[\u001b[39m0\u001b[39m]\u001b[39m.\u001b[39mflatten()[\u001b[39m-\u001b[39m\u001b[39m1\u001b[39m]\n",
|
| 32 |
+
"File \u001b[0;32m~/.miniconda3/envs/espnet/lib/python3.8/site-packages/numpy/lib/npyio.py:390\u001b[0m, in \u001b[0;36mload\u001b[0;34m(file, mmap_mode, allow_pickle, fix_imports, encoding)\u001b[0m\n\u001b[1;32m 388\u001b[0m own_fid \u001b[39m=\u001b[39m \u001b[39mFalse\u001b[39;00m\n\u001b[1;32m 389\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m--> 390\u001b[0m fid \u001b[39m=\u001b[39m stack\u001b[39m.\u001b[39menter_context(\u001b[39mopen\u001b[39;49m(os_fspath(file), \u001b[39m\"\u001b[39;49m\u001b[39mrb\u001b[39;49m\u001b[39m\"\u001b[39;49m))\n\u001b[1;32m 391\u001b[0m own_fid \u001b[39m=\u001b[39m \u001b[39mTrue\u001b[39;00m\n\u001b[1;32m 393\u001b[0m \u001b[39m# Code to distinguish from NumPy binary files and pickles.\u001b[39;00m\n",
|
| 33 |
+
"\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/feats_stats.npz'"
|
| 34 |
+
]
|
| 35 |
+
}
|
| 36 |
+
],
|
| 37 |
+
"source": [
|
| 38 |
+
"from espnet2.bin.tts_inference import Text2Speech\n",
|
| 39 |
+
"from espnet2.utils.types import str_or_none\n",
|
| 40 |
+
"\n",
|
| 41 |
+
"text2speech = Text2Speech(\n",
|
| 42 |
+
" train_config=\"exp/tts_train_raw_phn_tacotron_g2p_en_no_space/config.yaml\",\n",
|
| 43 |
+
" model_file=\"exp/tts_train_raw_phn_tacotron_g2p_en_no_space/checkpoint.pth\",\n",
|
| 44 |
+
" device=\"cuda\",\n",
|
| 45 |
+
" # Only for Tacotron 2 & Transformer\n",
|
| 46 |
+
" threshold=0.5,\n",
|
| 47 |
+
" # Only for Tacotron 2\n",
|
| 48 |
+
" minlenratio=0.0,\n",
|
| 49 |
+
" maxlenratio=10.0,\n",
|
| 50 |
+
" use_att_constraint=False,\n",
|
| 51 |
+
" backward_window=1,\n",
|
| 52 |
+
" forward_window=3,\n",
|
| 53 |
+
" # Only for FastSpeech & FastSpeech2 & VITS\n",
|
| 54 |
+
" speed_control_alpha=4,\n",
|
| 55 |
+
" # Only for VITS\n",
|
| 56 |
+
" noise_scale=0.333,\n",
|
| 57 |
+
" noise_scale_dur=0.333,\n",
|
| 58 |
+
")\n"
|
| 59 |
+
]
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"cell_type": "code",
|
| 63 |
+
"execution_count": null,
|
| 64 |
+
"metadata": {},
|
| 65 |
+
"outputs": [],
|
| 66 |
+
"source": [
|
| 67 |
+
"import time\n",
|
| 68 |
+
"import torch\n",
|
| 69 |
+
"\n",
|
| 70 |
+
"# decide the input sentence by yourself\n",
|
| 71 |
+
"print(f\"Input your favorite sentence in {lang}.\")\n",
|
| 72 |
+
"x = input()\n",
|
| 73 |
+
"\n",
|
| 74 |
+
"# synthesis\n",
|
| 75 |
+
"with torch.no_grad():\n",
|
| 76 |
+
" start = time.time()\n",
|
| 77 |
+
" wav = text2speech(x)[\"wav\"]\n",
|
| 78 |
+
"rtf = (time.time() - start) / (len(wav) / text2speech.fs)\n",
|
| 79 |
+
"print(f\"RTF = {rtf:5f}\")\n",
|
| 80 |
+
"\n",
|
| 81 |
+
"# let us listen to generated samples\n",
|
| 82 |
+
"from IPython.display import display, Audio\n",
|
| 83 |
+
"display(Audio(wav.view(-1).cpu().numpy(), rate=text2speech.fs))"
|
| 84 |
+
]
|
| 85 |
+
}
|
| 86 |
+
],
|
| 87 |
+
"metadata": {
|
| 88 |
+
"kernelspec": {
|
| 89 |
+
"display_name": "Python 3.8.15 ('espnet')",
|
| 90 |
+
"language": "python",
|
| 91 |
+
"name": "python3"
|
| 92 |
+
},
|
| 93 |
+
"language_info": {
|
| 94 |
+
"codemirror_mode": {
|
| 95 |
+
"name": "ipython",
|
| 96 |
+
"version": 3
|
| 97 |
+
},
|
| 98 |
+
"file_extension": ".py",
|
| 99 |
+
"mimetype": "text/x-python",
|
| 100 |
+
"name": "python",
|
| 101 |
+
"nbconvert_exporter": "python",
|
| 102 |
+
"pygments_lexer": "ipython3",
|
| 103 |
+
"version": "3.8.15"
|
| 104 |
+
},
|
| 105 |
+
"orig_nbformat": 4,
|
| 106 |
+
"vscode": {
|
| 107 |
+
"interpreter": {
|
| 108 |
+
"hash": "baacc56cbf39183fce53815df8d7ef29797de9f36fbce345069f80337ea8dac3"
|
| 109 |
+
}
|
| 110 |
+
}
|
| 111 |
+
},
|
| 112 |
+
"nbformat": 4,
|
| 113 |
+
"nbformat_minor": 2
|
| 114 |
+
}
|