sbapan41 committed on
Commit 924e8cc · verified · 1 Parent(s): 09b7c9f

Upload 11 files

Files changed (11)
  1. LICENSE +201 -0
  2. README.md +11 -8
  3. SoniTranslate_Colab.ipynb +124 -0
  4. app.py +2 -0
  5. app_rvc.py +0 -0
  6. packages.txt +3 -0
  7. pre-requirements.txt +17 -0
  8. requirements.txt +19 -0
  9. requirements_xtts.txt +58 -0
  10. vci_pipeline.py +454 -0
  11. voice_main.py +732 -0
LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
README.md CHANGED
@@ -1,13 +1,16 @@
1
  ---
2
- title: Quantum Dubbing
3
- emoji: 🏆
4
- colorFrom: gray
5
- colorTo: blue
6
  sdk: gradio
7
- sdk_version: 5.23.1
8
- app_file: app.py
9
- pinned: false
10
- license: apache-2.0
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Quantum_Dubbing (Quantum_Dubbing)
3
+ emoji: 🌍
4
+ colorFrom: blue
5
+ colorTo: green
6
  sdk: gradio
7
+ sdk_version: 4.31.3
8
+ app_file: app_rvc.py
9
+ pinned: true
10
+ license: mit
11
+ short_description: Video Dubbing with Open Source Projects
12
+ preload_from_hub:
13
+ - Systran/faster-whisper-large-v3
14
  ---
15
 
16
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
SoniTranslate_Colab.ipynb ADDED
@@ -0,0 +1,124 @@
1
+ {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
5
+ "colab": {
6
+ "provenance": [],
7
+ "gpuType": "T4",
8
+ "include_colab_link": true
9
+ },
10
+ "kernelspec": {
11
+ "name": "python3",
12
+ "display_name": "Python 3"
13
+ },
14
+ "language_info": {
15
+ "name": "python"
16
+ },
17
+ "accelerator": "GPU"
18
+ },
19
+ "cells": [
20
+ {
21
+ "cell_type": "markdown",
22
+ "metadata": {
23
+ "id": "view-in-github",
24
+ "colab_type": "text"
25
+ },
26
+ "source": [
27
+ "<a href=\"https://colab.research.google.com/github/R3gm/SoniTranslate/blob/main/SoniTranslate_Colab.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
28
+ ]
29
+ },
30
+ {
31
+ "cell_type": "markdown",
32
+ "source": [
33
+ "# SoniTranslate\n",
34
+ "\n",
35
+ "| Description | Link |\n",
36
+ "| ----------- | ---- |\n",
37
+ "| 🎉 Repository | [![GitHub Repository](https://img.shields.io/badge/GitHub-Repository-black?style=flat-square&logo=github)](https://github.com/R3gm/SoniTranslate/) |\n",
38
+ "| 🚀 Online Demo in HF | [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/r3gm/SoniTranslate_translate_audio_of_a_video_content) |\n",
39
+ "\n",
40
+ "\n"
41
+ ],
42
+ "metadata": {
43
+ "id": "8lw0EgLex-YZ"
44
+ }
45
+ },
46
+ {
47
+ "cell_type": "code",
48
+ "execution_count": null,
49
+ "metadata": {
50
+ "id": "LUgwm0rfx0_J",
51
+ "cellView": "form"
52
+ },
53
+ "outputs": [],
54
+ "source": [
55
+ "# @title Install requirements for SoniTranslate\n",
56
+ "!git clone https://github.com/r3gm/SoniTranslate.git\n",
57
+ "%cd SoniTranslate\n",
58
+ "\n",
59
+ "!apt install git-lfs\n",
60
+ "!git lfs install\n",
61
+ "\n",
62
+ "!sed -i 's|git+https://github.com/R3gm/whisperX.git@cuda_11_8|git+https://github.com/R3gm/whisperX.git@cuda_12_x|' requirements_base.txt\n",
63
+ "!pip install -q -r requirements_base.txt\n",
64
+ "!pip install -q -r requirements_extra.txt\n",
65
+ "!pip install -q ort-nightly-gpu --index-url=https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ort-cuda-12-nightly/pypi/simple/\n",
66
+ "\n",
67
+ "Install_PIPER_TTS = True # @param {type:\"boolean\"}\n",
68
+ "\n",
69
+ "if Install_PIPER_TTS:\n",
70
+ " !pip install -q piper-tts==1.2.0\n",
71
+ "\n",
72
+ "Install_Coqui_XTTS = True # @param {type:\"boolean\"}\n",
73
+ "\n",
74
+ "if Install_Coqui_XTTS:\n",
75
+ " !pip install -q -r requirements_xtts.txt\n",
76
+ " !pip install -q TTS==0.21.1 --no-deps"
77
+ ]
78
+ },
79
+ {
80
+ "cell_type": "markdown",
81
+ "source": [
82
+ "One important step is to accept the license agreement for using Pyannote. You need to have an account on Hugging Face and `accept the license to use the models`: https://huggingface.co/pyannote/speaker-diarization and https://huggingface.co/pyannote/segmentation\n",
83
+ "\n",
84
+ "\n",
85
+ "\n",
86
+ "\n",
87
+ "Get your KEY TOKEN here: https://hf.co/settings/tokens"
88
+ ],
89
+ "metadata": {
90
+ "id": "LTaTstXPXNg2"
91
+ }
92
+ },
93
+ {
94
+ "cell_type": "code",
95
+ "source": [
96
+ "#@markdown # `RUN THE WEB APP`\n",
97
+ "YOUR_HF_TOKEN = \"\" #@param {type:'string'}\n",
98
+ "%env YOUR_HF_TOKEN={YOUR_HF_TOKEN}\n",
99
+ "theme = \"Taithrah/Minimal\" # @param [\"Taithrah/Minimal\", \"aliabid94/new-theme\", \"gstaff/xkcd\", \"ParityError/LimeFace\", \"abidlabs/pakistan\", \"rottenlittlecreature/Moon_Goblin\", \"ysharma/llamas\", \"gradio/dracula_revamped\"]\n",
100
+ "interface_language = \"english\" # @param ['arabic', 'azerbaijani', 'chinese_zh_cn', 'english', 'french', 'german', 'hindi', 'indonesian', 'italian', 'japanese', 'korean', 'marathi', 'polish', 'portuguese', 'russian', 'spanish', 'swedish', 'turkish', 'ukrainian', 'vietnamese']\n",
101
+ "verbosity_level = \"info\" # @param [\"debug\", \"info\", \"warning\", \"error\", \"critical\"]\n",
102
+ "\n",
103
+ "\n",
104
+ "%cd /content/SoniTranslate\n",
105
+ "!python app_rvc.py --theme {theme} --verbosity_level {verbosity_level} --language {interface_language} --public_url"
106
+ ],
107
+ "metadata": {
108
+ "id": "XkhXfaFw4R4J",
109
+ "cellView": "form"
110
+ },
111
+ "execution_count": null,
112
+ "outputs": []
113
+ },
114
+ {
115
+ "cell_type": "markdown",
116
+ "source": [
117
+ "Open the `public URL` when it appears"
118
+ ],
119
+ "metadata": {
120
+ "id": "KJW3KrhZJh0u"
121
+ }
122
+ }
123
+ ]
124
+ }
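
Note for readers running the notebook's final cell outside Colab: it boils down to exporting the Hugging Face token as the YOUR_HF_TOKEN environment variable and launching app_rvc.py with the chosen theme, verbosity level and interface language. A minimal Python sketch of that step, assuming the repository has already been cloned and its requirements installed (the token string and paths are placeholders, and the flag values are the notebook's defaults):

import os
import subprocess

# Placeholder token; the pyannote diarization models require a valid HF access token.
os.environ["YOUR_HF_TOKEN"] = "<your-hf-token>"

# Same flags as the notebook's "RUN THE WEB APP" cell, with its default values.
subprocess.run(
    [
        "python", "app_rvc.py",
        "--theme", "Taithrah/Minimal",
        "--verbosity_level", "info",
        "--language", "english",
        "--public_url",
    ],
    cwd="SoniTranslate",   # directory where the repo was cloned
    check=True,
)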
app.py ADDED
@@ -0,0 +1,2 @@
1
+ import os
2
+ os.system("python app_rvc.py --language french --theme aliabid94/new-theme")
app_rvc.py ADDED
The diff for this file is too large to render. See raw diff
 
packages.txt ADDED
@@ -0,0 +1,3 @@
1
+ git-lfs
2
+ aria2
3
+ ffmpeg
pre-requirements.txt ADDED
@@ -0,0 +1,17 @@
1
+ pip==23.1.2
2
+ --extra-index-url https://download.pytorch.org/whl/cu121
3
+ torch==2.2.0 # +cu121
4
+ torchvision # <=0.17.0+cu121
5
+ torchaudio # <=2.2.0+cu121
6
+ ctranslate2<=4.4.0
7
+ yt-dlp
8
+ gradio==4.19.2
9
+ pydub==0.25.1
10
+ edge_tts==6.1.7
11
+ deep_translator==1.11.4
12
+ git+https://github.com/R3gm/[email protected]
13
+ git+https://github.com/R3gm/whisperX.git@cuda_12_x
14
+ nest_asyncio
15
+ gTTS
16
+ gradio_client==0.10.1
17
+ IPython
requirements.txt ADDED
@@ -0,0 +1,19 @@
1
+ praat-parselmouth>=0.4.3
2
+ pyworld==0.3.2
3
+ faiss-cpu==1.7.3
4
+ torchcrepe==0.0.20
5
+ ffmpeg-python>=0.2.0
6
+ fairseq==0.12.2
7
+ gdown
8
+ rarfile
9
+ transformers
10
+ accelerate
11
+ optimum
12
+ sentencepiece
13
+ srt
14
+ git+https://github.com/R3gm/openvoice_package.git@lite
15
+ openai==1.14.3
16
+ tiktoken==0.6.0
17
+ # Documents
18
+ pypdf==4.2.0
19
+ python-docx
requirements_xtts.txt ADDED
@@ -0,0 +1,58 @@
1
+ # core deps
2
+ numpy==1.23.5
3
+ cython>=0.29.30
4
+ scipy>=1.11.2
5
+ torch
6
+ torchaudio
7
+ soundfile
8
+ librosa
9
+ scikit-learn
10
+ numba
11
+ inflect>=5.6.0
12
+ tqdm>=4.64.1
13
+ anyascii>=0.3.0
14
+ pyyaml>=6.0
15
+ fsspec>=2023.6.0 # <= 2023.9.1 makes aux tests fail
16
+ aiohttp>=3.8.1
17
+ packaging>=23.1
18
+ # deps for examples
19
+ flask>=2.0.1
20
+ # deps for inference
21
+ pysbd>=0.3.4
22
+ # deps for notebooks
23
+ umap-learn>=0.5.1
24
+ pandas
25
+ # deps for training
26
+ matplotlib
27
+ # coqui stack
28
+ trainer>=0.0.32
29
+ # config management
30
+ coqpit>=0.0.16
31
+ # chinese g2p deps
32
+ jieba
33
+ pypinyin
34
+ # korean
35
+ hangul_romanize
36
+ # gruut+supported langs
37
+ gruut[de,es,fr]==2.2.3
38
+ # deps for korean
39
+ jamo
40
+ nltk
41
+ g2pkk>=0.1.1
42
+ # deps for bangla
43
+ bangla
44
+ bnnumerizer
45
+ bnunicodenormalizer
46
+ #deps for tortoise
47
+ einops>=0.6.0
48
+ transformers
49
+ #deps for bark
50
+ encodec>=0.1.1
51
+ # deps for XTTS
52
+ unidecode>=1.3.2
53
+ num2words
54
+ spacy[ja]>=3
55
+
56
+ # after this
57
+ # pip install -r requirements_xtts.txt
58
+ # pip install TTS==0.21.1 --no-deps
vci_pipeline.py ADDED
@@ -0,0 +1,454 @@
1
+ import numpy as np, parselmouth, torch, pdb, sys
2
+ from time import time as ttime
3
+ import torch.nn.functional as F
4
+ import scipy.signal as signal
5
+ import pyworld, os, traceback, faiss, librosa, torchcrepe
6
+ from scipy import signal
7
+ from functools import lru_cache
8
+ from quantum_dubbing.logging_setup import logger
9
+
10
+ now_dir = os.getcwd()
11
+ sys.path.append(now_dir)
12
+
13
+ bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)
14
+
15
+ input_audio_path2wav = {}
16
+
17
+
18
+ @lru_cache
19
+ def cache_harvest_f0(input_audio_path, fs, f0max, f0min, frame_period):
20
+ audio = input_audio_path2wav[input_audio_path]
21
+ f0, t = pyworld.harvest(
22
+ audio,
23
+ fs=fs,
24
+ f0_ceil=f0max,
25
+ f0_floor=f0min,
26
+ frame_period=frame_period,
27
+ )
28
+ f0 = pyworld.stonemask(audio, f0, t, fs)
29
+ return f0
30
+
31
+
32
+ def change_rms(data1, sr1, data2, sr2, rate): # 1 is the input audio, 2 is the output audio, rate is the proportion of 2
33
+ # print(data1.max(),data2.max())
34
+ rms1 = librosa.feature.rms(
35
+ y=data1, frame_length=sr1 // 2 * 2, hop_length=sr1 // 2
36
+ ) # one dot every half second
37
+ rms2 = librosa.feature.rms(y=data2, frame_length=sr2 // 2 * 2, hop_length=sr2 // 2)
38
+ rms1 = torch.from_numpy(rms1)
39
+ rms1 = F.interpolate(
40
+ rms1.unsqueeze(0), size=data2.shape[0], mode="linear"
41
+ ).squeeze()
42
+ rms2 = torch.from_numpy(rms2)
43
+ rms2 = F.interpolate(
44
+ rms2.unsqueeze(0), size=data2.shape[0], mode="linear"
45
+ ).squeeze()
46
+ rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-6)
47
+ data2 *= (
48
+ torch.pow(rms1, torch.tensor(1 - rate))
49
+ * torch.pow(rms2, torch.tensor(rate - 1))
50
+ ).numpy()
51
+ return data2
52
+
53
+
54
+ class VC(object):
55
+ def __init__(self, tgt_sr, config):
56
+ self.x_pad, self.x_query, self.x_center, self.x_max, self.is_half = (
57
+ config.x_pad,
58
+ config.x_query,
59
+ config.x_center,
60
+ config.x_max,
61
+ config.is_half,
62
+ )
63
+ self.sr = 16000 # hubert input sampling rate
64
+ self.window = 160 # points per frame
65
+ self.t_pad = self.sr * self.x_pad # Pad time before and after each bar
66
+ self.t_pad_tgt = tgt_sr * self.x_pad
67
+ self.t_pad2 = self.t_pad * 2
68
+ self.t_query = self.sr * self.x_query # Query time before and after the cut point
69
+ self.t_center = self.sr * self.x_center # Query point cut position
70
+ self.t_max = self.sr * self.x_max # Query-free duration threshold
71
+ self.device = config.device
72
+
73
+ def get_f0(
74
+ self,
75
+ input_audio_path,
76
+ x,
77
+ p_len,
78
+ f0_up_key,
79
+ f0_method,
80
+ filter_radius,
81
+ inp_f0=None,
82
+ ):
83
+ global input_audio_path2wav
84
+ time_step = self.window / self.sr * 1000
85
+ f0_min = 50
86
+ f0_max = 1100
87
+ f0_mel_min = 1127 * np.log(1 + f0_min / 700)
88
+ f0_mel_max = 1127 * np.log(1 + f0_max / 700)
89
+ if f0_method == "pm":
90
+ f0 = (
91
+ parselmouth.Sound(x, self.sr)
92
+ .to_pitch_ac(
93
+ time_step=time_step / 1000,
94
+ voicing_threshold=0.6,
95
+ pitch_floor=f0_min,
96
+ pitch_ceiling=f0_max,
97
+ )
98
+ .selected_array["frequency"]
99
+ )
100
+ pad_size = (p_len - len(f0) + 1) // 2
101
+ if pad_size > 0 or p_len - len(f0) - pad_size > 0:
102
+ f0 = np.pad(
103
+ f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
104
+ )
105
+ elif f0_method == "harvest":
106
+ input_audio_path2wav[input_audio_path] = x.astype(np.double)
107
+ f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10)
108
+ if filter_radius > 2:
109
+ f0 = signal.medfilt(f0, 3)
110
+ elif f0_method == "crepe":
111
+ model = "full"
112
+ # Pick a batch size that doesn't cause memory errors on your gpu
113
+ batch_size = 512
114
+ # Compute pitch using first gpu
115
+ audio = torch.tensor(np.copy(x))[None].float()
116
+ f0, pd = torchcrepe.predict(
117
+ audio,
118
+ self.sr,
119
+ self.window,
120
+ f0_min,
121
+ f0_max,
122
+ model,
123
+ batch_size=batch_size,
124
+ device=self.device,
125
+ return_periodicity=True,
126
+ )
127
+ pd = torchcrepe.filter.median(pd, 3)
128
+ f0 = torchcrepe.filter.mean(f0, 3)
129
+ f0[pd < 0.1] = 0
130
+ f0 = f0[0].cpu().numpy()
131
+ elif "rmvpe" in f0_method:
132
+ if hasattr(self, "model_rmvpe") == False:
133
+ from lib.rmvpe import RMVPE
134
+
135
+ logger.info("Loading vocal pitch estimator model")
136
+ self.model_rmvpe = RMVPE(
137
+ "rmvpe.pt", is_half=self.is_half, device=self.device
138
+ )
139
+ thred = 0.03
140
+ if "+" in f0_method:
141
+ f0 = self.model_rmvpe.pitch_based_audio_inference(x, thred, f0_min, f0_max)
142
+ else:
143
+ f0 = self.model_rmvpe.infer_from_audio(x, thred)
144
+
145
+ f0 *= pow(2, f0_up_key / 12)
146
+ # with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
147
+ tf0 = self.sr // self.window # f0 points per second
148
+ if inp_f0 is not None:
149
+ delta_t = np.round(
150
+ (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1
151
+ ).astype("int16")
152
+ replace_f0 = np.interp(
153
+ list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1]
154
+ )
155
+ shape = f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)].shape[0]
156
+ f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[
157
+ :shape
158
+ ]
159
+ # with open("test_opt.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
160
+ f0bak = f0.copy()
161
+ f0_mel = 1127 * np.log(1 + f0 / 700)
162
+ f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
163
+ f0_mel_max - f0_mel_min
164
+ ) + 1
165
+ f0_mel[f0_mel <= 1] = 1
166
+ f0_mel[f0_mel > 255] = 255
167
+ try:
168
+ f0_coarse = np.rint(f0_mel).astype(np.int)
169
+ except: # noqa
170
+ f0_coarse = np.rint(f0_mel).astype(int)
171
+ return f0_coarse, f0bak # 1-0
172
+
173
+ def vc(
174
+ self,
175
+ model,
176
+ net_g,
177
+ sid,
178
+ audio0,
179
+ pitch,
180
+ pitchf,
181
+ times,
182
+ index,
183
+ big_npy,
184
+ index_rate,
185
+ version,
186
+ protect,
187
+ ): # ,file_index,file_big_npy
188
+ feats = torch.from_numpy(audio0)
189
+ if self.is_half:
190
+ feats = feats.half()
191
+ else:
192
+ feats = feats.float()
193
+ if feats.dim() == 2: # double channels
194
+ feats = feats.mean(-1)
195
+ assert feats.dim() == 1, feats.dim()
196
+ feats = feats.view(1, -1)
197
+ padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
198
+
199
+ inputs = {
200
+ "source": feats.to(self.device),
201
+ "padding_mask": padding_mask,
202
+ "output_layer": 9 if version == "v1" else 12,
203
+ }
204
+ t0 = ttime()
205
+ with torch.no_grad():
206
+ logits = model.extract_features(**inputs)
207
+ feats = model.final_proj(logits[0]) if version == "v1" else logits[0]
208
+ if protect < 0.5 and pitch != None and pitchf != None:
209
+ feats0 = feats.clone()
210
+ if (
211
+ isinstance(index, type(None)) == False
212
+ and isinstance(big_npy, type(None)) == False
213
+ and index_rate != 0
214
+ ):
215
+ npy = feats[0].cpu().numpy()
216
+ if self.is_half:
217
+ npy = npy.astype("float32")
218
+
219
+ # _, I = index.search(npy, 1)
220
+ # npy = big_npy[I.squeeze()]
221
+
222
+ score, ix = index.search(npy, k=8)
223
+ weight = np.square(1 / score)
224
+ weight /= weight.sum(axis=1, keepdims=True)
225
+ npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
226
+
227
+ if self.is_half:
228
+ npy = npy.astype("float16")
229
+ feats = (
230
+ torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
231
+ + (1 - index_rate) * feats
232
+ )
233
+
234
+ feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
235
+ if protect < 0.5 and pitch != None and pitchf != None:
236
+ feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(
237
+ 0, 2, 1
238
+ )
239
+ t1 = ttime()
240
+ p_len = audio0.shape[0] // self.window
241
+ if feats.shape[1] < p_len:
242
+ p_len = feats.shape[1]
243
+ if pitch != None and pitchf != None:
244
+ pitch = pitch[:, :p_len]
245
+ pitchf = pitchf[:, :p_len]
246
+
247
+ if protect < 0.5 and pitch != None and pitchf != None:
248
+ pitchff = pitchf.clone()
249
+ pitchff[pitchf > 0] = 1
250
+ pitchff[pitchf < 1] = protect
251
+ pitchff = pitchff.unsqueeze(-1)
252
+ feats = feats * pitchff + feats0 * (1 - pitchff)
253
+ feats = feats.to(feats0.dtype)
254
+ p_len = torch.tensor([p_len], device=self.device).long()
255
+ with torch.no_grad():
256
+ if pitch != None and pitchf != None:
257
+ audio1 = (
258
+ (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0])
259
+ .data.cpu()
260
+ .float()
261
+ .numpy()
262
+ )
263
+ else:
264
+ audio1 = (
265
+ (net_g.infer(feats, p_len, sid)[0][0, 0]).data.cpu().float().numpy()
266
+ )
267
+ del feats, p_len, padding_mask
268
+ if torch.cuda.is_available():
269
+ torch.cuda.empty_cache()
270
+ t2 = ttime()
271
+ times[0] += t1 - t0
272
+ times[2] += t2 - t1
273
+ return audio1
274
+
275
+ def pipeline(
276
+ self,
277
+ model,
278
+ net_g,
279
+ sid,
280
+ audio,
281
+ input_audio_path,
282
+ times,
283
+ f0_up_key,
284
+ f0_method,
285
+ file_index,
286
+ # file_big_npy,
287
+ index_rate,
288
+ if_f0,
289
+ filter_radius,
290
+ tgt_sr,
291
+ resample_sr,
292
+ rms_mix_rate,
293
+ version,
294
+ protect,
295
+ f0_file=None,
296
+ ):
297
+ if (
298
+ file_index != ""
299
+ # and file_big_npy != ""
300
+ # and os.path.exists(file_big_npy) == True
301
+ and os.path.exists(file_index) == True
302
+ and index_rate != 0
303
+ ):
304
+ try:
305
+ index = faiss.read_index(file_index)
306
+ # big_npy = np.load(file_big_npy)
307
+ big_npy = index.reconstruct_n(0, index.ntotal)
308
+ except:
309
+ traceback.print_exc()
310
+ index = big_npy = None
311
+ else:
312
+ index = big_npy = None
313
+ logger.warning("File index Not found, set None")
314
+
315
+ audio = signal.filtfilt(bh, ah, audio)
316
+ audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")
317
+ opt_ts = []
318
+ if audio_pad.shape[0] > self.t_max:
319
+ audio_sum = np.zeros_like(audio)
320
+ for i in range(self.window):
321
+ audio_sum += audio_pad[i : i - self.window]
322
+ for t in range(self.t_center, audio.shape[0], self.t_center):
323
+ opt_ts.append(
324
+ t
325
+ - self.t_query
326
+ + np.where(
327
+ np.abs(audio_sum[t - self.t_query : t + self.t_query])
328
+ == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min()
329
+ )[0][0]
330
+ )
331
+ s = 0
332
+ audio_opt = []
333
+ t = None
334
+ t1 = ttime()
335
+ audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
336
+ p_len = audio_pad.shape[0] // self.window
337
+ inp_f0 = None
338
+ if hasattr(f0_file, "name") == True:
339
+ try:
340
+ with open(f0_file.name, "r") as f:
341
+ lines = f.read().strip("\n").split("\n")
342
+ inp_f0 = []
343
+ for line in lines:
344
+ inp_f0.append([float(i) for i in line.split(",")])
345
+ inp_f0 = np.array(inp_f0, dtype="float32")
346
+ except:
347
+ traceback.print_exc()
348
+ sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
349
+ pitch, pitchf = None, None
350
+ if if_f0 == 1:
351
+ pitch, pitchf = self.get_f0(
352
+ input_audio_path,
353
+ audio_pad,
354
+ p_len,
355
+ f0_up_key,
356
+ f0_method,
357
+ filter_radius,
358
+ inp_f0,
359
+ )
360
+ pitch = pitch[:p_len]
361
+ pitchf = pitchf[:p_len]
362
+ if self.device == "mps":
363
+ pitchf = pitchf.astype(np.float32)
364
+ pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
365
+ pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
366
+ t2 = ttime()
367
+ times[1] += t2 - t1
368
+ for t in opt_ts:
369
+ t = t // self.window * self.window
370
+ if if_f0 == 1:
371
+ audio_opt.append(
372
+ self.vc(
373
+ model,
374
+ net_g,
375
+ sid,
376
+ audio_pad[s : t + self.t_pad2 + self.window],
377
+ pitch[:, s // self.window : (t + self.t_pad2) // self.window],
378
+ pitchf[:, s // self.window : (t + self.t_pad2) // self.window],
379
+ times,
380
+ index,
381
+ big_npy,
382
+ index_rate,
383
+ version,
384
+ protect,
385
+ )[self.t_pad_tgt : -self.t_pad_tgt]
386
+ )
387
+ else:
388
+ audio_opt.append(
389
+ self.vc(
390
+ model,
391
+ net_g,
392
+ sid,
393
+ audio_pad[s : t + self.t_pad2 + self.window],
394
+ None,
395
+ None,
396
+ times,
397
+ index,
398
+ big_npy,
399
+ index_rate,
400
+ version,
401
+ protect,
402
+ )[self.t_pad_tgt : -self.t_pad_tgt]
403
+ )
404
+ s = t
405
+ if if_f0 == 1:
406
+ audio_opt.append(
407
+ self.vc(
408
+ model,
409
+ net_g,
410
+ sid,
411
+ audio_pad[t:],
412
+ pitch[:, t // self.window :] if t is not None else pitch,
413
+ pitchf[:, t // self.window :] if t is not None else pitchf,
414
+ times,
415
+ index,
416
+ big_npy,
417
+ index_rate,
418
+ version,
419
+ protect,
420
+ )[self.t_pad_tgt : -self.t_pad_tgt]
421
+ )
422
+ else:
423
+ audio_opt.append(
424
+ self.vc(
425
+ model,
426
+ net_g,
427
+ sid,
428
+ audio_pad[t:],
429
+ None,
430
+ None,
431
+ times,
432
+ index,
433
+ big_npy,
434
+ index_rate,
435
+ version,
436
+ protect,
437
+ )[self.t_pad_tgt : -self.t_pad_tgt]
438
+ )
439
+ audio_opt = np.concatenate(audio_opt)
440
+ if rms_mix_rate != 1:
441
+ audio_opt = change_rms(audio, 16000, audio_opt, tgt_sr, rms_mix_rate)
442
+ if resample_sr >= 16000 and tgt_sr != resample_sr:
443
+ audio_opt = librosa.resample(
444
+ audio_opt, orig_sr=tgt_sr, target_sr=resample_sr
445
+ )
446
+ audio_max = np.abs(audio_opt).max() / 0.99
447
+ max_int16 = 32768
448
+ if audio_max > 1:
449
+ max_int16 /= audio_max
450
+ audio_opt = (audio_opt * max_int16).astype(np.int16)
451
+ del pitch, pitchf, sid
452
+ if torch.cuda.is_available():
453
+ torch.cuda.empty_cache()
454
+ return audio_opt
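
A note on the change_rms helper above: it blends the loudness envelope of the converted audio back toward the source by scaling each frame with rms_source ** (1 - rate) * rms_converted ** (rate - 1), so rate = 1 keeps the converted envelope and rate = 0 imposes the source envelope. A small self-contained NumPy sketch of that per-frame gain (synthetic sine signals and a crude frame-wise RMS stand in for the librosa call):

import numpy as np

def frame_rms(x, frame):
    # one RMS value per non-overlapping frame
    n = len(x) // frame
    return np.sqrt((x[: n * frame].reshape(n, frame) ** 2).mean(axis=1))

sr = 16000
t = np.linspace(0, 1, sr, endpoint=False)
source = 0.8 * np.sin(2 * np.pi * 220 * t)      # loud input audio
converted = 0.2 * np.sin(2 * np.pi * 220 * t)   # quiet converted audio

rate = 0.25                                      # cf. the envelope_ratio default in voice_main.py
rms1 = frame_rms(source, sr // 2)                # one value every half second, as in change_rms
rms2 = np.maximum(frame_rms(converted, sr // 2), 1e-6)

gain = rms1 ** (1 - rate) * rms2 ** (rate - 1)   # per-frame correction factor
out = (converted.reshape(len(gain), -1) * gain[:, None]).ravel()
print(frame_rms(out, sr // 2))                   # RMS pulled toward the source level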
voice_main.py ADDED
@@ -0,0 +1,732 @@
1
+ from quantum_dubbing.logging_setup import logger
2
+ import torch
3
+ import gc
4
+ import numpy as np
5
+ import os
6
+ import shutil
7
+ import warnings
8
+ import threading
9
+ from tqdm import tqdm
10
+ from lib.infer_pack.models import (
11
+ SynthesizerTrnMs256NSFsid,
12
+ SynthesizerTrnMs256NSFsid_nono,
13
+ SynthesizerTrnMs768NSFsid,
14
+ SynthesizerTrnMs768NSFsid_nono,
15
+ )
16
+ from lib.audio import load_audio
17
+ import soundfile as sf
18
+ import edge_tts
19
+ import asyncio
20
+ from quantum_dubbing.utils import remove_directory_contents, create_directories
21
+ from scipy import signal
22
+ from time import time as ttime
23
+ import faiss
24
+ from vci_pipeline import VC, change_rms, bh, ah
25
+ import librosa
26
+
27
+ warnings.filterwarnings("ignore")
28
+
29
+
30
+ class Config:
31
+ def __init__(self, only_cpu=False):
32
+ self.device = "cuda:0"
33
+ self.is_half = True
34
+ self.n_cpu = 0
35
+ self.gpu_name = None
36
+ self.gpu_mem = None
37
+ (
38
+ self.x_pad,
39
+ self.x_query,
40
+ self.x_center,
41
+ self.x_max
42
+ ) = self.device_config(only_cpu)
43
+
44
+ def device_config(self, only_cpu) -> tuple:
45
+ if torch.cuda.is_available() and not only_cpu:
46
+ i_device = int(self.device.split(":")[-1])
47
+ self.gpu_name = torch.cuda.get_device_name(i_device)
48
+ if (
49
+ ("16" in self.gpu_name and "V100" not in self.gpu_name.upper())
50
+ or "P40" in self.gpu_name.upper()
51
+ or "1060" in self.gpu_name
52
+ or "1070" in self.gpu_name
53
+ or "1080" in self.gpu_name
54
+ ):
55
+ logger.info(
56
+ "16/10 Series GPUs and P40 excel "
57
+ "in single-precision tasks."
58
+ )
59
+ self.is_half = False
60
+ else:
61
+ self.gpu_name = None
62
+ self.gpu_mem = int(
63
+ torch.cuda.get_device_properties(i_device).total_memory
64
+ / 1024
65
+ / 1024
66
+ / 1024
67
+ + 0.4
68
+ )
69
+ elif torch.backends.mps.is_available() and not only_cpu:
70
+ logger.info("Supported N-card not found, using MPS for inference")
71
+ self.device = "mps"
72
+ else:
73
+ logger.info("No supported N-card found, using CPU for inference")
74
+ self.device = "cpu"
75
+ self.is_half = False
76
+
77
+ if self.n_cpu == 0:
78
+ self.n_cpu = os.cpu_count()
79
+
80
+ if self.is_half:
81
+ # 6GB VRAM configuration
82
+ x_pad = 3
83
+ x_query = 10
84
+ x_center = 60
85
+ x_max = 65
86
+ else:
87
+ # 5GB VRAM configuration
88
+ x_pad = 1
89
+ x_query = 6
90
+ x_center = 38
91
+ x_max = 41
92
+
93
+ if self.gpu_mem is not None and self.gpu_mem <= 4:
94
+ x_pad = 1
95
+ x_query = 5
96
+ x_center = 30
97
+ x_max = 32
98
+
99
+ logger.info(
100
+ f"Config: Device is {self.device}, "
101
+ f"half precision is {self.is_half}"
102
+ )
103
+
104
+ return x_pad, x_query, x_center, x_max
105
+
106
+
107
+ BASE_DOWNLOAD_LINK = "https://huggingface.co/r3gm/sonitranslate_voice_models/resolve/main/"
108
+ BASE_MODELS = [
109
+ "hubert_base.pt",
110
+ "rmvpe.pt"
111
+ ]
112
+ BASE_DIR = "."
113
+
114
+
115
+ def load_hu_bert(config):
116
+ from fairseq import checkpoint_utils
117
+ from quantum_dubbing.utils import download_manager
118
+
119
+ for id_model in BASE_MODELS:
120
+ download_manager(
121
+ os.path.join(BASE_DOWNLOAD_LINK, id_model), BASE_DIR
122
+ )
123
+
124
+ models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
125
+ ["hubert_base.pt"],
126
+ suffix="",
127
+ )
128
+ hubert_model = models[0]
129
+ hubert_model = hubert_model.to(config.device)
130
+ if config.is_half:
131
+ hubert_model = hubert_model.half()
132
+ else:
133
+ hubert_model = hubert_model.float()
134
+ hubert_model.eval()
135
+
136
+ return hubert_model
137
+
138
+
139
+ def load_trained_model(model_path, config):
140
+
141
+ if not model_path:
142
+ raise ValueError("No model found")
143
+
144
+ logger.info("Loading %s" % model_path)
145
+ cpt = torch.load(model_path, map_location="cpu")
146
+ tgt_sr = cpt["config"][-1]
147
+ cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] # n_spk
148
+ if_f0 = cpt.get("f0", 1)
149
+ if if_f0 == 0:
150
+ # protect to 0.5 need?
151
+ pass
152
+
153
+ version = cpt.get("version", "v1")
154
+ if version == "v1":
155
+ if if_f0 == 1:
156
+ net_g = SynthesizerTrnMs256NSFsid(
157
+ *cpt["config"], is_half=config.is_half
158
+ )
159
+ else:
160
+ net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
161
+ elif version == "v2":
162
+ if if_f0 == 1:
163
+ net_g = SynthesizerTrnMs768NSFsid(
164
+ *cpt["config"], is_half=config.is_half
165
+ )
166
+ else:
167
+ net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
168
+ del net_g.enc_q
169
+
170
+ net_g.load_state_dict(cpt["weight"], strict=False)
171
+ net_g.eval().to(config.device)
172
+
173
+ if config.is_half:
174
+ net_g = net_g.half()
175
+ else:
176
+ net_g = net_g.float()
177
+
178
+ vc = VC(tgt_sr, config)
179
+ n_spk = cpt["config"][-3]
180
+
181
+ return n_spk, tgt_sr, net_g, vc, cpt, version
182
+
183
+
184
+ class ClassVoices:
185
+ def __init__(self, only_cpu=False):
186
+ self.model_config = {}
187
+ self.config = None
188
+ self.only_cpu = only_cpu
189
+
190
+ def apply_conf(
191
+ self,
192
+ tag="base_model",
193
+ file_model="",
194
+ pitch_algo="pm",
195
+ pitch_lvl=0,
196
+ file_index="",
197
+ index_influence=0.66,
198
+ respiration_median_filtering=3,
199
+ envelope_ratio=0.25,
200
+ consonant_breath_protection=0.33,
201
+ resample_sr=0,
202
+ file_pitch_algo="",
203
+ ):
204
+
205
+ if not file_model:
206
+ raise ValueError("Model not found")
207
+
208
+ if file_index is None:
209
+ file_index = ""
210
+
211
+ if file_pitch_algo is None:
212
+ file_pitch_algo = ""
213
+
214
+ if not self.config:
215
+ self.config = Config(self.only_cpu)
216
+ self.hu_bert_model = None
217
+ self.model_pitch_estimator = None
218
+
219
+ self.model_config[tag] = {
220
+ "file_model": file_model,
221
+ "pitch_algo": pitch_algo,
222
+ "pitch_lvl": pitch_lvl, # no decimal
223
+ "file_index": file_index,
224
+ "index_influence": index_influence,
225
+ "respiration_median_filtering": respiration_median_filtering,
226
+ "envelope_ratio": envelope_ratio,
227
+ "consonant_breath_protection": consonant_breath_protection,
228
+ "resample_sr": resample_sr,
229
+ "file_pitch_algo": file_pitch_algo,
230
+ }
231
+ return f"CONFIGURATION APPLIED FOR {tag}: {file_model}"
232
+
233
+ def infer(
234
+ self,
235
+ task_id,
236
+ params,
237
+ # load model
238
+ n_spk,
239
+ tgt_sr,
240
+ net_g,
241
+ pipe,
242
+ cpt,
243
+ version,
244
+ if_f0,
245
+ # load index
246
+ index_rate,
247
+ index,
248
+ big_npy,
249
+ # load f0 file
250
+ inp_f0,
251
+ # audio file
252
+ input_audio_path,
253
+ overwrite,
254
+ ):
255
+
256
+ f0_method = params["pitch_algo"]
257
+ f0_up_key = params["pitch_lvl"]
258
+ filter_radius = params["respiration_median_filtering"]
259
+ resample_sr = params["resample_sr"]
260
+ rms_mix_rate = params["envelope_ratio"]
261
+ protect = params["consonant_breath_protection"]
262
+
263
+ if not os.path.exists(input_audio_path):
264
+ raise ValueError(
265
+ "The audio file was not found or is not "
266
+ f"a valid file: {input_audio_path}"
267
+ )
268
+
269
+ f0_up_key = int(f0_up_key)
270
+
271
+ audio = load_audio(input_audio_path, 16000)
272
+
273
+ # Normalize audio
274
+ audio_max = np.abs(audio).max() / 0.95
275
+ if audio_max > 1:
276
+ audio /= audio_max
277
+
278
+ times = [0, 0, 0]
279
+
280
+ # filters audio signal, pads it, computes sliding window sums,
281
+ # and extracts optimized time indices
282
+ audio = signal.filtfilt(bh, ah, audio)
283
+ audio_pad = np.pad(
284
+ audio, (pipe.window // 2, pipe.window // 2), mode="reflect"
285
+ )
286
+ opt_ts = []
287
+ if audio_pad.shape[0] > pipe.t_max:
288
+ audio_sum = np.zeros_like(audio)
289
+ for i in range(pipe.window):
290
+ audio_sum += audio_pad[i:i - pipe.window]
291
+ for t in range(pipe.t_center, audio.shape[0], pipe.t_center):
292
+ opt_ts.append(
293
+ t
294
+ - pipe.t_query
295
+ + np.where(
296
+ np.abs(audio_sum[t - pipe.t_query: t + pipe.t_query])
297
+ == np.abs(audio_sum[t - pipe.t_query: t + pipe.t_query]).min()
298
+ )[0][0]
299
+ )
300
+
301
+ s = 0
302
+ audio_opt = []
303
+ t = None
304
+ t1 = ttime()
305
+
306
+ sid_value = 0
307
+ sid = torch.tensor(sid_value, device=pipe.device).unsqueeze(0).long()
308
+
309
+ # Pads audio symmetrically, calculates length divided by window size.
310
+ audio_pad = np.pad(audio, (pipe.t_pad, pipe.t_pad), mode="reflect")
311
+ p_len = audio_pad.shape[0] // pipe.window
312
+
313
+ # Estimates pitch from audio signal
314
+ pitch, pitchf = None, None
315
+ if if_f0 == 1:
316
+ pitch, pitchf = pipe.get_f0(
317
+ input_audio_path,
318
+ audio_pad,
319
+ p_len,
320
+ f0_up_key,
321
+ f0_method,
322
+ filter_radius,
323
+ inp_f0,
324
+ )
325
+ pitch = pitch[:p_len]
326
+ pitchf = pitchf[:p_len]
327
+ if pipe.device == "mps":
328
+ pitchf = pitchf.astype(np.float32)
329
+ pitch = torch.tensor(
330
+ pitch, device=pipe.device
331
+ ).unsqueeze(0).long()
332
+ pitchf = torch.tensor(
333
+ pitchf, device=pipe.device
334
+ ).unsqueeze(0).float()
335
+
336
+ t2 = ttime()
337
+ times[1] += t2 - t1
338
+ for t in opt_ts:
339
+ t = t // pipe.window * pipe.window
340
+ if if_f0 == 1:
341
+ pitch_slice = pitch[
342
+ :, s // pipe.window: (t + pipe.t_pad2) // pipe.window
343
+ ]
344
+ pitchf_slice = pitchf[
345
+ :, s // pipe.window: (t + pipe.t_pad2) // pipe.window
346
+ ]
347
+ else:
348
+ pitch_slice = None
349
+ pitchf_slice = None
350
+
351
+ audio_slice = audio_pad[s:t + pipe.t_pad2 + pipe.window]
352
+ audio_opt.append(
353
+ pipe.vc(
354
+ self.hu_bert_model,
355
+ net_g,
356
+ sid,
357
+ audio_slice,
358
+ pitch_slice,
359
+ pitchf_slice,
360
+ times,
361
+ index,
362
+ big_npy,
363
+ index_rate,
364
+ version,
365
+ protect,
366
+ )[pipe.t_pad_tgt:-pipe.t_pad_tgt]
367
+ )
368
+ s = t
369
+
370
+ pitch_end_slice = pitch[
371
+ :, t // pipe.window:
372
+ ] if t is not None else pitch
373
+ pitchf_end_slice = pitchf[
374
+ :, t // pipe.window:
375
+ ] if t is not None else pitchf
376
+
377
+ audio_opt.append(
378
+ pipe.vc(
379
+ self.hu_bert_model,
380
+ net_g,
381
+ sid,
382
+ audio_pad[t:],
383
+ pitch_end_slice,
384
+ pitchf_end_slice,
385
+ times,
386
+ index,
387
+ big_npy,
388
+ index_rate,
389
+ version,
390
+ protect,
391
+ )[pipe.t_pad_tgt:-pipe.t_pad_tgt]
392
+ )
393
+
394
+ audio_opt = np.concatenate(audio_opt)
395
+ if rms_mix_rate != 1:
396
+ audio_opt = change_rms(
397
+ audio, 16000, audio_opt, tgt_sr, rms_mix_rate
398
+ )
399
+ if resample_sr >= 16000 and tgt_sr != resample_sr:
400
+ audio_opt = librosa.resample(
401
+ audio_opt, orig_sr=tgt_sr, target_sr=resample_sr
402
+ )
403
+ audio_max = np.abs(audio_opt).max() / 0.99
404
+ max_int16 = 32768
405
+ if audio_max > 1:
406
+ max_int16 /= audio_max
407
+ audio_opt = (audio_opt * max_int16).astype(np.int16)
408
+ del pitch, pitchf, sid
409
+ if torch.cuda.is_available():
410
+ torch.cuda.empty_cache()
411
+
412
+ if tgt_sr != resample_sr >= 16000:
413
+ final_sr = resample_sr
414
+ else:
415
+ final_sr = tgt_sr
416
+
417
+ """
418
+ "Success.\n %s\nTime:\n npy:%ss, f0:%ss, infer:%ss" % (
419
+ times[0],
420
+ times[1],
421
+ times[2],
422
+ ), (final_sr, audio_opt)
423
+
424
+ """
425
+
426
+ if overwrite:
427
+ output_audio_path = input_audio_path # Overwrite
428
+ else:
429
+ basename = os.path.basename(input_audio_path)
430
+ dirname = os.path.dirname(input_audio_path)
431
+
432
+ new_basename = basename.split(
433
+ '.')[0] + "_edited." + basename.split('.')[-1]
434
+ new_path = os.path.join(dirname, new_basename)
435
+ logger.info(str(new_path))
436
+
437
+ output_audio_path = new_path
438
+
439
+ # Save file
440
+ sf.write(
441
+ file=output_audio_path,
442
+ samplerate=final_sr,
443
+ data=audio_opt
444
+ )
445
+
446
+ self.model_config[task_id]["result"].append(output_audio_path)
447
+ self.output_list.append(output_audio_path)
448
+
449
+ def make_test(
450
+ self,
451
+ tts_text,
452
+ tts_voice,
453
+ model_path,
454
+ index_path,
455
+ transpose,
456
+ f0_method,
457
+ ):
458
+
459
+ folder_test = "test"
460
+ tag = "test_edge"
461
+ tts_file = "test/test.wav"
462
+ tts_edited = "test/test_edited.wav"
463
+
464
+ create_directories(folder_test)
465
+ remove_directory_contents(folder_test)
466
+
467
+ if "SET_LIMIT" == os.getenv("DEMO"):
468
+ if len(tts_text) > 60:
469
+ tts_text = tts_text[:60]
470
+ logger.warning("DEMO; limit to 60 characters")
471
+
472
+ try:
473
+ asyncio.run(edge_tts.Communicate(
474
+ tts_text, "-".join(tts_voice.split('-')[:-1])
475
+ ).save(tts_file))
476
+ except Exception as e:
477
+ raise ValueError(
478
+ "No audio was received. Please change the "
479
+ f"tts voice for {tts_voice}. Error: {str(e)}"
480
+ )
481
+
482
+ shutil.copy(tts_file, tts_edited)
483
+
484
+ self.apply_conf(
485
+ tag=tag,
486
+ file_model=model_path,
487
+ pitch_algo=f0_method,
488
+ pitch_lvl=transpose,
489
+ file_index=index_path,
490
+ index_influence=0.66,
491
+ respiration_median_filtering=3,
492
+ envelope_ratio=0.25,
493
+ consonant_breath_protection=0.33,
494
+ )
495
+
496
+ self(
497
+ audio_files=tts_edited,
498
+ tag_list=tag,
499
+ overwrite=True
500
+ )
501
+
502
+ return tts_edited, tts_file
503
+
504
+ def run_threads(self, threads):
505
+ # Start threads
506
+ for thread in threads:
507
+ thread.start()
508
+
509
+ # Wait for all threads to finish
510
+ for thread in threads:
511
+ thread.join()
512
+
513
+ gc.collect()
514
+ torch.cuda.empty_cache()
515
+
516
+ def unload_models(self):
517
+ self.hu_bert_model = None
518
+ self.model_pitch_estimator = None
519
+ gc.collect()
520
+ torch.cuda.empty_cache()
521
+
522
+ def __call__(
523
+ self,
524
+ audio_files=[],
525
+ tag_list=[],
526
+ overwrite=False,
527
+ parallel_workers=1,
528
+ ):
529
+ logger.info(f"Parallel workers: {str(parallel_workers)}")
530
+
531
+ self.output_list = []
532
+
533
+ if not self.model_config:
534
+ raise ValueError("No model has been configured for inference")
535
+
536
+ if isinstance(audio_files, str):
537
+ audio_files = [audio_files]
538
+ if isinstance(tag_list, str):
539
+ tag_list = [tag_list]
540
+
541
+ if not audio_files:
542
+ raise ValueError("No audio found to convert")
543
+ if not tag_list:
544
+ tag_list = [list(self.model_config.keys())[-1]] * len(audio_files)
545
+
546
+ if len(audio_files) > len(tag_list):
547
+ logger.info("Extend tag list to match audio files")
548
+ extend_number = len(audio_files) - len(tag_list)
549
+ tag_list.extend([tag_list[0]] * extend_number)
550
+
551
+ if len(audio_files) < len(tag_list):
552
+ logger.info("Cut list tags")
553
+ tag_list = tag_list[:len(audio_files)]
554
+
555
+ tag_file_pairs = list(zip(tag_list, audio_files))
556
+ sorted_tag_file = sorted(tag_file_pairs, key=lambda x: x[0])
557
+
558
+ # Base params
559
+ if not self.hu_bert_model:
560
+ self.hu_bert_model = load_hu_bert(self.config)
561
+
562
+ cache_params = None
563
+ threads = []
564
+ progress_bar = tqdm(total=len(tag_list), desc="Progress")
565
+ for i, (id_tag, input_audio_path) in enumerate(sorted_tag_file):
566
+
567
+ if id_tag not in self.model_config.keys():
568
+ logger.info(
569
+ f"No configured model for {id_tag} with {input_audio_path}"
570
+ )
571
+ continue
572
+
573
+ if (
574
+ len(threads) >= parallel_workers
575
+ or cache_params != id_tag
576
+ and cache_params is not None
577
+ ):
578
+
579
+ self.run_threads(threads)
580
+ progress_bar.update(len(threads))
581
+
582
+ threads = []
583
+
584
+ if cache_params != id_tag:
585
+
586
+ self.model_config[id_tag]["result"] = []
587
+
588
+ # Unload previous
589
+ (
590
+ n_spk,
591
+ tgt_sr,
592
+ net_g,
593
+ pipe,
594
+ cpt,
595
+ version,
596
+ if_f0,
597
+ index_rate,
598
+ index,
599
+ big_npy,
600
+ inp_f0,
601
+ ) = [None] * 11
602
+ gc.collect()
603
+ torch.cuda.empty_cache()
604
+
605
+ # Model params
606
+ params = self.model_config[id_tag]
607
+
608
+ model_path = params["file_model"]
609
+ f0_method = params["pitch_algo"]
610
+ file_index = params["file_index"]
611
+ index_rate = params["index_influence"]
612
+ f0_file = params["file_pitch_algo"]
613
+
614
+ # Load model
615
+ (
616
+ n_spk,
617
+ tgt_sr,
618
+ net_g,
619
+ pipe,
620
+ cpt,
621
+ version
622
+ ) = load_trained_model(model_path, self.config)
623
+ if_f0 = cpt.get("f0", 1) # pitch data
624
+
625
+ # Load index
626
+ if os.path.exists(file_index) and index_rate != 0:
627
+ try:
628
+ index = faiss.read_index(file_index)
629
+ big_npy = index.reconstruct_n(0, index.ntotal)
630
+ except Exception as error:
631
+ logger.error(f"Index: {str(error)}")
632
+ index_rate = 0
633
+ index = big_npy = None
634
+ else:
635
+ logger.warning("File index not found")
636
+ index_rate = 0
637
+ index = big_npy = None
638
+
639
+ # Load f0 file
640
+ inp_f0 = None
641
+ if os.path.exists(f0_file):
642
+ try:
643
+ with open(f0_file, "r") as f:
644
+ lines = f.read().strip("\n").split("\n")
645
+ inp_f0 = []
646
+ for line in lines:
647
+ inp_f0.append([float(i) for i in line.split(",")])
648
+ inp_f0 = np.array(inp_f0, dtype="float32")
649
+ except Exception as error:
650
+ logger.error(f"f0 file: {str(error)}")
651
+
652
+ if "rmvpe" in f0_method:
653
+ if not self.model_pitch_estimator:
654
+ from lib.rmvpe import RMVPE
655
+
656
+ logger.info("Loading vocal pitch estimator model")
657
+ self.model_pitch_estimator = RMVPE(
658
+ "rmvpe.pt",
659
+ is_half=self.config.is_half,
660
+ device=self.config.device
661
+ )
662
+
663
+ pipe.model_rmvpe = self.model_pitch_estimator
664
+
665
+ cache_params = id_tag
666
+
667
+ # self.infer(
668
+ # id_tag,
669
+ # params,
670
+ # # load model
671
+ # n_spk,
672
+ # tgt_sr,
673
+ # net_g,
674
+ # pipe,
675
+ # cpt,
676
+ # version,
677
+ # if_f0,
678
+ # # load index
679
+ # index_rate,
680
+ # index,
681
+ # big_npy,
682
+ # # load f0 file
683
+ # inp_f0,
684
+ # # output file
685
+ # input_audio_path,
686
+ # overwrite,
687
+ # )
688
+
689
+ thread = threading.Thread(
690
+ target=self.infer,
691
+ args=(
692
+ id_tag,
693
+ params,
694
+ # loaded model
695
+ n_spk,
696
+ tgt_sr,
697
+ net_g,
698
+ pipe,
699
+ cpt,
700
+ version,
701
+ if_f0,
702
+ # loaded index
703
+ index_rate,
704
+ index,
705
+ big_npy,
706
+ # loaded f0 file
707
+ inp_f0,
708
+ # audio file
709
+ input_audio_path,
710
+ overwrite,
711
+ )
712
+ )
713
+
714
+ threads.append(thread)
715
+
716
+ # Run last
717
+ if threads:
718
+ self.run_threads(threads)
719
+
720
+ progress_bar.update(len(threads))
721
+ progress_bar.close()
722
+
723
+ final_result = []
724
+ valid_tags = set(tag_list)
725
+ for tag in valid_tags:
726
+ if (
727
+ tag in self.model_config.keys()
728
+ and "result" in self.model_config[tag].keys()
729
+ ):
730
+ final_result.extend(self.model_config[tag]["result"])
731
+
732
+ return final_result
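
The make_test method above also shows the intended calling pattern for ClassVoices: register a configuration under a tag with apply_conf, then call the instance on a list of audio files. A minimal sketch of that pattern, assuming the repository's quantum_dubbing helpers, the hubert_base.pt/rmvpe.pt weights and the referenced model files are available locally (all paths below are placeholders):

from voice_main import ClassVoices

voices = ClassVoices(only_cpu=False)

# Register a voice-conversion configuration under the tag "speaker_1".
voices.apply_conf(
    tag="speaker_1",
    file_model="weights/my_voice.pth",        # placeholder RVC model
    pitch_algo="rmvpe",
    pitch_lvl=0,
    file_index="weights/my_voice.index",      # placeholder faiss index
    index_influence=0.66,
    respiration_median_filtering=3,
    envelope_ratio=0.25,
    consonant_breath_protection=0.33,
)

# Convert the files; with overwrite=False a *_edited copy is written next to each input.
output_paths = voices(
    audio_files=["audio/segment_01.wav"],     # placeholder input file
    tag_list=["speaker_1"],
    overwrite=False,
)
print(output_paths)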