Spaces:
Runtime error
Runtime error
update
Browse files- .gitattributes +1 -0
- app.py +34 -12
- cached_outputs/0.wav +0 -0
- cached_outputs/1.wav +0 -0
- cached_outputs/2.wav +0 -0
- cached_outputs/3.wav +0 -0
- cached_outputs/4.wav +0 -0
- hash_code_for_cached_output.py +56 -0
- openvoicev2.mp4 +3 -0
- requirements.txt +2 -1
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
*.mp4 filter=lfs diff=lfs merge=lfs -text
|
app.py
CHANGED
|
@@ -6,7 +6,8 @@ import base64
|
|
| 6 |
import json
|
| 7 |
import time
|
| 8 |
import re
|
| 9 |
-
|
|
|
|
| 10 |
|
| 11 |
|
| 12 |
API_URL = os.environ.get("API_URL")
|
|
@@ -60,6 +61,24 @@ def predict(prompt, style, audio_file_pth, speed, agree):
|
|
| 60 |
None,
|
| 61 |
)
|
| 62 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
# first detect the input language
|
| 64 |
language_predicted = langid.classify(prompt)[0].strip()
|
| 65 |
print(f"Detected language:{language_predicted}")
|
|
@@ -224,22 +243,25 @@ examples = [
|
|
| 224 |
"examples/speaker3.mp3",
|
| 225 |
True,
|
| 226 |
],
|
| 227 |
-
|
| 228 |
]
|
| 229 |
|
| 230 |
with gr.Blocks(analytics_enabled=False) as demo:
|
| 231 |
|
| 232 |
with gr.Row():
|
| 233 |
-
gr.
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 243 |
with gr.Row():
|
| 244 |
gr.HTML(wrapped_markdown_content)
|
| 245 |
|
|
|
|
| 6 |
import json
|
| 7 |
import time
|
| 8 |
import re
|
| 9 |
+
import hashlib
|
| 10 |
+
import hash_code_for_cached_output
|
| 11 |
|
| 12 |
|
| 13 |
API_URL = os.environ.get("API_URL")
|
|
|
|
| 61 |
None,
|
| 62 |
)
|
| 63 |
|
| 64 |
+
# Before running inference, detect whether the request matches an example-table or default input.
|
| 65 |
+
# If so, serve a cached audio file instead. Note that this is done purely for demo efficiency.
|
| 66 |
+
# Hash codes were generated by `hash_code_for_cached_output.py`.
|
| 67 |
+
cached_outputs = {
|
| 68 |
+
"d0f5806f6e_60565a5c20_en_us" : "cached_outputs/0.wav",
|
| 69 |
+
"d0f5806f6e_420ab8211d_en_us" : "cached_outputs/1.wav",
|
| 70 |
+
"6e8a024342_0f96bf44f5_es_default" : "cached_outputs/2.wav",
|
| 71 |
+
"54ad3237d7_3fef5adc6f_zh_default" : "cached_outputs/3.wav",
|
| 72 |
+
"8190e911f8_9897b60a4e_jp_default" : "cached_outputs/4.wav"
|
| 73 |
+
}
|
| 74 |
+
unique_code = hash_code_for_cached_output.get_unique_code(audio_file_pth, style, prompt)
|
| 75 |
+
if unique_code in list(cached_outputs.keys()):
|
| 76 |
+
return (
|
| 77 |
+
'We get the cached output for you, since you are try to generating an example cloning.',
|
| 78 |
+
cached_outputs[unique_code],
|
| 79 |
+
audio_file_pth,
|
| 80 |
+
)
|
| 81 |
+
|
| 82 |
# first detect the input language
|
| 83 |
language_predicted = langid.classify(prompt)[0].strip()
|
| 84 |
print(f"Detected language:{language_predicted}")
|
|
|
|
| 243 |
"examples/speaker3.mp3",
|
| 244 |
True,
|
| 245 |
],
|
|
|
|
| 246 |
]
|
| 247 |
|
| 248 |
with gr.Blocks(analytics_enabled=False) as demo:
|
| 249 |
|
| 250 |
with gr.Row():
|
| 251 |
+
with gr.Column():
|
| 252 |
+
with gr.Row():
|
| 253 |
+
gr.Markdown(
|
| 254 |
+
"""
|
| 255 |
+
## <img src="https://huggingface.co/spaces/myshell-ai/OpenVoice/raw/main/logo.jpg" height="40"/>
|
| 256 |
+
"""
|
| 257 |
+
)
|
| 258 |
+
with gr.Row():
|
| 259 |
+
gr.Markdown(markdown_table_v2)
|
| 260 |
+
with gr.Row():
|
| 261 |
+
gr.Markdown(description)
|
| 262 |
+
with gr.Column():
|
| 263 |
+
gr.Video('./openvoicev2.mp4', autoplay=True)
|
| 264 |
+
|
| 265 |
with gr.Row():
|
| 266 |
gr.HTML(wrapped_markdown_content)
|
| 267 |
|
cached_outputs/0.wav
ADDED
|
Binary file (36.9 kB). View file
|
|
|
cached_outputs/1.wav
ADDED
|
Binary file (20.4 kB). View file
|
|
|
cached_outputs/2.wav
ADDED
|
Binary file (37.5 kB). View file
|
|
|
cached_outputs/3.wav
ADDED
|
Binary file (41.3 kB). View file
|
|
|
cached_outputs/4.wav
ADDED
|
Binary file (40.1 kB). View file
|
|
|
hash_code_for_cached_output.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pydub.utils import mediainfo
|
| 2 |
+
import hashlib
|
| 3 |
+
|
| 4 |
+
def audio_hash(audio_path):
    """Return a short content fingerprint for an audio file.

    Args:
        audio_path: Path to the audio file to fingerprint.

    Returns:
        The first 10 hex digits of the SHA-256 digest of the file's bytes,
        matching the keys stored in the demo's cached-output table.
    """
    hasher = hashlib.sha256()
    # Stream the file in fixed-size chunks instead of f.read() so large
    # audio files are not loaded into memory all at once. The resulting
    # digest is identical to hashing the whole file in one call.
    with open(audio_path, "rb") as f:
        for chunk in iter(lambda: f.read(65536), b""):
            hasher.update(chunk)

    return hasher.hexdigest()[:10]
|
| 12 |
+
|
| 13 |
+
def str_to_hash(input_str):
    """Return the first 10 hex digits of the SHA-256 digest of a string.

    The string is encoded as UTF-8 before hashing, so any Unicode text
    (including the CJK demo prompts) produces a stable fingerprint.
    """
    digest = hashlib.sha256(input_str.encode('utf-8')).hexdigest()
    return digest[:10]
|
| 20 |
+
|
| 21 |
+
def get_unique_code(reference_speaker, text, language):
    """Build a cache key of the form ``<audio hash>_<text hash>_<language>``.

    Args:
        reference_speaker: Path to the reference audio file.
        text: The prompt text to be synthesized.
        language: Language/style tag appended verbatim (e.g. ``en_us``).
    """
    parts = [audio_hash(reference_speaker), str_to_hash(text), language]
    return "_".join(parts)
|
| 23 |
+
|
| 24 |
+
if __name__ == '__main__':
    # Each entry mirrors one row of the demo's example table; running this
    # script prints the cache key that `app.py` looks up for that row.
    example_inputs = [
        {
            "text": "The bustling city square bustled with street performers, tourists, and local vendors.",
            "language": 'en_us',
            "reference_speaker": "examples/speaker0.mp3"
        },
        {
            "text": "Did you ever hear a folk tale about a giant turtle?",
            "language": 'en_us',
            "reference_speaker": "examples/speaker0.mp3"
        },
        {
            "text": "El resplandor del sol acaricia las olas, pintando el cielo con una paleta deslumbrante.",
            "language": 'es_default',
            "reference_speaker": "examples/speaker1.mp3",
        },
        {
            "text": "我最近在学习machine learning,希望能够在未来的artificial intelligence领域有所建树。",
            "language": 'zh_default',
            "reference_speaker": "examples/speaker2.mp3",
        },
        {
            "text": "彼は毎朝ジョギングをして体を健康に保っています。",
            "language": 'jp_default',
            "reference_speaker": "examples/speaker3.mp3",
        }
    ]

    for example in example_inputs:
        code = get_unique_code(
            example['reference_speaker'],
            example['text'],
            example['language'],
        )
        print(code)
|
| 56 |
+
|
openvoicev2.mp4
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8e623abfdd5d858005d494b7c04c527927534a9a63ca0005739e40f097d6d75e
|
| 3 |
+
size 12042795
|
requirements.txt
CHANGED
|
@@ -1 +1,2 @@
|
|
| 1 |
-
langid
|
|
|
|
|
|
| 1 |
+
langid
|
| 2 |
+
# NOTE: hashlib is part of the Python standard library and must NOT be listed
# here — the PyPI "hashlib" package is a Python-2-era shim whose install fails
# on Python 3 and can break the Space build.
|