OpenVoiceV2

Runtime error

App Files Files Community

XuminYu committed on Apr 24, 2024

Commit

e23742d

0 Parent(s):

init

Browse files

Files changed (9) hide show

.gitattributes +35 -0
README.md +15 -0
app.py +289 -0
count.py +35 -0
examples/speaker0.mp3 +0 -0
examples/speaker1.mp3 +0 -0
examples/speaker2.mp3 +0 -0
examples/speaker3.mp3 +0 -0
requirements.txt +1 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,35 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,15 @@

+---
+title: OpenVoiceV2
+emoji: 🤗
+colorFrom: blue
+colorTo: yellow
+sdk: gradio
+sdk_version: 3.48.0
+app_file: app.py
+pinned: false
+license: mit
+models:
+    - myshell-ai/OpenVoice-v2
+---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,289 @@

+import os
+import gradio as gr
+import requests
+import langid
+import base64
+import json
+import time
+import re
+# Endpoint of the remote synthesis service, read from the environment.
+# It is None when the API_URL variable is not set.
+API_URL = os.environ.get("API_URL")
+# Language codes (as returned by langid) that predict() accepts.
+supported_languages = ['zh', 'en', 'ja', 'ko', 'es', 'fr']
+# Styles accepted for each detected language. Every value is a list so that
+# `style in supported_styles[lang]` is an exact-match membership test; with a
+# bare string value the same expression would be a substring test.
+supported_styles = {
+    'zh': ["zh_default"],
+    'en': [
+        "en_default",
+        "en_us",
+        "en_br",
+        "en_au",
+        "en_in",
+    ],
+    "es": ["es_default"],
+    "fr": ["fr_default"],
+    "ja": ["jp_default"],
+    "ko": ["kr_default"],
+}
+# Directory that receives the synthesised audio files; created at import time.
+output_dir = 'outputs'
+os.makedirs(output_dir, exist_ok=True)
+def audio_to_base64(audio_file):
+    """Read the file at ``audio_file`` and return its bytes as base64 text."""
+    with open(audio_file, "rb") as stream:
+        raw_bytes = stream.read()
+    encoded = base64.b64encode(raw_bytes)
+    return encoded.decode("utf-8")
+def count_chars_words(sentence):
+    """Return the length of ``sentence`` in mixed character/word units.
+
+    Every character in the range U+4E00..U+9FA5 counts as one unit and every
+    maximal run of other word characters counts as one unit. Punctuation and
+    whitespace are ignored.
+
+    Args:
+        sentence: Text to measure.
+
+    Returns:
+        Number of Chinese characters plus number of non-Chinese words.
+    """
+    # The second alternative excludes the Chinese range explicitly: a plain
+    # ``\w+`` also matches Chinese characters, so "word汉字" would be swallowed
+    # as a single word and the Chinese characters would not be counted.
+    segments = re.findall(r'[\u4e00-\u9fa5]+|[^\W\u4e00-\u9fa5]+', sentence)
+    total = 0
+    for segment in segments:
+        if '\u4e00' <= segment[0] <= '\u9fa5':
+            # Run of Chinese characters: one unit per character.
+            total += len(segment)
+        else:
+            # Run of other word characters: one word.
+            total += 1
+    return total
+def predict(prompt, style, audio_file_pth, speed, agree):
+    """Validate a request, call the remote synthesis API and save the audio.
+
+    Args:
+        prompt: Text to synthesise; its language is detected with langid.
+        style: Style identifier, sent to the API in the ``language`` field.
+        audio_file_pth: Path of the reference-speaker audio file.
+        speed: Value forwarded unchanged in the ``speed`` field.
+        agree: Whether the terms checkbox is ticked.
+
+    Returns:
+        A ``(info_text, synthesised_audio_path, reference_audio_path)`` tuple.
+        Both paths are None whenever validation or the request fails.
+    """
+    import uuid  # function-scope import; only used to name the output file
+    # initialize an empty info
+    text_hint = ''
+    # agree with the terms
+    if not agree:
+        text_hint += '[ERROR] Please accept the Terms & Condition!\n'
+        gr.Warning("Please accept the Terms & Condition!")
+        return text_hint, None, None
+    # first detect the input language
+    language_predicted = langid.classify(prompt)[0].strip()
+    print(f"Detected language:{language_predicted}")
+    if language_predicted not in supported_languages:
+        text_hint += f"[ERROR] The detected language {language_predicted} for your input text is not in our Supported Languages: {supported_languages}\n"
+        gr.Warning(
+            f"The detected language {language_predicted} for your input text is not in our Supported Languages: {supported_languages}"
+        )
+        return text_hint, None, None
+    # check the style; a mismatch is only reported, the request is still sent
+    if style not in supported_styles[language_predicted]:
+        style_warning = f"[Warning] The style {style} is not supported for detected language {language_predicted}. For language {language_predicted}, we support styles: {supported_styles[language_predicted]}. Using the wrong style may result in unexpected behavior."
+        text_hint += style_warning + "\n"
+        gr.Warning(style_warning)
+    prompt_length = count_chars_words(prompt)
+    speaker_wav = audio_file_pth
+    if prompt_length < 2:
+        text_hint += f"[ERROR] Please give a longer prompt text \n"
+        gr.Warning("Please give a longer prompt text")
+        return text_hint, None, None
+    if prompt_length > 50:
+        text_hint += f"[ERROR] Text length limited to 50 words for this demo, please try shorter text. You can clone our open-source repo or try it on our website https://app.myshell.ai/robot-workshop/widget/174760057433406749 \n"
+        gr.Warning(
+            "Text length limited to 50 words for this demo, please try shorter text. You can clone our open-source repo or try it on our website https://app.myshell.ai/robot-workshop/widget/174760057433406749"
+        )
+        return text_hint, None, None
+    # Without a reference file, open() below would raise on a None path.
+    if not speaker_wav:
+        text_hint += "[ERROR] Please provide a reference audio \n"
+        gr.Warning("Please provide a reference audio")
+        return text_hint, None, None
+    # Each request gets its own output file: requests are served concurrently,
+    # and a single shared file name lets one request overwrite another's audio.
+    save_path = f'{output_dir}/output_{uuid.uuid4().hex}.wav'
+    speaker_audio_base64 = audio_to_base64(speaker_wav)
+    if style == 'en_us':  # we update us accent
+        style = 'en_newest'
+    data = {
+        "text": prompt,
+        "reference_speaker": speaker_audio_base64,
+        "language": style,
+        "speed": speed
+    }
+    start = time.time()
+    # Send the data as a POST request
+    try:
+        response = requests.post(API_URL, json=data, timeout=60)
+    except requests.RequestException as err:
+        # Log the detail server-side only: the exception text can contain the
+        # endpoint URL, which is supplied through the environment.
+        print(f'Request failed after {time.time() - start}: {err}')
+        text_hint += "[HTTP ERROR] The request to the synthesis service failed \n"
+        gr.Warning("[HTTP ERROR] The request to the synthesis service failed \n")
+        return text_hint, None, None
+    print(f'Get response successfully within {time.time() - start}')
+    # Check the response
+    if response.status_code != 200:
+        text_hint += f"[HTTP ERROR] {response.status_code} - {response.text} \n"
+        gr.Warning(
+            f"[HTTP ERROR] {response.status_code} - {response.text} \n"
+        )
+        return text_hint, None, None
+    # A 200 response is either a JSON object carrying an 'error' entry or the
+    # raw audio bytes. Anything that does not parse as such a JSON object is
+    # treated as audio.
+    try:
+        api_error = json.loads(response.content)['error']
+    except (ValueError, KeyError, TypeError):
+        api_error = None
+    if api_error is not None:
+        text_hint += f"[ERROR] {api_error} \n"
+        gr.Warning(
+            f"[ERROR] {api_error} \n"
+        )
+        return text_hint, None, None
+    with open(save_path, 'wb') as f:
+        f.write(response.content)
+    text_hint += f'''Get response successfully \n'''
+    return (
+        text_hint,
+        save_path,
+        speaker_wav,
+    )
+# Page title string.
+title = "MyShell OpenVoice V2"
+# Markdown introduction shown on the page.
+description = """
+In December 2023, we released [OpenVoice V1](https://huggingface.co/spaces/myshell-ai/OpenVoice), an instant voice cloning approach that replicates a speaker's voice and generates speech in multiple languages using only a short audio clip. OpenVoice V1 enables granular control over voice styles, replicates the tone color of the reference speaker and achieves zero-shot cross-lingual voice cloning.
+In April 2024, we released **OpenVoice V2**, which includes all features in V1 and has:
+- **Better Audio Quality**. OpenVoice V2 adopts a different training strategy that delivers better audio quality.
+- **Native Multi-lingual Support**. English, Spanish, French, Chinese, Japanese and Korean are natively supported in OpenVoice V2.
+- **Free Commercial Use**. Starting from April 2024, both V2 and V1 are released under MIT License. Free for commercial use.
+"""
+# Markdown table with the repository, project page and community links.
+markdown_table = """
+<div align="center" style="margin-bottom: 10px;">
+|               |               |               |
+| :-----------: | :-----------: | :-----------: |
+| **OpenSource Repo** | **Project Page** | **Join the Community** |
+| <div style='text-align: center;'><a style="display:inline-block,align:center" href='https://github.com/myshell-ai/OpenVoice'><img src='https://img.shields.io/github/stars/myshell-ai/OpenVoice?style=social' /></a></div> | [OpenVoice](https://research.myshell.ai/open-voice) | [![Discord](https://img.shields.io/discord/1122227993805336617?color=%239B59B6&label=%20Discord%20)](https://discord.gg/myshell) |
+</div>
+"""
+# Alternative layout of the same links.
+# NOTE(review): not referenced anywhere in the visible part of this file.
+markdown_table_v2 = """
+<div align="center" style="margin-bottom: 2px;">
+|               |               |               |              |
+| :-----------: | :-----------: | :-----------: | :-----------: |
+| **Github Repo** | <div style='text-align: center;'><a style="display:inline-block,align:center" href='https://github.com/myshell-ai/OpenVoice'><img src='https://img.shields.io/github/stars/myshell-ai/OpenVoice?style=social' /></a></div> |  **Project Page** |  [OpenVoice](https://research.myshell.ai/open-voice) |
+| | |
+| :-----------: | :-----------: |
+**Join the Community** |   [![Discord](https://img.shields.io/discord/1122227993805336617?color=%239B59B6&label=%20Discord%20)](https://discord.gg/myshell) |
+</div>
+"""
+# HTML hint pointing to the QnA document and the self-hosting notebook.
+content = """
+<div>
+  <strong>If the generated voice does not sound like the reference voice, please refer to <a href='https://github.com/myshell-ai/OpenVoice/blob/main/docs/QA.md'>this QnA</a>.</strong> <strong>If you want to deploy the model by yourself and perform inference, please refer to <a href='https://github.com/myshell-ai/OpenVoice/blob/main/demo_part3.ipynb'>this jupyter notebook</a>.</strong>
+</div>
+"""
+# The hint wrapped in a bordered box.
+wrapped_markdown_content = f"<div style='border: 1px solid #000; padding: 10px;'>{content}</div>"
+# Example rows: [prompt text, style, reference audio path, agree flag].
+examples = [
+    [
+        "Did you ever hear a folk tale about a giant turtle?",
+        'en_us',
+        "examples/speaker0.mp3",
+        True,
+    ],[
+        "El resplandor del sol acaricia las olas, pintando el cielo con una paleta deslumbrante.",
+        'es_default',
+        "examples/speaker1.mp3",
+        True,
+    ],[
+        "我最近在学习machine learning，希望能够在未来的artificial intelligence领域有所建树。",
+        'zh_default',
+        "examples/speaker2.mp3",
+        True,
+    ],[
+        "彼は毎朝ジョギングをして体を健康に保っています。",
+        'jp_default',
+        "examples/speaker3.mp3",
+        True,
+    ],
+]
+# Speed sent to the API for requests made from this page, which has no speed
+# control. NOTE(review): 1.0 is assumed to be the normal rate — confirm
+# against the synthesis service.
+DEFAULT_SPEED = 1.0
+def _predict_from_ui(prompt, style, audio_file_pth, agree):
+    """Adapt the four UI inputs to predict(), which also expects a speed.
+
+    The page wires four components (text, style, reference audio, agree) to
+    the handler, while predict() takes five parameters. Calling predict()
+    directly would bind the checkbox value to ``speed`` and fail on the
+    missing ``agree`` argument.
+    """
+    return predict(prompt, style, audio_file_pth, DEFAULT_SPEED, agree)
+with gr.Blocks(analytics_enabled=False) as demo:
+    with gr.Row():
+        gr.Markdown(
+            """
+            ## <img src="https://huggingface.co/spaces/myshell-ai/OpenVoice/raw/main/logo.jpg" height="20"/>
+            """
+        )
+    with gr.Row():
+        gr.Markdown(markdown_table)
+    with gr.Row():
+        gr.Markdown(description)
+    with gr.Row():
+        gr.HTML(wrapped_markdown_content)
+    with gr.Row():
+        # Left column: request inputs.
+        with gr.Column():
+            input_text_gr = gr.Textbox(
+                label="Text Prompt",
+                # The hint states the limit that predict() actually enforces.
+                info="One or two sentences at a time is better. Up to 50 words (each Chinese character counts as one).",
+                value="He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered, flour-fattened sauce.",
+            )
+            style_gr = gr.Dropdown(
+                label="Style",
+                info="Select a style of output audio for the synthesised speech. (Chinese only support 'default' now)",
+                choices=["en_default", "en_us", "en_br", "en_au", "en_in", "es_default", "fr_default", "jp_default", "zh_default", "kr_default",],
+                max_choices=1,
+                value="en_us",
+            )
+            ref_gr = gr.Audio(
+                label="Reference Audio",
+                info="Click on the ✎ button to upload your own target speaker audio",
+                type="filepath",
+                value="examples/speaker0.mp3",
+            )
+            # NOTE(review): this text names cc-by-nc-4.0 while the page
+            # description says the project is MIT-licensed — confirm which
+            # wording is intended.
+            tos_gr = gr.Checkbox(
+                label="Agree",
+                value=False,
+                info="I agree to the terms of the cc-by-nc-4.0 license-: https://github.com/myshell-ai/OpenVoice/blob/main/LICENSE",
+            )
+            tts_button = gr.Button("Send", elem_id="send-btn", visible=True)
+        # Right column: status text, synthesised audio and the reference used.
+        with gr.Column():
+            out_text_gr = gr.Text(label="Info")
+            audio_gr = gr.Audio(label="Synthesised Audio", autoplay=True)
+            ref_audio_gr = gr.Audio(label="Reference Audio Used")
+            gr.Examples(examples,
+                        label="Examples",
+                        inputs=[input_text_gr, style_gr, ref_gr, tos_gr],
+                        outputs=[out_text_gr, audio_gr, ref_audio_gr],
+                        fn=_predict_from_ui,
+                        cache_examples=False,)
+            tts_button.click(_predict_from_ui, [input_text_gr, style_gr, ref_gr, tos_gr], outputs=[out_text_gr, audio_gr, ref_audio_gr])
+demo.queue(concurrency_count=6)
+demo.launch(debug=True, show_api=True)

count.py ADDED Viewed

	@@ -0,0 +1,35 @@

+import re
+def count_chars_words(sentence):
+    """Return the length of ``sentence`` in mixed character/word units.
+
+    Every character in the range U+4E00..U+9FA5 counts as one unit and every
+    maximal run of other word characters counts as one unit. Punctuation and
+    whitespace are ignored.
+
+    Args:
+        sentence: Text to measure.
+
+    Returns:
+        Number of Chinese characters plus number of non-Chinese words.
+    """
+    # Split the sentence with a regular expression: Chinese is split per
+    # character run, other text per word. The second alternative excludes the
+    # Chinese range explicitly because a plain ``\w+`` also matches Chinese
+    # characters and would swallow "word汉字" as a single word.
+    segments = re.findall(r'[\u4e00-\u9fa5]+|[^\W\u4e00-\u9fa5]+', sentence)
+    # Accumulate the character and word counts.
+    total = 0
+    for segment in segments:
+        if '\u4e00' <= segment[0] <= '\u9fa5':
+            # Chinese part: every character counts as one unit.
+            total += len(segment)
+        else:
+            # Non-Chinese part: every word counts as one unit.
+            total += 1
+    return total
+# Demo: print the measured length of a few sample sentences, in order.
+samples = (
+    "如果您 want to deploy the 模型并进行推理",
+    "今天天气真好，我们一起出去吃饭吧。",
+    "我最近在学习machine learning，希望能够在未来的artificial intelligence领域有所建树。",
+    "El resplandor del sol acaricia las olas, pintando el cielo con una paleta deslumbrante。",
+)
+for sentence in samples:
+    count = count_chars_words(sentence)
+    print(f"字符数：{count}")

examples/speaker0.mp3 ADDED Viewed

Binary file (961 kB). View file

examples/speaker1.mp3 ADDED Viewed

Binary file (309 kB). View file

examples/speaker2.mp3 ADDED Viewed

Binary file (117 kB). View file

examples/speaker3.mp3 ADDED Viewed

Binary file (472 kB). View file

requirements.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ langid