seamless-streaming

Paused

App Files Files Community

mduppes committed on Sep 6, 2023

Commit

1727d3b

1 Parent(s): a7361bc

work with m4t model

Browse files

Files changed (2) hide show

app.py +125 -57
m4t_app.py +463 -0

app.py CHANGED Viewed

@@ -11,73 +11,141 @@ from seamless_communication.models.inference.translator import Translator
 from m4t_app import *
-from transformers import pipeline
-p = pipeline("automatic-speech-recognition")
 from pydub import AudioSegment
 import time
 from time import sleep
-m4t_demo()
-def transcribe(audio, state=""):
-    # Runs the module-level ASR pipeline `p` on `audio` and returns `state`
-    # with the recognised text plus a trailing space appended.
-    # sleep(2)
-    print('state', state)
-    text = p(audio)["text"]
-    state += text + " "
-    return state
 def blocks():
     with gr.Blocks() as demo:
-        # Per-session accumulators: the audio bytes read so far and the running transcript.
-        total_audio_bytes_state = gr.State(bytes())
-        total_text_state = gr.State("")
         # input_audio = gr.Audio(label="Input Audio", type="filepath", format="mp3")
-        input_audio = gr.Audio(label="Input Audio", type="filepath", format="mp3", source="microphone", streaming=True)
-        with gr.Row():
-            with gr.Column():
-                stream_as_bytes_btn = gr.Button("Stream as Bytes")
-                stream_as_bytes_output = gr.Audio(format="bytes", streaming=True)
-                stream_output_text = gr.Textbox(label="Translated text")
-                def stream_bytes(audio_file, total_audio_bytes_state, total_text_state):
-                    # Generator: reads the recording in chunk_size-byte pieces, transcribes each
-                    # piece, and yields the updated outputs and states after every piece.
-                    chunk_size = 30000
-                    print(f"audio_file {audio_file}, size {os.path.getsize(audio_file)}")
-                    with open(audio_file, "rb") as f:
-                        while True:
-                            chunk = f.read(chunk_size)
-                            if chunk:
-                                total_audio_bytes_state += chunk
-                                print('yielding chunk', len(chunk))
-                                print('total audio bytes', len(total_audio_bytes_state))
-                                print(f"Text state: {total_text_state}")
-                                # This does the whole thing every time
-                                # total_text = transcribe(chunk, "")
-                                # yield total_audio_bytes_state, total_text, total_audio_bytes_state, total_text_state
-                                # This translates just the new part every time
-                                total_text_state = transcribe(chunk, total_text_state)
-                                total_text = total_text_state
-                                # total_text = transcribe(chunk, total_text)
-                                yield total_audio_bytes_state, total_text, total_audio_bytes_state, total_text_state
-                                # sleep(3)
-                            else:
-                                # An empty read means end of file.
-                                break
-                def clear():
-                    # Reset values for the two state objects: empty bytes and empty text.
-                    print('clearing')
-                    return [bytes(), ""]
-                stream_as_bytes_btn.click(stream_bytes, [input_audio, total_audio_bytes_state, total_text_state], [stream_as_bytes_output, stream_output_text, total_audio_bytes_state, total_text_state])
-        input_audio.change(stream_bytes, [input_audio, total_audio_bytes_state, total_text_state], [stream_as_bytes_output, stream_output_text, total_audio_bytes_state, total_text_state])
-        # Both clearing the input and starting a new recording reset the accumulated state.
-        input_audio.clear(clear, None, [total_audio_bytes_state, total_text_state])
-        input_audio.start_recording(clear, None, [total_audio_bytes_state, total_text_state])
     demo.queue().launch()

 from m4t_app import *
 from pydub import AudioSegment
 import time
 from time import sleep
+# m4t_demo()
+# Selects which microphone-input configuration blocks() builds: the non-M4T
+# branch additionally requests format="mp3".
+USE_M4T = True
+def translate_audio_file_segment(
+    audio_file, source_language="English", target_language="Portuguese"
+):
+    """Translate one recorded audio segment with the speech-to-speech task.
+
+    Args:
+        audio_file: Path of the recorded segment; forwarded to predict() as
+            the microphone input.
+        source_language: Source language name passed to predict(). Defaults
+            to the previously hard-coded "English".
+        target_language: Target language name passed to predict(). Defaults
+            to the previously hard-coded "Portuguese".
+
+    Returns:
+        Whatever predict() returns for the "S2ST" task.
+    """
+    print("translate_m4t state")
+    return predict(
+        task_name="S2ST",
+        audio_source="microphone",
+        input_audio_mic=audio_file,
+        input_audio_file=None,
+        input_text="",
+        source_language=source_language,
+        target_language=target_language,
+    )
+def translate_m4t_callback(
+    audio_file, translated_audio_bytes_state, translated_text_state
+):
+    """Translate the latest recording and fold the result into the running state.
+
+    Args:
+        audio_file: Path of the most recent recording segment.
+        translated_audio_bytes_state: Accumulated translated audio as a
+            (sample_rate, samples) tuple, or any non-tuple placeholder when
+            nothing has been accumulated yet.
+        translated_text_state: Accumulated translated text so far.
+
+    Returns:
+        A six-item list in the order the event outputs expect: the input
+        audio file, the newly translated segment, the combined audio, the
+        combined text, and then the combined audio and text again for the two
+        state objects.
+    """
+    translated_wav_segment, translated_text = translate_audio_file_segment(audio_file)
+    print('translated_audio_bytes_state', translated_audio_bytes_state)
+    print('translated_wav_segment', translated_wav_segment)
+    # Only an existing (sample_rate, samples) tuple can be extended; anything
+    # else (initial or cleared state) is replaced by the first segment.
+    if isinstance(translated_audio_bytes_state, tuple):
+        translated_audio_bytes_state = (
+            translated_audio_bytes_state[0],
+            # np.append flattens both arrays before joining them.
+            np.append(translated_audio_bytes_state[1], translated_wav_segment[1]),
+        )
+    else:
+        translated_audio_bytes_state = translated_wav_segment
+    # Put the separator between entries only, never before the first one.
+    segment_text = str(translated_text)
+    if translated_text_state:
+        translated_text_state = translated_text_state + " | " + segment_text
+    else:
+        translated_text_state = segment_text
+    return [
+        audio_file,
+        translated_wav_segment,
+        translated_audio_bytes_state,
+        translated_text_state,
+        translated_audio_bytes_state,
+        translated_text_state,
+    ]
+def clear():
+    """Return reset values for the accumulated audio and text state.
+
+    Returns:
+        [None, ""]: the same values blocks() uses to initialise its two
+        gr.State objects, so a cleared session matches a fresh one.
+    """
+    print("Clearing State")
+    return [None, ""]
 def blocks():
+    """Build the streaming-translation UI, wire its events, and launch it."""
     with gr.Blocks() as demo:
+        # Per-session accumulators that the callback receives and returns on every event.
+        translated_audio_bytes_state = gr.State(None)
+        translated_text_state = gr.State("")
         # input_audio = gr.Audio(label="Input Audio", type="filepath", format="mp3")
+        mic_options = {
+            "label": "Input Audio",
+            "type": "filepath",
+            "source": "microphone",
+            "streaming": True,
+        }
+        if not USE_M4T:
+            # Only the non-M4T path asks for mp3 recordings.
+            mic_options["format"] = "mp3"
+        input_audio = gr.Audio(**mic_options)
+        most_recent_input_audio_segment = gr.Audio(
+            label="Recent Input Audio Segment segments", format="bytes", streaming=True
+        )
+        # TODO: Should add combined input audio segments...
+        stream_as_bytes_btn = gr.Button("Translate most recent recording segment")
+        translated_audio_options = {"autoplay": False, "streaming": True, "type": "numpy"}
+        output_translation_segment = gr.Audio(
+            label="Translated audio segment", **translated_audio_options
+        )
+        output_translation_combined = gr.Audio(
+            label="Translated audio combined", **translated_audio_options
+        )
+        # Could add output text segment
+        stream_output_text = gr.Textbox(label="Translated text")
+        # The button click and every change of the input run the same
+        # callback with the same inputs and outputs.
+        callback_inputs = [input_audio, translated_audio_bytes_state, translated_text_state]
+        callback_outputs = [
+            most_recent_input_audio_segment,
+            output_translation_segment,
+            output_translation_combined,
+            stream_output_text,
+            translated_audio_bytes_state,
+            translated_text_state,
+        ]
+        stream_as_bytes_btn.click(translate_m4t_callback, callback_inputs, callback_outputs)
+        input_audio.change(translate_m4t_callback, callback_inputs, callback_outputs)
+        # Clearing the input or starting a new recording resets both states.
+        reset_targets = [translated_audio_bytes_state, translated_text_state]
+        input_audio.clear(clear, None, reset_targets)
+        input_audio.start_recording(clear, None, reset_targets)
     demo.queue().launch()

m4t_app.py ADDED Viewed

	@@ -0,0 +1,463 @@

+from __future__ import annotations
+import os
+import gradio as gr
+import numpy as np
+import torch
+import torchaudio
+from seamless_communication.models.inference.translator import Translator
+from lang_list import (
+    LANGUAGE_NAME_TO_CODE,
+    S2ST_TARGET_LANGUAGE_NAMES,
+    S2TT_TARGET_LANGUAGE_NAMES,
+    T2TT_TARGET_LANGUAGE_NAMES,
+    TEXT_SOURCE_LANGUAGE_NAMES,
+)
+# Markdown rendered at the top of the demo page by m4t_demo().
+DESCRIPTION = """# SeamlessM4T
+# mduppes aaaaaa
+[SeamlessM4T](https://github.com/facebookresearch/seamless_communication) is designed to provide high-quality
+translation, allowing people from different linguistic communities to communicate effortlessly through speech and text.
+This unified model enables multiple tasks like Speech-to-Speech (S2ST), Speech-to-Text (S2TT), Text-to-Speech (T2ST)
+translation and more, without relying on multiple separate models.
+"""
+# Example caching is enabled only when the CACHE_EXAMPLES env var is exactly "1".
+CACHE_EXAMPLES = os.getenv("CACHE_EXAMPLES") == "1"
+# Dropdown labels; the handlers use the first whitespace-separated token as the task code.
+TASK_NAMES = [
+    "S2ST (Speech to Speech translation)",
+    "S2TT (Speech to Text translation)",
+    "T2ST (Text to Speech translation)",
+    "T2TT (Text to Text translation)",
+    "ASR (Automatic Speech Recognition)",
+]
+# Sample rate that predict() resamples input audio to and passes to the translator.
+AUDIO_SAMPLE_RATE = 16000.0
+MAX_INPUT_AUDIO_LENGTH = 60  # in seconds
+DEFAULT_TARGET_LANGUAGE = "French"
+# Use the first CUDA device when one is available, otherwise the CPU.
+device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+print("DEVICE", device)
+# Module-level model instance, constructed at import time and used by predict().
+translator = Translator(
+    model_name_or_card="seamlessM4T_medium",
+    vocoder_name_or_card="vocoder_36langs",
+    device=device,
+    # dtype=torch.float16,
+    # For CPU Mode need to use 32, float16 causes errors downstream
+    dtype=torch.float32,
+)
+def get_translator():
+    """Return the module-level Translator instance."""
+    return translator
+def _asr_text(audio):
+    """Run the translator's ASR task on `audio` and return the recognised text."""
+    # The original body called an ASR pipeline `p` that this module never
+    # defines; the module-level translator is used instead, called the same
+    # way predict() calls it.
+    # NOTE(review): assumes English speech — confirm before using other languages.
+    text_out, _wav, _sr = translator.predict(
+        input=audio,
+        task_str="ASR",
+        tgt_lang=LANGUAGE_NAME_TO_CODE["English"],
+        src_lang=None,
+        ngram_filtering=True,
+        sample_rate=AUDIO_SAMPLE_RATE,
+    )
+    return str(text_out)
+def transcribe(audio):
+    """Return the recognised text for `audio`.
+
+    Args:
+        audio: Audio input handed to the translator (e.g. a file path).
+    """
+    print(audio)
+    return _asr_text(audio)
+def transcribe_state(audio, state = ""):
+    """Append the recognised text for `audio` to `state`.
+
+    Args:
+        audio: Audio input handed to the translator (e.g. a file path).
+        state: Transcript accumulated so far.
+
+    Returns:
+        The updated transcript twice (one copy for display, one for state);
+        each transcription is followed by a single space.
+    """
+    print(audio)
+    state += _asr_text(audio) + " "
+    return state, state
+def predict(
+    task_name: str,
+    audio_source: str,
+    input_audio_mic: str | None,
+    input_audio_file: str | None,
+    input_text: str | None,
+    source_language: str | None,
+    target_language: str,
+) -> tuple[tuple[int, np.ndarray] | None, str]:
+    """Run one translation/recognition task through the module-level translator.
+
+    Args:
+        task_name: Task label; only its first whitespace-separated token
+            (e.g. "S2ST") is used.
+        audio_source: "microphone" selects `input_audio_mic`; any other value
+            selects `input_audio_file`.
+        input_audio_mic: Path of the microphone recording, if any.
+        input_audio_file: Path of the uploaded audio file, if any.
+        input_text: Text input, used for the text-input tasks.
+        source_language: Source language name, or None.
+        target_language: Target language name.
+
+    Returns:
+        ((sample_rate, samples), text) for "S2ST" and "T2ST"; (None, text)
+        for every other task.
+
+    Raises:
+        gr.Error: If an audio task is requested but no audio was provided.
+        KeyError: If a language name is not in LANGUAGE_NAME_TO_CODE.
+
+    Note:
+        For audio tasks the input file is overwritten in place with the
+        resampled (and possibly truncated) audio.
+    """
+    task_name = task_name.split()[0]
+    source_language_code = LANGUAGE_NAME_TO_CODE[source_language] if source_language else None
+    target_language_code = LANGUAGE_NAME_TO_CODE[target_language]
+    if task_name in ["S2ST", "S2TT", "ASR"]:
+        input_data = input_audio_mic if audio_source == "microphone" else input_audio_file
+        # Fail with a readable message instead of letting the audio loader
+        # choke on a missing path.
+        if not input_data:
+            raise gr.Error("No input audio was provided.")
+        arr, org_sr = torchaudio.load(input_data)
+        print(task_name, audio_source, input_audio_mic, type(input_audio_file), type(input_text), source_language, target_language)
+        # AUDIO_SAMPLE_RATE is a float; cast it to int here as is done for torchaudio.save.
+        new_arr = torchaudio.functional.resample(arr, orig_freq=org_sr, new_freq=int(AUDIO_SAMPLE_RATE))
+        max_length = int(MAX_INPUT_AUDIO_LENGTH * AUDIO_SAMPLE_RATE)
+        if new_arr.shape[1] > max_length:
+            new_arr = new_arr[:, :max_length]
+            gr.Warning(f"Input audio is too long. Only the first {MAX_INPUT_AUDIO_LENGTH} seconds is used.")
+        # Write the resampled audio back to the same path the translator will read.
+        torchaudio.save(input_data, new_arr, sample_rate=int(AUDIO_SAMPLE_RATE))
+    else:
+        input_data = input_text
+    text_out, wav, sr = translator.predict(
+        input=input_data,
+        task_str=task_name,
+        tgt_lang=target_language_code,
+        src_lang=source_language_code,
+        ngram_filtering=True,
+        sample_rate=AUDIO_SAMPLE_RATE,
+    )
+    print("translation response", text_out, wav, sr)
+    # text_out = "Testing"
+    # return None, text_out
+    if task_name in ["S2ST", "T2ST"]:
+        return (sr, wav.cpu().detach().numpy()), text_out
+    else:
+        return None, text_out
+def process_s2st_example(input_audio_file: str, target_language: str) -> tuple[tuple[int, np.ndarray] | None, str]:
+    """Run the S2ST task on an audio file with no explicit source language."""
+    request = {
+        "task_name": "S2ST",
+        "audio_source": "file",
+        "input_audio_mic": None,
+        "input_audio_file": input_audio_file,
+        "input_text": None,
+        "source_language": None,
+        "target_language": target_language,
+    }
+    return predict(**request)
+def process_s2tt_example(input_audio_file: str, target_language: str) -> tuple[tuple[int, np.ndarray] | None, str]:
+    """Run the S2TT task on an audio file with no explicit source language."""
+    request = {
+        "task_name": "S2TT",
+        "audio_source": "file",
+        "input_audio_mic": None,
+        "input_audio_file": input_audio_file,
+        "input_text": None,
+        "source_language": None,
+        "target_language": target_language,
+    }
+    return predict(**request)
+def process_t2st_example(
+    input_text: str, source_language: str, target_language: str
+) -> tuple[tuple[int, np.ndarray] | None, str]:
+    """Run the T2ST task on a text input; no audio inputs are supplied."""
+    request = {
+        "task_name": "T2ST",
+        "audio_source": "",
+        "input_audio_mic": None,
+        "input_audio_file": None,
+        "input_text": input_text,
+        "source_language": source_language,
+        "target_language": target_language,
+    }
+    return predict(**request)
+def process_t2tt_example(
+    input_text: str, source_language: str, target_language: str
+) -> tuple[tuple[int, np.ndarray] | None, str]:
+    """Run the T2TT task on a text input; no audio inputs are supplied."""
+    request = {
+        "task_name": "T2TT",
+        "audio_source": "",
+        "input_audio_mic": None,
+        "input_audio_file": None,
+        "input_text": input_text,
+        "source_language": source_language,
+        "target_language": target_language,
+    }
+    return predict(**request)
+def process_asr_example(input_audio_file: str, target_language: str) -> tuple[tuple[int, np.ndarray] | None, str]:
+    """Run the ASR task on an audio file with no explicit source language."""
+    request = {
+        "task_name": "ASR",
+        "audio_source": "file",
+        "input_audio_mic": None,
+        "input_audio_file": input_audio_file,
+        "input_text": None,
+        "source_language": None,
+        "target_language": target_language,
+    }
+    return predict(**request)
+def update_audio_ui(audio_source: str) -> tuple[dict, dict]:
+    """Show exactly one of the two audio inputs and reset both values to None."""
+    show_mic = audio_source == "microphone"
+    mic_update = gr.update(visible=show_mic, value=None)  # input_audio_mic
+    file_update = gr.update(visible=not show_mic, value=None)  # input_audio_file
+    return mic_update, file_update
+def update_input_ui(task_name: str) -> tuple[dict, dict, dict, dict]:
+    """Return visibility/choice updates for the input widgets of a task.
+
+    The result is ordered (audio_box, input_text, source_language,
+    target_language). Raises ValueError for an unrecognised task code.
+    """
+    task_name = task_name.split()[0]
+    # task code -> (takes speech input?, target-language choices)
+    layouts = {
+        "S2ST": (True, S2ST_TARGET_LANGUAGE_NAMES),
+        "S2TT": (True, S2TT_TARGET_LANGUAGE_NAMES),
+        "T2ST": (False, S2ST_TARGET_LANGUAGE_NAMES),
+        "T2TT": (False, T2TT_TARGET_LANGUAGE_NAMES),
+        "ASR": (True, S2TT_TARGET_LANGUAGE_NAMES),
+    }
+    if task_name not in layouts:
+        raise ValueError(f"Unknown task: {task_name}")
+    speech_input, target_choices = layouts[task_name]
+    # Text tasks show the text box and source-language picker instead of the audio box.
+    return (
+        gr.update(visible=speech_input),  # audio_box
+        gr.update(visible=not speech_input),  # input_text
+        gr.update(visible=not speech_input),  # source_language
+        gr.update(
+            visible=True, choices=target_choices, value=DEFAULT_TARGET_LANGUAGE
+        ),  # target_language
+    )
+def update_output_ui(task_name: str) -> tuple[dict, dict]:
+    """Reset both outputs and show the audio player only for speech-output tasks.
+
+    Raises ValueError for an unrecognised task code.
+    """
+    task_name = task_name.split()[0]
+    if task_name not in ("S2ST", "T2ST", "S2TT", "T2TT", "ASR"):
+        raise ValueError(f"Unknown task: {task_name}")
+    produces_audio = task_name in ("S2ST", "T2ST")
+    return (
+        gr.update(visible=produces_audio, value=None),  # output_audio
+        gr.update(value=None),  # output_text
+    )
+def update_example_ui(task_name: str) -> tuple[dict, dict, dict, dict, dict]:
+    """Return one visibility update per example row; only the selected task's row is shown."""
+    selected = task_name.split()[0]
+    # Same order as the outputs: s2st, s2tt, t2st, t2tt, asr example rows.
+    row_tasks = ("S2ST", "S2TT", "T2ST", "T2TT", "ASR")
+    return tuple(gr.update(visible=selected == row_task) for row_task in row_tasks)
+def m4t_demo():
+    """Build the full SeamlessM4T demo UI, wire its events, and launch it."""
+    with gr.Blocks(css="style.css") as demo:
+        gr.Markdown(DESCRIPTION)
+        # Shown only when the SHOW_DUPLICATE_BUTTON env var is exactly "1".
+        gr.DuplicateButton(
+            value="Duplicate Space for private use",
+            elem_id="duplicate-button",
+            visible=os.getenv("SHOW_DUPLICATE_BUTTON") == "1",
+        )
+        with gr.Group():
+            task_name = gr.Dropdown(
+                label="Task",
+                choices=TASK_NAMES,
+                value=TASK_NAMES[0],
+            )
+            with gr.Row():
+                # Hidden initially; update_input_ui() shows it for the text-input tasks.
+                source_language = gr.Dropdown(
+                    label="Source language",
+                    choices=TEXT_SOURCE_LANGUAGE_NAMES,
+                    value="English",
+                    visible=False,
+                )
+                target_language = gr.Dropdown(
+                    label="Target language",
+                    choices=S2ST_TARGET_LANGUAGE_NAMES,
+                    value=DEFAULT_TARGET_LANGUAGE,
+                )
+            with gr.Row() as audio_box:
+                audio_source = gr.Radio(
+                    label="Audio source",
+                    choices=["file", "microphone"],
+                    value="file",
+                )
+                # Only one of the two audio inputs is visible at a time; see update_audio_ui().
+                input_audio_mic = gr.Audio(
+                    label="Input speech",
+                    type="filepath",
+                    source="microphone",
+                    visible=False,
+                )
+                input_audio_file = gr.Audio(
+                    label="Input speech",
+                    type="filepath",
+                    source="upload",
+                    visible=True,
+                )
+            input_text = gr.Textbox(label="Input text", visible=False)
+            btn = gr.Button("Translate")
+            with gr.Column():
+                output_audio = gr.Audio(
+                    label="Translated speech",
+                    autoplay=False,
+                    streaming=False,
+                    type="numpy",
+                )
+                output_text = gr.Textbox(label="Translated text")
+        # One example row per task; only the S2ST row starts visible and
+        # update_example_ui() switches rows when the task changes.
+        with gr.Row(visible=True) as s2st_example_row:
+            s2st_examples = gr.Examples(
+                examples=[
+                    ["assets/sample_input.mp3", "French"],
+                    ["assets/sample_input.mp3", "Mandarin Chinese"],
+                    ["assets/sample_input_2.mp3", "Hindi"],
+                    ["assets/sample_input_2.mp3", "Spanish"],
+                ],
+                inputs=[input_audio_file, target_language],
+                outputs=[output_audio, output_text],
+                fn=process_s2st_example,
+                cache_examples=CACHE_EXAMPLES,
+            )
+        with gr.Row(visible=False) as s2tt_example_row:
+            s2tt_examples = gr.Examples(
+                examples=[
+                    ["assets/sample_input.mp3", "French"],
+                    ["assets/sample_input.mp3", "Mandarin Chinese"],
+                    ["assets/sample_input_2.mp3", "Hindi"],
+                    ["assets/sample_input_2.mp3", "Spanish"],
+                ],
+                inputs=[input_audio_file, target_language],
+                outputs=[output_audio, output_text],
+                fn=process_s2tt_example,
+                cache_examples=CACHE_EXAMPLES,
+            )
+        with gr.Row(visible=False) as t2st_example_row:
+            t2st_examples = gr.Examples(
+                examples=[
+                    ["My favorite animal is the elephant.", "English", "French"],
+                    ["My favorite animal is the elephant.", "English", "Mandarin Chinese"],
+                    [
+                        "Meta AI's Seamless M4T model is democratising spoken communication across language barriers",
+                        "English",
+                        "Hindi",
+                    ],
+                    [
+                        "Meta AI's Seamless M4T model is democratising spoken communication across language barriers",
+                        "English",
+                        "Spanish",
+                    ],
+                ],
+                inputs=[input_text, source_language, target_language],
+                outputs=[output_audio, output_text],
+                fn=process_t2st_example,
+                cache_examples=CACHE_EXAMPLES,
+            )
+        with gr.Row(visible=False) as t2tt_example_row:
+            t2tt_examples = gr.Examples(
+                examples=[
+                    ["My favorite animal is the elephant.", "English", "French"],
+                    ["My favorite animal is the elephant.", "English", "Mandarin Chinese"],
+                    [
+                        "Meta AI's Seamless M4T model is democratising spoken communication across language barriers",
+                        "English",
+                        "Hindi",
+                    ],
+                    [
+                        "Meta AI's Seamless M4T model is democratising spoken communication across language barriers",
+                        "English",
+                        "Spanish",
+                    ],
+                ],
+                inputs=[input_text, source_language, target_language],
+                outputs=[output_audio, output_text],
+                fn=process_t2tt_example,
+                cache_examples=CACHE_EXAMPLES,
+            )
+        with gr.Row(visible=False) as asr_example_row:
+            asr_examples = gr.Examples(
+                examples=[
+                    ["assets/sample_input.mp3", "English"],
+                    ["assets/sample_input_2.mp3", "English"],
+                ],
+                inputs=[input_audio_file, target_language],
+                outputs=[output_audio, output_text],
+                fn=process_asr_example,
+                cache_examples=CACHE_EXAMPLES,
+            )
+        # Switching the audio source toggles which audio input is visible.
+        audio_source.change(
+            fn=update_audio_ui,
+            inputs=audio_source,
+            outputs=[
+                input_audio_mic,
+                input_audio_file,
+            ],
+            queue=False,
+            api_name=False,
+        )
+        # Changing the task updates the inputs, then the outputs, then the
+        # example rows, chained in that order.
+        task_name.change(
+            fn=update_input_ui,
+            inputs=task_name,
+            outputs=[
+                audio_box,
+                input_text,
+                source_language,
+                target_language,
+            ],
+            queue=False,
+            api_name=False,
+        ).then(
+            fn=update_output_ui,
+            inputs=task_name,
+            outputs=[output_audio, output_text],
+            queue=False,
+            api_name=False,
+        ).then(
+            fn=update_example_ui,
+            inputs=task_name,
+            outputs=[
+                s2st_example_row,
+                s2tt_example_row,
+                t2st_example_row,
+                t2tt_example_row,
+                asr_example_row,
+            ],
+            queue=False,
+            api_name=False,
+        )
+        # The Translate button passes every input widget to predict(), in
+        # predict()'s parameter order.
+        btn.click(
+            fn=predict,
+            inputs=[
+                task_name,
+                audio_source,
+                input_audio_mic,
+                input_audio_file,
+                input_text,
+                source_language,
+                target_language,
+            ],
+            outputs=[output_audio, output_text],
+            api_name="run",
+        )
+    demo.queue(max_size=50).launch()
+# Linking models to the space
+# 'facebook/seamless-m4t-large'
+# 'facebook/SONAR'