Spaces:

kotoba-tech
/

seamless_m4t-large-v2

Runtime error

App Files Files Community

reach-vb HF Staff

ylacombe committed on Oct 30, 2023

Commit

3d59a60

1 Parent(s): c111ea2

hf_implementation (#23)

Browse files

- update with HF implementation (b3882fafaf5d0c32dd9b458e7efbcac2469293a1)

Co-authored-by: Yoach Lacombe <[email protected]>

Files changed (5) hide show

Dockerfile +0 -56
README.md +3 -2
app.py +23 -19
lang_list.py +148 -0
requirements.txt +2 -5

Dockerfile DELETED Viewed

@@ -1,56 +0,0 @@
-FROM nvidia/cuda:11.7.1-cudnn8-devel-ubuntu22.04
-ENV DEBIAN_FRONTEND=noninteractive
-RUN apt-get update && \
-    apt-get upgrade -y && \
-    apt-get install -y --no-install-recommends \
-    git \
-    git-lfs \
-    wget \
-    curl \
-    # python build dependencies \
-    build-essential \
-    libssl-dev \
-    zlib1g-dev \
-    libbz2-dev \
-    libreadline-dev \
-    libsqlite3-dev \
-    libncursesw5-dev \
-    xz-utils \
-    tk-dev \
-    libxml2-dev \
-    libxmlsec1-dev \
-    libffi-dev \
-    liblzma-dev \
-    # gradio dependencies \
-    ffmpeg \
-    # fairseq2 dependencies \
-    libsndfile-dev && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
-RUN useradd -m -u 1000 user
-USER user
-ENV HOME=/home/user \
-    PATH=/home/user/.local/bin:${PATH}
-WORKDIR ${HOME}/app
-RUN curl https://pyenv.run | bash
-ENV PATH=${HOME}/.pyenv/shims:${HOME}/.pyenv/bin:${PATH}
-ARG PYTHON_VERSION=3.10.12
-RUN pyenv install ${PYTHON_VERSION} && \
-    pyenv global ${PYTHON_VERSION} && \
-    pyenv rehash && \
-    pip install --no-cache-dir -U pip setuptools wheel
-COPY --chown=1000 ./requirements.txt /tmp/requirements.txt
-RUN pip install --no-cache-dir --upgrade -r /tmp/requirements.txt
-COPY --chown=1000 . ${HOME}/app
-ENV PYTHONPATH=${HOME}/app \
-    PYTHONUNBUFFERED=1 \
-    GRADIO_ALLOW_FLAGGING=never \
-    GRADIO_NUM_PORTS=1 \
-    GRADIO_SERVER_NAME=0.0.0.0 \
-    GRADIO_THEME=huggingface \
-    SYSTEM=spaces
-CMD ["python", "app.py"]

README.md CHANGED Viewed

@@ -3,9 +3,10 @@ title: Seamless M4T
 emoji: 📞
 colorFrom: blue
 colorTo: yellow
-sdk: docker
 pinned: false
 suggested_hardware: t4-medium
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 emoji: 📞
 colorFrom: blue
 colorTo: yellow
+sdk: gradio
+app_file: app.py
 pinned: false
 suggested_hardware: t4-medium
 ---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py CHANGED Viewed

@@ -6,7 +6,7 @@ import gradio as gr
 import numpy as np
 import torch
 import torchaudio
-from seamless_communication.models.inference.translator import Translator
 from lang_list import (
     LANGUAGE_NAME_TO_CODE,
@@ -14,13 +14,12 @@ from lang_list import (
     S2TT_TARGET_LANGUAGE_NAMES,
     T2TT_TARGET_LANGUAGE_NAMES,
     TEXT_SOURCE_LANGUAGE_NAMES,
 )
 DESCRIPTION = """# SeamlessM4T
 [SeamlessM4T](https://github.com/facebookresearch/seamless_communication) is designed to provide high-quality
 translation, allowing people from different linguistic communities to communicate effortlessly through speech and text.
 This unified model enables multiple tasks like Speech-to-Speech (S2ST), Speech-to-Text (S2TT), Text-to-Speech (T2ST)
 translation and more, without relying on multiple separate models.
 """
@@ -39,11 +38,9 @@ MAX_INPUT_AUDIO_LENGTH = 60  # in seconds
 DEFAULT_TARGET_LANGUAGE = "French"
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-translator = Translator(
-    model_name_or_card="seamlessM4T_large",
-    vocoder_name_or_card="vocoder_36langs",
-    device=device,
-)
 def predict(
@@ -71,18 +68,25 @@ def predict(
         if new_arr.shape[1] > max_length:
             new_arr = new_arr[:, :max_length]
             gr.Warning(f"Input audio is too long. Only the first {MAX_INPUT_AUDIO_LENGTH} seconds is used.")
-        torchaudio.save(input_data, new_arr, sample_rate=int(AUDIO_SAMPLE_RATE))
     else:
-        input_data = input_text
-    text_out, wav, sr = translator.predict(
-        input=input_data,
-        task_str=task_name,
-        tgt_lang=target_language_code,
-        src_lang=source_language_code,
-        ngram_filtering=True,
-    )
     if task_name in ["S2ST", "T2ST"]:
-        return (sr, wav.cpu().detach().numpy()), text_out
     else:
         return None, text_out
@@ -430,4 +434,4 @@ demo.queue(max_size=50).launch()
 # Linking models to the space
 # 'facebook/seamless-m4t-large'
-# 'facebook/SONAR'

 import numpy as np
 import torch
 import torchaudio
+from transformers import AutoProcessor, SeamlessM4TModel
 from lang_list import (
     LANGUAGE_NAME_TO_CODE,
     S2TT_TARGET_LANGUAGE_NAMES,
     T2TT_TARGET_LANGUAGE_NAMES,
     TEXT_SOURCE_LANGUAGE_NAMES,
+    LANG_TO_SPKR_ID,
 )
 DESCRIPTION = """# SeamlessM4T
 [SeamlessM4T](https://github.com/facebookresearch/seamless_communication) is designed to provide high-quality
 translation, allowing people from different linguistic communities to communicate effortlessly through speech and text.
 This unified model enables multiple tasks like Speech-to-Speech (S2ST), Speech-to-Text (S2TT), Text-to-Speech (T2ST)
 translation and more, without relying on multiple separate models.
 """
 DEFAULT_TARGET_LANGUAGE = "French"
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+processor = AutoProcessor.from_pretrained("ylacombe/hf-seamless-m4t-large")
+model = SeamlessM4TModel.from_pretrained("ylacombe/hf-seamless-m4t-large").to(device)
 def predict(
         if new_arr.shape[1] > max_length:
             new_arr = new_arr[:, :max_length]
             gr.Warning(f"Input audio is too long. Only the first {MAX_INPUT_AUDIO_LENGTH} seconds is used.")
+        input_data = processor(audios = new_arr, sampling_rate=AUDIO_SAMPLE_RATE, return_tensors="pt").to(device)
     else:
+        input_data = processor(text = input_text, src_lang=source_language_code, return_tensors="pt").to(device)
+    if task_name in ["S2TT", "T2TT"]:
+        tokens_ids = model.generate(**input_data, generate_speech=False, tgt_lang=target_language_code, num_beams=5, do_sample=True)[0].cpu().squeeze().detach().tolist()
+    else:
+        output = model.generate(**input_data, return_intermediate_token_ids=True, tgt_lang=target_language_code, num_beams=5, do_sample=True, spkr_id=LANG_TO_SPKR_ID[target_language_code][0])
+        waveform = output.waveform.cpu().squeeze().detach().numpy()
+        tokens_ids = output.sequences.cpu().squeeze().detach().tolist()
+    text_out = processor.decode(tokens_ids, skip_special_tokens=True)
     if task_name in ["S2ST", "T2ST"]:
+        return (AUDIO_SAMPLE_RATE, waveform), text_out
     else:
         return None, text_out
 # Linking models to the space
 # 'facebook/seamless-m4t-large'
+# 'facebook/SONAR'

lang_list.py CHANGED Viewed

@@ -252,3 +252,151 @@ S2ST_TARGET_LANGUAGE_NAMES = sorted([language_code_to_name[code] for code in s2s
 S2TT_TARGET_LANGUAGE_NAMES = TEXT_SOURCE_LANGUAGE_NAMES
 # T2TT
 T2TT_TARGET_LANGUAGE_NAMES = TEXT_SOURCE_LANGUAGE_NAMES

 S2TT_TARGET_LANGUAGE_NAMES = TEXT_SOURCE_LANGUAGE_NAMES
 # T2TT
 T2TT_TARGET_LANGUAGE_NAMES = TEXT_SOURCE_LANGUAGE_NAMES
+LANG_TO_SPKR_ID = {
+    "arb": [
+        0
+    ],
+    "ben": [
+        2,
+        1
+    ],
+    "cat": [
+        3
+    ],
+    "ces": [
+        4
+    ],
+    "cmn": [
+        5
+    ],
+    "cym": [
+        6
+    ],
+    "dan": [
+        7,
+        8
+    ],
+    "deu": [
+        9
+    ],
+    "eng": [
+        10
+    ],
+    "est": [
+        11,
+        12,
+        13
+    ],
+    "fin": [
+        14
+    ],
+    "fra": [
+        15
+    ],
+    "hin": [
+        16
+    ],
+    "ind": [
+        17,
+        24,
+        18,
+        20,
+        19,
+        21,
+        23,
+        27,
+        26,
+        22,
+        25
+    ],
+    "ita": [
+        29,
+        28
+    ],
+    "jpn": [
+        30
+    ],
+    "kor": [
+        31
+    ],
+    "mlt": [
+        32,
+        33,
+        34
+    ],
+    "nld": [
+        35
+    ],
+    "pes": [
+        36
+    ],
+    "pol": [
+        37
+    ],
+    "por": [
+        38
+    ],
+    "ron": [
+        39
+    ],
+    "rus": [
+        40
+    ],
+    "slk": [
+        41
+    ],
+    "spa": [
+        42
+    ],
+    "swe": [
+        43,
+        45,
+        44
+    ],
+    "swh": [
+        46,
+        48,
+        47
+    ],
+    "tel": [
+        49
+    ],
+    "tgl": [
+        50
+    ],
+    "tha": [
+        51,
+        54,
+        55,
+        52,
+        53
+    ],
+    "tur": [
+        58,
+        57,
+        56
+    ],
+    "ukr": [
+        59
+    ],
+    "urd": [
+        60,
+        61,
+        62
+    ],
+    "uzn": [
+        63,
+        64,
+        65
+    ],
+    "vie": [
+        66,
+        67,
+        70,
+        71,
+        68,
+        69
+    ]
+}

requirements.txt CHANGED Viewed

@@ -1,6 +1,3 @@
-fairseq2==0.1.0
-git+https://github.com/facebookresearch/seamless_communication
-gradio==3.40.1
-huggingface_hub==0.16.4
-torch==2.0.1
 torchaudio==2.0.2

+git+https://github.com/huggingface/transformers
 torchaudio==2.0.2
+sentencepiece