Spaces:
Runtime error
Runtime error
hf_implementation (#23)
Browse files - update with HF implementation (b3882fafaf5d0c32dd9b458e7efbcac2469293a1)
Co-authored-by: Yoach Lacombe <[email protected]>
- Dockerfile +0 -56
- README.md +3 -2
- app.py +23 -19
- lang_list.py +148 -0
- requirements.txt +2 -5
Dockerfile
DELETED
|
@@ -1,56 +0,0 @@
|
|
| 1 |
-
FROM nvidia/cuda:11.7.1-cudnn8-devel-ubuntu22.04
|
| 2 |
-
ENV DEBIAN_FRONTEND=noninteractive
|
| 3 |
-
RUN apt-get update && \
|
| 4 |
-
apt-get upgrade -y && \
|
| 5 |
-
apt-get install -y --no-install-recommends \
|
| 6 |
-
git \
|
| 7 |
-
git-lfs \
|
| 8 |
-
wget \
|
| 9 |
-
curl \
|
| 10 |
-
# python build dependencies \
|
| 11 |
-
build-essential \
|
| 12 |
-
libssl-dev \
|
| 13 |
-
zlib1g-dev \
|
| 14 |
-
libbz2-dev \
|
| 15 |
-
libreadline-dev \
|
| 16 |
-
libsqlite3-dev \
|
| 17 |
-
libncursesw5-dev \
|
| 18 |
-
xz-utils \
|
| 19 |
-
tk-dev \
|
| 20 |
-
libxml2-dev \
|
| 21 |
-
libxmlsec1-dev \
|
| 22 |
-
libffi-dev \
|
| 23 |
-
liblzma-dev \
|
| 24 |
-
# gradio dependencies \
|
| 25 |
-
ffmpeg \
|
| 26 |
-
# fairseq2 dependencies \
|
| 27 |
-
libsndfile-dev && \
|
| 28 |
-
apt-get clean && \
|
| 29 |
-
rm -rf /var/lib/apt/lists/*
|
| 30 |
-
|
| 31 |
-
RUN useradd -m -u 1000 user
|
| 32 |
-
USER user
|
| 33 |
-
ENV HOME=/home/user \
|
| 34 |
-
PATH=/home/user/.local/bin:${PATH}
|
| 35 |
-
WORKDIR ${HOME}/app
|
| 36 |
-
|
| 37 |
-
RUN curl https://pyenv.run | bash
|
| 38 |
-
ENV PATH=${HOME}/.pyenv/shims:${HOME}/.pyenv/bin:${PATH}
|
| 39 |
-
ARG PYTHON_VERSION=3.10.12
|
| 40 |
-
RUN pyenv install ${PYTHON_VERSION} && \
|
| 41 |
-
pyenv global ${PYTHON_VERSION} && \
|
| 42 |
-
pyenv rehash && \
|
| 43 |
-
pip install --no-cache-dir -U pip setuptools wheel
|
| 44 |
-
|
| 45 |
-
COPY --chown=1000 ./requirements.txt /tmp/requirements.txt
|
| 46 |
-
RUN pip install --no-cache-dir --upgrade -r /tmp/requirements.txt
|
| 47 |
-
|
| 48 |
-
COPY --chown=1000 . ${HOME}/app
|
| 49 |
-
ENV PYTHONPATH=${HOME}/app \
|
| 50 |
-
PYTHONUNBUFFERED=1 \
|
| 51 |
-
GRADIO_ALLOW_FLAGGING=never \
|
| 52 |
-
GRADIO_NUM_PORTS=1 \
|
| 53 |
-
GRADIO_SERVER_NAME=0.0.0.0 \
|
| 54 |
-
GRADIO_THEME=huggingface \
|
| 55 |
-
SYSTEM=spaces
|
| 56 |
-
CMD ["python", "app.py"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
README.md
CHANGED
|
@@ -3,9 +3,10 @@ title: Seamless M4T
|
|
| 3 |
emoji: 📞
|
| 4 |
colorFrom: blue
|
| 5 |
colorTo: yellow
|
| 6 |
-
sdk: docker
|
|
|
|
| 7 |
pinned: false
|
| 8 |
suggested_hardware: t4-medium
|
| 9 |
---
|
| 10 |
|
| 11 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
| 3 |
emoji: 📞
|
| 4 |
colorFrom: blue
|
| 5 |
colorTo: yellow
|
| 6 |
+
sdk: gradio
|
| 7 |
+
app_file: app.py
|
| 8 |
pinned: false
|
| 9 |
suggested_hardware: t4-medium
|
| 10 |
---
|
| 11 |
|
| 12 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
CHANGED
|
@@ -6,7 +6,7 @@ import gradio as gr
|
|
| 6 |
import numpy as np
|
| 7 |
import torch
|
| 8 |
import torchaudio
|
| 9 |
-
from
|
| 10 |
|
| 11 |
from lang_list import (
|
| 12 |
LANGUAGE_NAME_TO_CODE,
|
|
@@ -14,13 +14,12 @@ from lang_list import (
|
|
| 14 |
S2TT_TARGET_LANGUAGE_NAMES,
|
| 15 |
T2TT_TARGET_LANGUAGE_NAMES,
|
| 16 |
TEXT_SOURCE_LANGUAGE_NAMES,
|
|
|
|
| 17 |
)
|
| 18 |
|
| 19 |
DESCRIPTION = """# SeamlessM4T
|
| 20 |
-
|
| 21 |
[SeamlessM4T](https://github.com/facebookresearch/seamless_communication) is designed to provide high-quality
|
| 22 |
translation, allowing people from different linguistic communities to communicate effortlessly through speech and text.
|
| 23 |
-
|
| 24 |
This unified model enables multiple tasks like Speech-to-Speech (S2ST), Speech-to-Text (S2TT), Text-to-Speech (T2ST)
|
| 25 |
translation and more, without relying on multiple separate models.
|
| 26 |
"""
|
|
@@ -39,11 +38,9 @@ MAX_INPUT_AUDIO_LENGTH = 60 # in seconds
|
|
| 39 |
DEFAULT_TARGET_LANGUAGE = "French"
|
| 40 |
|
| 41 |
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
device=device,
|
| 46 |
-
)
|
| 47 |
|
| 48 |
|
| 49 |
def predict(
|
|
@@ -71,18 +68,25 @@ def predict(
|
|
| 71 |
if new_arr.shape[1] > max_length:
|
| 72 |
new_arr = new_arr[:, :max_length]
|
| 73 |
gr.Warning(f"Input audio is too long. Only the first {MAX_INPUT_AUDIO_LENGTH} seconds is used.")
|
| 74 |
-
|
|
|
|
|
|
|
| 75 |
else:
|
| 76 |
-
input_data = input_text
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
tgt_lang=target_language_code,
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
if task_name in ["S2ST", "T2ST"]:
|
| 85 |
-
return (
|
| 86 |
else:
|
| 87 |
return None, text_out
|
| 88 |
|
|
@@ -430,4 +434,4 @@ demo.queue(max_size=50).launch()
|
|
| 430 |
|
| 431 |
# Linking models to the space
|
| 432 |
# 'facebook/seamless-m4t-large'
|
| 433 |
-
# 'facebook/SONAR'
|
|
|
|
| 6 |
import numpy as np
|
| 7 |
import torch
|
| 8 |
import torchaudio
|
| 9 |
+
from transformers import AutoProcessor, SeamlessM4TModel
|
| 10 |
|
| 11 |
from lang_list import (
|
| 12 |
LANGUAGE_NAME_TO_CODE,
|
|
|
|
| 14 |
S2TT_TARGET_LANGUAGE_NAMES,
|
| 15 |
T2TT_TARGET_LANGUAGE_NAMES,
|
| 16 |
TEXT_SOURCE_LANGUAGE_NAMES,
|
| 17 |
+
LANG_TO_SPKR_ID,
|
| 18 |
)
|
| 19 |
|
| 20 |
DESCRIPTION = """# SeamlessM4T
|
|
|
|
| 21 |
[SeamlessM4T](https://github.com/facebookresearch/seamless_communication) is designed to provide high-quality
|
| 22 |
translation, allowing people from different linguistic communities to communicate effortlessly through speech and text.
|
|
|
|
| 23 |
This unified model enables multiple tasks like Speech-to-Speech (S2ST), Speech-to-Text (S2TT), Text-to-Speech (T2ST)
|
| 24 |
translation and more, without relying on multiple separate models.
|
| 25 |
"""
|
|
|
|
| 38 |
DEFAULT_TARGET_LANGUAGE = "French"
|
| 39 |
|
| 40 |
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
|
| 41 |
+
|
| 42 |
+
processor = AutoProcessor.from_pretrained("ylacombe/hf-seamless-m4t-large")
|
| 43 |
+
model = SeamlessM4TModel.from_pretrained("ylacombe/hf-seamless-m4t-large").to(device)
|
|
|
|
|
|
|
| 44 |
|
| 45 |
|
| 46 |
def predict(
|
|
|
|
| 68 |
if new_arr.shape[1] > max_length:
|
| 69 |
new_arr = new_arr[:, :max_length]
|
| 70 |
gr.Warning(f"Input audio is too long. Only the first {MAX_INPUT_AUDIO_LENGTH} seconds is used.")
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
input_data = processor(audios = new_arr, sampling_rate=AUDIO_SAMPLE_RATE, return_tensors="pt").to(device)
|
| 74 |
else:
|
| 75 |
+
input_data = processor(text = input_text, src_lang=source_language_code, return_tensors="pt").to(device)
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
if task_name in ["S2TT", "T2TT"]:
|
| 79 |
+
tokens_ids = model.generate(**input_data, generate_speech=False, tgt_lang=target_language_code, num_beams=5, do_sample=True)[0].cpu().squeeze().detach().tolist()
|
| 80 |
+
else:
|
| 81 |
+
output = model.generate(**input_data, return_intermediate_token_ids=True, tgt_lang=target_language_code, num_beams=5, do_sample=True, spkr_id=LANG_TO_SPKR_ID[target_language_code][0])
|
| 82 |
+
|
| 83 |
+
waveform = output.waveform.cpu().squeeze().detach().numpy()
|
| 84 |
+
tokens_ids = output.sequences.cpu().squeeze().detach().tolist()
|
| 85 |
+
|
| 86 |
+
text_out = processor.decode(tokens_ids, skip_special_tokens=True)
|
| 87 |
+
|
| 88 |
if task_name in ["S2ST", "T2ST"]:
|
| 89 |
+
return (AUDIO_SAMPLE_RATE, waveform), text_out
|
| 90 |
else:
|
| 91 |
return None, text_out
|
| 92 |
|
|
|
|
| 434 |
|
| 435 |
# Linking models to the space
|
| 436 |
# 'facebook/seamless-m4t-large'
|
| 437 |
+
# 'facebook/SONAR'
|
lang_list.py
CHANGED
|
@@ -252,3 +252,151 @@ S2ST_TARGET_LANGUAGE_NAMES = sorted([language_code_to_name[code] for code in s2s
|
|
| 252 |
S2TT_TARGET_LANGUAGE_NAMES = TEXT_SOURCE_LANGUAGE_NAMES
|
| 253 |
# T2TT
|
| 254 |
T2TT_TARGET_LANGUAGE_NAMES = TEXT_SOURCE_LANGUAGE_NAMES
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 252 |
S2TT_TARGET_LANGUAGE_NAMES = TEXT_SOURCE_LANGUAGE_NAMES
|
| 253 |
# T2TT
|
| 254 |
T2TT_TARGET_LANGUAGE_NAMES = TEXT_SOURCE_LANGUAGE_NAMES
|
| 255 |
+
|
| 256 |
+
|
| 257 |
+
LANG_TO_SPKR_ID = {
|
| 258 |
+
"arb": [
|
| 259 |
+
0
|
| 260 |
+
],
|
| 261 |
+
"ben": [
|
| 262 |
+
2,
|
| 263 |
+
1
|
| 264 |
+
],
|
| 265 |
+
"cat": [
|
| 266 |
+
3
|
| 267 |
+
],
|
| 268 |
+
"ces": [
|
| 269 |
+
4
|
| 270 |
+
],
|
| 271 |
+
"cmn": [
|
| 272 |
+
5
|
| 273 |
+
],
|
| 274 |
+
"cym": [
|
| 275 |
+
6
|
| 276 |
+
],
|
| 277 |
+
"dan": [
|
| 278 |
+
7,
|
| 279 |
+
8
|
| 280 |
+
],
|
| 281 |
+
"deu": [
|
| 282 |
+
9
|
| 283 |
+
],
|
| 284 |
+
"eng": [
|
| 285 |
+
10
|
| 286 |
+
],
|
| 287 |
+
"est": [
|
| 288 |
+
11,
|
| 289 |
+
12,
|
| 290 |
+
13
|
| 291 |
+
],
|
| 292 |
+
"fin": [
|
| 293 |
+
14
|
| 294 |
+
],
|
| 295 |
+
"fra": [
|
| 296 |
+
15
|
| 297 |
+
],
|
| 298 |
+
"hin": [
|
| 299 |
+
16
|
| 300 |
+
],
|
| 301 |
+
"ind": [
|
| 302 |
+
17,
|
| 303 |
+
24,
|
| 304 |
+
18,
|
| 305 |
+
20,
|
| 306 |
+
19,
|
| 307 |
+
21,
|
| 308 |
+
23,
|
| 309 |
+
27,
|
| 310 |
+
26,
|
| 311 |
+
22,
|
| 312 |
+
25
|
| 313 |
+
],
|
| 314 |
+
"ita": [
|
| 315 |
+
29,
|
| 316 |
+
28
|
| 317 |
+
],
|
| 318 |
+
"jpn": [
|
| 319 |
+
30
|
| 320 |
+
],
|
| 321 |
+
"kor": [
|
| 322 |
+
31
|
| 323 |
+
],
|
| 324 |
+
"mlt": [
|
| 325 |
+
32,
|
| 326 |
+
33,
|
| 327 |
+
34
|
| 328 |
+
],
|
| 329 |
+
"nld": [
|
| 330 |
+
35
|
| 331 |
+
],
|
| 332 |
+
"pes": [
|
| 333 |
+
36
|
| 334 |
+
],
|
| 335 |
+
"pol": [
|
| 336 |
+
37
|
| 337 |
+
],
|
| 338 |
+
"por": [
|
| 339 |
+
38
|
| 340 |
+
],
|
| 341 |
+
"ron": [
|
| 342 |
+
39
|
| 343 |
+
],
|
| 344 |
+
"rus": [
|
| 345 |
+
40
|
| 346 |
+
],
|
| 347 |
+
"slk": [
|
| 348 |
+
41
|
| 349 |
+
],
|
| 350 |
+
"spa": [
|
| 351 |
+
42
|
| 352 |
+
],
|
| 353 |
+
"swe": [
|
| 354 |
+
43,
|
| 355 |
+
45,
|
| 356 |
+
44
|
| 357 |
+
],
|
| 358 |
+
"swh": [
|
| 359 |
+
46,
|
| 360 |
+
48,
|
| 361 |
+
47
|
| 362 |
+
],
|
| 363 |
+
"tel": [
|
| 364 |
+
49
|
| 365 |
+
],
|
| 366 |
+
"tgl": [
|
| 367 |
+
50
|
| 368 |
+
],
|
| 369 |
+
"tha": [
|
| 370 |
+
51,
|
| 371 |
+
54,
|
| 372 |
+
55,
|
| 373 |
+
52,
|
| 374 |
+
53
|
| 375 |
+
],
|
| 376 |
+
"tur": [
|
| 377 |
+
58,
|
| 378 |
+
57,
|
| 379 |
+
56
|
| 380 |
+
],
|
| 381 |
+
"ukr": [
|
| 382 |
+
59
|
| 383 |
+
],
|
| 384 |
+
"urd": [
|
| 385 |
+
60,
|
| 386 |
+
61,
|
| 387 |
+
62
|
| 388 |
+
],
|
| 389 |
+
"uzn": [
|
| 390 |
+
63,
|
| 391 |
+
64,
|
| 392 |
+
65
|
| 393 |
+
],
|
| 394 |
+
"vie": [
|
| 395 |
+
66,
|
| 396 |
+
67,
|
| 397 |
+
70,
|
| 398 |
+
71,
|
| 399 |
+
68,
|
| 400 |
+
69
|
| 401 |
+
]
|
| 402 |
+
}
|
requirements.txt
CHANGED
|
@@ -1,6 +1,3 @@
|
|
| 1 |
-
|
| 2 |
-
git+https://github.com/facebookresearch/seamless_communication
|
| 3 |
-
gradio==3.40.1
|
| 4 |
-
huggingface_hub==0.16.4
|
| 5 |
-
torch==2.0.1
|
| 6 |
torchaudio==2.0.2
|
|
|
|
|
|
| 1 |
+
git+https://github.com/huggingface/transformers
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
torchaudio==2.0.2
|
| 3 |
+
sentencepiece
|