# Hugging Face Space status: Running on Zero (ZeroGPU).
import sys | |
import time | |
try: | |
import spaces | |
except ImportError: | |
print("ZeroGPU is not available, skipping...") | |
import torch | |
import torchaudio | |
import gradio as gr | |
import torchaudio.transforms as T | |
import polars as pl | |
from importlib.metadata import version | |
from gradio.utils import is_zero_gpu_space | |
from gradio.themes import Base | |
from paddleocr import PaddleOCR | |
from transformers import ( | |
AutoModelForCausalLM, | |
AutoTokenizer, | |
AutoModelForCTC, | |
Wav2Vec2BertProcessor, | |
) | |
# Detect the runtime once at import time; the rest of the module reads these flags.
use_zero_gpu = is_zero_gpu_space()
use_cuda = torch.cuda.is_available()

# The installed `spaces` version is only looked up when running on ZeroGPU.
spaces_version = version("spaces") if use_zero_gpu else "N/A"
print(
    "ZeroGPU is available, changing inference call."
    if use_zero_gpu
    else "ZeroGPU is not available, skipping..."
)
print(f"Spaces version: {spaces_version}")

if use_cuda:
    print("CUDA is available, setting correct `device` variable.")

# Only the device depends on CUDA; bfloat16 is used on both CUDA and CPU.
device = "cuda" if use_cuda else "cpu"
torch_dtype = torch.bfloat16
# Config
model_name = "Yehor/kulyk-uk-en"  # translation LLM checkpoint
asr_model_name = "Yehor/w2v-bert-uk-v2.1-bf16"  # speech-recognition checkpoint
concurrency_limit = 5  # passed to every "Translate" button handler
min_duration = 0.5  # shortest accepted audio, in seconds
max_duration = 60  # longest accepted audio, in seconds
current_theme = Base()

# Load the model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch_dtype,
    device_map=device,
)
model.eval()
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load ASR
audio_model = AutoModelForCTC.from_pretrained(
    asr_model_name,
    torch_dtype=torch_dtype,
    device_map=device,
)
processor = Wav2Vec2BertProcessor.from_pretrained(asr_model_name)

# Load OCR
# Only text detection + recognition are used; the optional document
# orientation / unwarping / text-line orientation stages are switched off.
ocr_model = PaddleOCR(
    lang="uk",
    use_doc_orientation_classify=False,
    use_doc_unwarping=False,
    use_textline_orientation=False,
)
# Examples
# Ukrainian headlines offered as one-click inputs on the text tab.
examples_text = [
    "WP: F-16 навряд чи суттєво змінять ситуацію на полі бою",
    "Над Україною збито ракету та 7 із 8 «Шахедів»",
    "Олімпійські ігри 2024. Розклад змагань українських спортсменів на 28 липня",
    "Кампанія Гарріс зібрала понад 200 мільйонів доларів менш ніж за тиждень",
    "За тиждень НБУ продав майже 800 мільйонів доларів на міжбанківському ринку",
    "Париж 2024. День 2: Текстова трансляція",
]

# Bundled sample files are numbered example_1 .. example_6.
examples_audio = [f"example_{index}.wav" for index in range(1, 7)]
examples_image = [f"example_{index}.jpg" for index in range(1, 7)]
# Page title, also interpolated into the Markdown heading below.
title = "UK-EN Translator"

# Markdown shown on the "Authors" tab.
# NOTE(review): the e-mail cell contains "[email protected]", which looks like
# an e-mail-obfuscation placeholder rather than a real address - restore the
# intended address before publishing.
authors_table = """
## Authors
Follow them on social networks and **contact** if you need any help or have any questions:
| <img src="https://avatars.githubusercontent.com/u/7875085?v=4" width="100"> **Yehor Smoliakov** |
|-------------------------------------------------------------------------------------------------|
| https://t.me/smlkw in Telegram |
| https://x.com/yehor_smoliakov at X |
| https://github.com/egorsmkv at GitHub |
| https://huggingface.co/Yehor at Hugging Face |
| or use [email protected] |
""".strip()

# Markdown header rendered at the top of every translation tab.
description_head = "\n".join(
    (
        f"# {title}",
        "This space translates your text, audio, image from Ukrainian to English using [kulyk-uk-en](https://huggingface.co/Yehor/kulyk-uk-en) model. Also, check [EN-UK Translator](https://huggingface.co/spaces/Yehor/en-uk-translator) out.",
    )
)
# Markdown for the "Environment, Models, and Libraries" tab: runtime details
# captured at import time plus links to the models this app loads.
# The kulyk link points at the UK->EN checkpoint that is actually loaded
# (it previously pointed at the reverse-direction `kulyk-en-uk` model).
tech_env = f"""
#### Environment
- Python: {sys.version}
- Torch device: {device}
- Torch dtype: {torch_dtype}
#### Models
- [kulyk-uk-en](https://huggingface.co/Yehor/kulyk-uk-en)
- [wav2vec2-bert](https://huggingface.co/Yehor/w2v-bert-uk-v2.1-bf16)
- [PaddleOCR](https://huggingface.co/PaddlePaddle/eslav_PP-OCRv5_mobile_rec)
""".strip()

# Installed versions of the key libraries, read from package metadata.
tech_libraries = f"""
#### Libraries
- torch: {version("torch")}
- torchaudio: {version("torchaudio")}
- transformers: {version("transformers")}
- accelerate: {version("accelerate")}
- gradio: {version("gradio")}
""".strip()
def translate(text: str) -> str:
    """Translate one piece of Ukrainian text into English.

    The text is wrapped in a single-turn chat prompt, the module-level
    causal LM generates a continuation with greedy decoding, and only the
    newly generated tokens are decoded.

    Args:
        text: Source text to translate.

    Returns:
        The decoded model output with surrounding whitespace removed.
    """
    # This app translates Ukrainian -> English, so the instruction must name
    # English as the target language (it previously asked for Ukrainian).
    prompt = "Translate the text to English:\n" + text

    input_ids = tokenizer.apply_chat_template(
        [{"role": "user", "content": prompt}],
        add_generation_prompt=True,
        return_tensors="pt",
        tokenize=True,
    ).to(model.device)

    # Greedy search with a light repetition penalty.
    output = model.generate(
        input_ids,
        max_new_tokens=2048,
        do_sample=False,
        repetition_penalty=1.05,
    )

    # `generate` returns prompt + continuation; drop the prompt tokens.
    prompt_len = input_ids.shape[1]
    generated_tokens = output[:, prompt_len:]

    translated_text = tokenizer.batch_decode(
        generated_tokens, skip_special_tokens=True
    )[0]
    return translated_text.strip()
def inference_text(text, progress=gr.Progress()):
    """Translate pasted text line by line.

    Args:
        text: Raw textbox content; each non-blank line is translated on its
            own.
        progress: Gradio progress tracker (the `gr.Progress()` default is
            how Gradio injects it into event handlers).

    Returns:
        A polars DataFrame with one row per translated line and the columns
        `sentence`, `translated_text` and `elapsed_time` (seconds, rounded
        to two decimals).

    Raises:
        gr.Error: If the input is empty or contains only whitespace.
    """
    if not text:
        raise gr.Error("Please paste your text.")

    stripped_lines = (line.strip() for line in text.split("\n"))
    sentences = [line for line in stripped_lines if line]

    # Whitespace-only input passes the check above but leaves nothing to
    # translate; report it instead of returning an empty table.
    if not sentences:
        raise gr.Error("Please paste your text.")

    progress(0, desc="Translating...")

    results = []
    for sentence in progress.tqdm(sentences, desc="Translating...", unit="sentence"):
        # perf_counter is monotonic, so the interval cannot be skewed by
        # wall-clock adjustments.
        started = time.perf_counter()
        translated_text = translate(sentence)
        elapsed_time = round(time.perf_counter() - started, 2)

        results.append(
            {
                "sentence": sentence,
                "translated_text": translated_text,
                "elapsed_time": elapsed_time,
            }
        )

    gr.Info("Finished!", duration=2)
    return pl.DataFrame(results)
def inference_audio(audio, progress=gr.Progress()):
    """Transcribe an uploaded audio file and translate the transcript.

    The file is validated against the configured duration limits, mixed
    down to mono, resampled to 16 kHz, transcribed with the CTC model
    (argmax decoding) and then translated line by line.

    Args:
        audio: Path to the uploaded audio file.
        progress: Gradio progress tracker (the `gr.Progress()` default is
            how Gradio injects it into event handlers).

    Returns:
        A polars DataFrame with one row per transcribed line and the
        columns `sentence`, `translated_text` and `elapsed_time` (seconds,
        rounded to two decimals).

    Raises:
        gr.Error: If no file is given, the duration is outside
            `[min_duration, max_duration]`, or nothing was recognized.
    """
    if not audio:
        raise gr.Error("Please paste your audio file.")

    progress(0, desc="Translating...")

    # Check the duration from the file metadata before decoding the audio.
    meta = torchaudio.info(audio)
    duration = meta.num_frames / meta.sample_rate
    if duration < min_duration:
        raise gr.Error(
            f"The duration of the file is less than {min_duration} seconds, it is {round(duration, 2)} seconds."
        )
    if duration > max_duration:
        raise gr.Error(f"The duration of the file exceeds {max_duration} seconds.")

    audio_input, sr = torchaudio.load(audio)

    # Average the channels so the model receives a single (mono) channel.
    if meta.num_channels > 1:
        audio_input = torch.mean(audio_input, dim=0, keepdim=True)

    if meta.sample_rate != 16_000:
        resampler = T.Resample(sr, 16_000, dtype=audio_input.dtype)
        audio_input = resampler(audio_input)

    audio_input = audio_input.squeeze().numpy()

    # Ask the processor for tensors directly instead of converting a list of
    # NumPy arrays with `torch.tensor`, which is the slow conversion path.
    features = processor(
        [audio_input], sampling_rate=16_000, return_tensors="pt"
    ).input_features
    features = features.to(device, dtype=torch_dtype)

    with torch.inference_mode():
        logits = audio_model(features).logits

    predicted_ids = torch.argmax(logits, dim=-1)
    predictions = processor.batch_decode(predicted_ids)
    print("Predictions:", predictions)

    text = "\n".join(predictions)
    print("Text:", text)

    stripped_lines = (line.strip() for line in text.split("\n"))
    sentences = [line for line in stripped_lines if line]

    # An empty transcription comes back as [""], which used to slip past the
    # emptiness check and produce an empty table; tell the user instead.
    if not sentences:
        raise gr.Error("No speech was recognized in the audio file.")

    results = []
    for sentence in progress.tqdm(sentences, desc="Translating...", unit="sentence"):
        t0 = time.time()
        translated_text = translate(sentence)
        elapsed_time = round(time.time() - t0, 2)

        results.append(
            {
                "sentence": sentence,
                "translated_text": translated_text,
                "elapsed_time": elapsed_time,
            }
        )

    gr.Info("Finished!", duration=2)
    return pl.DataFrame(results)
def inference_image(image, progress=gr.Progress()):
    """Run OCR on an uploaded image and translate the recognized text.

    All recognized text fragments are joined with spaces into one string
    and translated as a single sentence.

    Args:
        image: Path to the uploaded image file.
        progress: Gradio progress tracker (the `gr.Progress()` default is
            how Gradio injects it into event handlers).

    Returns:
        A polars DataFrame with a single row and the columns `sentence`,
        `translated_text` and `elapsed_time` (seconds, rounded to two
        decimals).

    Raises:
        gr.Error: If no image path is given or no text was recognized.
    """
    if not image or not isinstance(image, str):
        raise gr.Error("Please paste your image file.")

    progress(0, desc="Translating...")

    predictions = ocr_model.predict(image)
    recognized = [" ".join(prediction["rec_texts"]) for prediction in predictions]
    text = " ".join(recognized).strip()
    print("Text:", text)

    # Without this guard an image with no readable text would send an empty
    # prompt to the translation model.
    if not text:
        raise gr.Error("No text was recognized in the image.")

    results = []
    for sentence in progress.tqdm([text], desc="Translating...", unit="sentence"):
        t0 = time.time()
        translated_text = translate(sentence)
        elapsed_time = round(time.time() - t0, 2)

        results.append(
            {
                "sentence": sentence,
                "translated_text": translated_text,
                "elapsed_time": elapsed_time,
            }
        )

    gr.Info("Finished!", duration=2)
    return pl.DataFrame(results)
def create_app():
    """Build the text tab: a textbox, a Translate button and a results table."""
    with gr.Blocks(
        title=title,
        analytics_enabled=False,
        theme=current_theme,
    ) as tab:
        gr.Markdown(description_head)
        gr.Markdown("## Usage")

        output_table = gr.DataFrame(label="Translated text")
        text_input = gr.Textbox(label="Text", autofocus=True, lines=5)

        translate_button = gr.Button("Translate")
        translate_button.click(
            inference_text,
            inputs=text_input,
            outputs=output_table,
            concurrency_limit=concurrency_limit,
        )

        with gr.Row():
            gr.Examples(
                label="Choose an example",
                inputs=text_input,
                examples=examples_text,
            )

    return tab
def create_audio_app():
    """Build the audio tab: a file upload, a Translate button and a results table."""
    with gr.Blocks(theme=current_theme) as tab:
        gr.Markdown(description_head)
        gr.Markdown("## Usage")

        output_table = gr.DataFrame(label="Translated text")
        audio_input = gr.Audio(label="Audio file", sources="upload", type="filepath")

        translate_button = gr.Button("Translate")
        translate_button.click(
            inference_audio,
            inputs=audio_input,
            outputs=output_table,
            concurrency_limit=concurrency_limit,
        )

        with gr.Row():
            gr.Examples(
                label="Choose an example",
                inputs=audio_input,
                examples=examples_audio,
            )

        # Tell users about the upload limit that the handler enforces.
        gr.Markdown(
            f"> Due to resource limitations, audio duration **must not** exceed **{max_duration}** seconds."
        )

    return tab
def create_image_app():
    """Build the image tab: a file upload, a Translate button and a results table."""
    with gr.Blocks(theme=current_theme) as tab:
        gr.Markdown(description_head)
        gr.Markdown("## Usage")

        output_table = gr.DataFrame(label="Translated text")
        image_input = gr.Image(label="Image file", sources="upload", type="filepath")

        translate_button = gr.Button("Translate")
        translate_button.click(
            inference_image,
            inputs=image_input,
            outputs=output_table,
            concurrency_limit=concurrency_limit,
        )

        with gr.Row():
            gr.Examples(
                label="Choose an example",
                inputs=image_input,
                examples=examples_image,
            )

    return tab
def create_env():
    """Build the tab that shows the environment, model and library details."""
    with gr.Blocks(theme=current_theme) as tab:
        for markdown_text in (tech_env, tech_libraries):
            gr.Markdown(markdown_text)

    return tab
def create_authors():
    """Build the tab that renders the authors Markdown table."""
    tab = gr.Blocks(theme=current_theme)
    with tab:
        gr.Markdown(authors_table)

    return tab
def create_demo():
    """Assemble all tabs into a single tabbed Gradio interface."""
    # Each label sits next to the tab it names; the tabs are still built in
    # the order text, audio, image, authors, environment.
    named_tabs = [
        ("✍️ Text", create_app()),
        ("🔊 Audio", create_audio_app()),
        ("👀 Image", create_image_app()),
        ("👥 Authors", create_authors()),
        ("📦 Environment, Models, and Libraries", create_env()),
    ]

    return gr.TabbedInterface(
        [tab for _, tab in named_tabs],
        tab_names=[name for name, _ in named_tabs],
    )
if __name__ == "__main__":
    # Build the UI, enable the request queue, then start the server.
    demo = create_demo()
    demo.queue().launch()