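"""Gradio app that translates Ukrainian text, audio, and images into English.

Text goes straight to the kulyk-uk-en model; audio is first transcribed with a
wav2vec2-BERT CTC model, and images are first read with PaddleOCR.
"""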
import sys
import time
try:
    import spaces
except ImportError:
    print("ZeroGPU is not available, skipping...")

    # Fallback stub so the bare `@spaces.GPU` decorators below become no-ops.
    from types import SimpleNamespace

    spaces = SimpleNamespace(GPU=lambda func: func)
import torch
import torchaudio
import gradio as gr
import torchaudio.transforms as T
import polars as pl
from importlib.metadata import version
from gradio.utils import is_zero_gpu_space
from gradio.themes import Base
from paddleocr import PaddleOCR
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
AutoModelForCTC,
Wav2Vec2BertProcessor,
)
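# Detect the runtime: ZeroGPU Spaces attach a GPU only inside functions decorated with @spaces.GPU.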
use_zero_gpu = is_zero_gpu_space()
use_cuda = torch.cuda.is_available()
if use_zero_gpu:
spaces_version = version("spaces")
print("ZeroGPU is available, changing inference call.")
else:
spaces_version = "N/A"
print("ZeroGPU is not available, skipping...")
print(f"Spaces version: {spaces_version}")
if use_cuda:
    print("CUDA is available, setting `device` to CUDA.")
    device = "cuda"
    torch_dtype = torch.bfloat16
else:
    print("CUDA is not available, falling back to CPU.")
    device = "cpu"
    # bfloat16 is poorly supported on many CPUs; use full precision instead.
    torch_dtype = torch.float32
# Config
model_name = "Yehor/kulyk-uk-en"
concurrency_limit = 5
min_duration = 0.5
max_duration = 60
current_theme = Base()
# Load the model
model = AutoModelForCausalLM.from_pretrained(
model_name,
device_map=device,
torch_dtype=torch_dtype,
)
model.eval()
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Load ASR
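# The CTC model transcribes Ukrainian speech; its processor converts 16 kHz mono waveforms into input features.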
audio_model = AutoModelForCTC.from_pretrained(
"Yehor/w2v-bert-uk-v2.1-bf16", torch_dtype=torch_dtype, device_map=device
)
processor = Wav2Vec2BertProcessor.from_pretrained("Yehor/w2v-bert-uk-v2.1-bf16")
# Load OCR
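# Document-orientation, unwarping, and textline-orientation steps are disabled; only detection and recognition run.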
ocr_model = PaddleOCR(
lang="uk",
use_doc_orientation_classify=False,
use_doc_unwarping=False,
use_textline_orientation=False,
)
# Examples
examples_text = [
"WP: F-16 навряд чи суттєво змінять ситуацію на полі бою",
"Над Україною збито ракету та 7 із 8 «Шахедів»",
"Олімпійські ігри 2024. Розклад змагань українських спортсменів на 28 липня",
"Кампанія Гарріс зібрала понад 200 мільйонів доларів менш ніж за тиждень",
"За тиждень НБУ продав майже 800 мільйонів доларів на міжбанківському ринку",
"Париж 2024. День 2: Текстова трансляція",
]
examples_audio = [
"example_1.wav",
"example_2.wav",
"example_3.wav",
"example_4.wav",
"example_5.wav",
"example_6.wav",
]
examples_image = [
"example_1.jpg",
"example_2.jpg",
"example_3.jpg",
"example_4.jpg",
"example_5.jpg",
"example_6.jpg",
]
title = "UK-EN Translator"
authors_table = """
## Authors
Follow them on social networks and **contact them** if you need any help or have any questions:
| <img src="https://avatars.githubusercontent.com/u/7875085?v=4" width="100"> **Yehor Smoliakov** |
|-------------------------------------------------------------------------------------------------|
| https://t.me/smlkw on Telegram |
| https://x.com/yehor_smoliakov on X |
| https://github.com/egorsmkv on GitHub |
| https://huggingface.co/Yehor on Hugging Face |
| or use [email protected] |
""".strip()
description_head = f"""
# {title}
This Space translates your text, audio, or images from Ukrainian to English using the [kulyk-uk-en](https://huggingface.co/Yehor/kulyk-uk-en) model. Also, check out the [EN-UK Translator](https://huggingface.co/spaces/Yehor/en-uk-translator).
""".strip()
tech_env = f"""
#### Environment
- Python: {sys.version}
- Torch device: {device}
- Torch dtype: {torch_dtype}
#### Models
- [kulyk-uk-en](https://huggingface.co/Yehor/kulyk-uk-en)
- [wav2vec2-bert](https://huggingface.co/Yehor/w2v-bert-uk-v2.1-bf16)
- [PaddleOCR](https://huggingface.co/PaddlePaddle/eslav_PP-OCRv5_mobile_rec)
""".strip()
tech_libraries = f"""
#### Libraries
- torch: {version("torch")}
- torchaudio: {version("torchaudio")}
- transformers: {version("transformers")}
- accelerate: {version("accelerate")}
- gradio: {version("gradio")}
""".strip()
def translate(text: str) -> str:
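    """Translate a single Ukrainian sentence to English using the model's chat template and greedy decoding."""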
    prompt = "Translate the text to English:\n" + text
input_ids = tokenizer.apply_chat_template(
[{"role": "user", "content": prompt}],
add_generation_prompt=True,
return_tensors="pt",
tokenize=True,
).to(model.device)
output = model.generate(
input_ids,
max_new_tokens=2048,
# Greedy Search
do_sample=False,
repetition_penalty=1.05,
# Sampling
# do_sample=True,
# temperature=0.1,
# # top_k=1,
# min_p=0.9,
# repetition_penalty=1.05,
)
prompt_len = input_ids.shape[1]
generated_tokens = output[:, prompt_len:]
translated_text = tokenizer.batch_decode(
generated_tokens, skip_special_tokens=True
)[0]
return translated_text.strip()
@spaces.GPU
def inference_text(text, progress=gr.Progress()):
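    """Split the pasted text into non-empty lines and translate each one, returning a Polars DataFrame."""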
if not text:
raise gr.Error("Please paste your text.")
progress(0, desc="Translating...")
results = []
    non_empty_sentences = [s.strip() for s in text.split("\n") if s.strip()]
for sentence in progress.tqdm(
non_empty_sentences, desc="Translating...", unit="sentence"
):
t0 = time.time()
translated_text = translate(sentence)
elapsed_time = round(time.time() - t0, 2)
results.append(
{
"sentence": sentence,
"translated_text": translated_text,
"elapsed_time": elapsed_time,
}
)
gr.Info("Finished!", duration=2)
return pl.DataFrame(results)
@spaces.GPU
def inference_audio(audio, progress=gr.Progress()):
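    """Transcribe the uploaded audio with the CTC model, then translate the transcript line by line."""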
if not audio:
raise gr.Error("Please paste your audio file.")
progress(0, desc="Translating...")
meta = torchaudio.info(audio)
duration = meta.num_frames / meta.sample_rate
if duration < min_duration:
raise gr.Error(
f"The duration of the file is less than {min_duration} seconds, it is {round(duration, 2)} seconds."
)
if duration > max_duration:
raise gr.Error(f"The duration of the file exceeds {max_duration} seconds.")
audio_input, sr = torchaudio.load(audio)
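    # Downmix multi-channel audio to mono.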
if meta.num_channels > 1:
audio_input = torch.mean(audio_input, dim=0, keepdim=True)
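    # The ASR model expects 16 kHz audio; resample anything else.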
if meta.sample_rate != 16_000:
resampler = T.Resample(sr, 16_000, dtype=audio_input.dtype)
audio_input = resampler(audio_input)
audio_input = audio_input.squeeze().numpy()
features = processor([audio_input], sampling_rate=16_000).input_features
features = torch.tensor(features).to(device, dtype=torch_dtype)
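    # Run CTC inference and take the most likely token at each frame.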
with torch.inference_mode():
logits = audio_model(features).logits
predicted_ids = torch.argmax(logits, dim=-1)
predictions = processor.batch_decode(predicted_ids)
print("Predictions:", predictions)
if not predictions:
text = "-"
else:
text = "\n".join(predictions)
print("Text:", text)
results = []
    non_empty_sentences = [s.strip() for s in text.split("\n") if s.strip()]
for sentence in progress.tqdm(
non_empty_sentences, desc="Translating...", unit="sentence"
):
t0 = time.time()
translated_text = translate(sentence)
elapsed_time = round(time.time() - t0, 2)
results.append(
{
"sentence": sentence,
"translated_text": translated_text,
"elapsed_time": elapsed_time,
}
)
gr.Info("Finished!", duration=2)
return pl.DataFrame(results)
@spaces.GPU
def inference_image(image, progress=gr.Progress()):
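    """Recognize Ukrainian text in the image with PaddleOCR and translate it."""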
    if not image:
        raise gr.Error("Please upload an image file.")
    progress(0, desc="Translating...")
    if not isinstance(image, str):
        raise gr.Error("Unsupported image input, please upload an image file.")
predictions = ocr_model.predict(image)
results = []
    for prediction in predictions:
        results.append(" ".join(prediction["rec_texts"]))
    text = " ".join(results)
print("Text:", text)
results = []
sentences = [text]
for sentence in progress.tqdm(sentences, desc="Translating...", unit="sentence"):
t0 = time.time()
translated_text = translate(sentence)
elapsed_time = round(time.time() - t0, 2)
results.append(
{
"sentence": sentence,
"translated_text": translated_text,
"elapsed_time": elapsed_time,
}
)
gr.Info("Finished!", duration=2)
return pl.DataFrame(results)
def create_app():
tab = gr.Blocks(
title=title,
analytics_enabled=False,
theme=current_theme,
)
with tab:
gr.Markdown(description_head)
gr.Markdown("## Usage")
translated_text = gr.DataFrame(
label="Translated text",
)
text = gr.Textbox(label="Text", autofocus=True, lines=5)
gr.Button("Translate").click(
inference_text,
concurrency_limit=concurrency_limit,
inputs=text,
outputs=translated_text,
)
with gr.Row():
gr.Examples(label="Choose an example", inputs=text, examples=examples_text)
return tab
def create_audio_app():
with gr.Blocks(theme=current_theme) as tab:
gr.Markdown(description_head)
gr.Markdown("## Usage")
translated_text = gr.DataFrame(
label="Translated text",
)
audio = gr.Audio(label="Audio file", sources="upload", type="filepath")
gr.Button("Translate").click(
inference_audio,
concurrency_limit=concurrency_limit,
inputs=audio,
outputs=translated_text,
)
with gr.Row():
gr.Examples(
label="Choose an example", inputs=audio, examples=examples_audio
)
gr.Markdown(
f"> Due to resource limitations, audio duration **must not** exceed **{max_duration}** seconds."
)
return tab
def create_image_app():
with gr.Blocks(theme=current_theme) as tab:
gr.Markdown(description_head)
gr.Markdown("## Usage")
translated_text = gr.DataFrame(
label="Translated text",
)
image = gr.Image(label="Image file", sources="upload", type="filepath")
gr.Button("Translate").click(
inference_image,
concurrency_limit=concurrency_limit,
inputs=image,
outputs=translated_text,
)
with gr.Row():
gr.Examples(
label="Choose an example", inputs=image, examples=examples_image
)
return tab
def create_env():
with gr.Blocks(theme=current_theme) as tab:
gr.Markdown(tech_env)
gr.Markdown(tech_libraries)
return tab
def create_authors():
with gr.Blocks(theme=current_theme) as tab:
gr.Markdown(authors_table)
return tab
def create_demo():
app_tab = create_app()
app_audio_tab = create_audio_app()
app_image_tab = create_image_app()
authors_tab = create_authors()
env_tab = create_env()
return gr.TabbedInterface(
[app_tab, app_audio_tab, app_image_tab, authors_tab, env_tab],
tab_names=[
"✍️ Text",
"🔊 Audio",
"👀 Image",
"👥 Authors",
"📦 Environment, Models, and Libraries",
],
)
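# Build the tabbed demo, enable request queuing, and start the Gradio server.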
if __name__ == "__main__":
demo = create_demo()
demo.queue()
demo.launch()