Spaces:
Sleeping
Sleeping
File size: 2,513 Bytes
59fdc4b dfc2219 59fdc4b dfc2219 a18ecbd dfc2219 59fdc4b 4dace66 59fdc4b dfc2219 59fdc4b 4dace66 dfc2219 4dace66 dfc2219 4dace66 59fdc4b dfc2219 59fdc4b dfc2219 59fdc4b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 |
import streamlit as st
from transformers import AutoModelForSeq2SeqLM, NllbTokenizer
# Hugging Face Hub id of the translation checkpoint used by this app.
translation_model_name = 'sarahai/nllb-uzbek-cyrillic-to-russian'


@st.cache_resource
def _load_translation_resources(model_name):
    """Load the seq2seq model and its NLLB tokenizer for *model_name*.

    Streamlit re-executes the whole script on every widget interaction.
    Caching the loader means the model and tokenizer are instantiated once
    per server process instead of on every rerun.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained``.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    tokenizer = NllbTokenizer.from_pretrained(model_name)
    return model, tokenizer


# Module-level names are kept so the rest of the script can keep using them.
translation_model, translation_tokenizer = _load_translation_resources(translation_model_name)
def split_into_chunks(text, tokenizer, max_length=150):
    """Split *text* into strings that each cover at most ``max_length`` tokens.

    The text is tokenized with ``tokenizer.tokenize``, the token sequence is
    cut into consecutive windows, and every window is converted back to a
    string with ``tokenizer.convert_tokens_to_string``.

    Args:
        text: The text to split.
        tokenizer: Object providing ``tokenize`` and
            ``convert_tokens_to_string``.
        max_length: Number of tokens per chunk; the last chunk may be shorter.

    Returns:
        The chunk strings in their original order, or an empty list when the
        tokenizer produces no tokens.
    """
    pieces = list(tokenizer.tokenize(text))
    # A chunk is closed only after a token has been added to it, so every
    # chunk holds at least one token even when max_length is below 1.
    window = max(1, max_length)
    return [
        tokenizer.convert_tokens_to_string(pieces[start:start + window])
        for start in range(0, len(pieces), window)
    ]
def translate(text, model, tokenizer, src_lang='uzb_Cyrl', tgt_lang='rus_Cyrl'):
    """Translate *text* chunk by chunk and return the joined translation.

    The text is split with ``split_into_chunks``, each chunk is encoded and
    passed to ``model.generate``, and the decoded outputs are joined with
    single spaces.

    Args:
        text: Source text to translate.
        model: Seq2seq model exposing ``generate``.
        tokenizer: Tokenizer matching *model*; its ``src_lang`` and
            ``tgt_lang`` attributes are overwritten by this call.
        src_lang: Language code of the input text.
        tgt_lang: Language code the output should be generated in.

    Returns:
        The translated text, or an empty string when *text* yields no chunks.
    """
    # Hard cap enforced by the tokenizer call below (special tokens included).
    max_input_tokens = 128
    # Chunks are cut shorter than the cap so that truncation never discards
    # content: encoding adds special tokens, and re-tokenizing a decoded chunk
    # can produce a few more tokens than the chunk was cut from.
    chunk_tokens = max_input_tokens - 8

    tokenizer.src_lang = src_lang
    tokenizer.tgt_lang = tgt_lang

    # Setting tokenizer.tgt_lang alone does not steer generation; the target
    # language is selected by forcing its code as the first generated token.
    generate_kwargs = {}
    tgt_lang_id = tokenizer.convert_tokens_to_ids(tgt_lang)
    if tgt_lang_id is not None and tgt_lang_id != getattr(tokenizer, 'unk_token_id', None):
        generate_kwargs['forced_bos_token_id'] = tgt_lang_id

    translated_chunks = []
    for chunk in split_into_chunks(text, tokenizer, max_length=chunk_tokens):
        inputs = tokenizer(
            chunk,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=max_input_tokens,
        )
        # Pass the whole encoding so the attention mask reaches generate().
        outputs = model.generate(**inputs, **generate_kwargs)
        translated_chunks.append(tokenizer.decode(outputs[0], skip_special_tokens=True))
    return ' '.join(translated_chunks)
# Page-wide CSS classes referenced by the HTML headings below.
st.markdown("""
<style>
.big-font {
font-size:30px !important;
font-weight: bold;
}
.small-font {
font-size:18px !important;
}
</style>
""", unsafe_allow_html=True)

# Sidebar: file upload, plus a translate button shown once a file is present.
st.sidebar.markdown('## Навигация')
uploaded_file = st.sidebar.file_uploader("Загрузите текстовый файл...", type=["txt"])

process_btn = False
if uploaded_file:
    st.sidebar.text("Файл загружен")
    process_btn = st.sidebar.button("Перевести")

# Main page headings.
st.markdown('<h1 class="big-font">Перевод текста</h1>', unsafe_allow_html=True)
st.markdown('<div class="big-font">Перевод с узбекского на русский</div>', unsafe_allow_html=True)

if process_btn and uploaded_file:
    try:
        # getvalue() returns the full upload regardless of the stream
        # position; 'utf-8-sig' also strips a leading byte-order mark.
        uploaded_text = uploaded_file.getvalue().decode('utf-8-sig')
    except UnicodeDecodeError:
        # Report an unreadable file in the UI instead of raising a traceback.
        st.error("Не удалось прочитать файл: ожидается текст в кодировке UTF-8.")
    else:
        st.text_area("Исходный текст", uploaded_text, height=150)
        if uploaded_text.strip():
            with st.spinner('Переводим...'):
                translated_text = translate(uploaded_text, translation_model, translation_tokenizer)
            st.text_area("Переведенный текст (на русском):", value=translated_text, height=200)
        else:
            # Nothing to send to the model for an empty or whitespace-only file.
            st.warning("Файл пуст — переводить нечего.")
|