File size: 2,513 Bytes
59fdc4b
 
 
 
 
 
 
dfc2219
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59fdc4b
 
 
dfc2219
 
 
 
a18ecbd
dfc2219
 
59fdc4b
4dace66
59fdc4b
 
 
 
 
 
dfc2219
 
 
59fdc4b
 
 
4dace66
dfc2219
 
4dace66
dfc2219
 
 
 
 
4dace66
59fdc4b
 
 
dfc2219
 
 
59fdc4b
 
dfc2219
59fdc4b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import streamlit as st
from transformers import AutoModelForSeq2SeqLM, NllbTokenizer

translation_model_name = 'sarahai/nllb-uzbek-cyrillic-to-russian'


@st.cache_resource
def _load_translation_resources(model_name):
    """Load the seq2seq model and its NLLB tokenizer once per server process.

    Streamlit re-runs this script from top to bottom on every widget
    interaction; without caching, the checkpoint would be re-loaded on each
    rerun. ``st.cache_resource`` keeps a single shared instance instead.

    Args:
        model_name: Hub identifier or local path of the checkpoint.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    tokenizer = NllbTokenizer.from_pretrained(model_name)
    return model, tokenizer


translation_model, translation_tokenizer = _load_translation_resources(translation_model_name)

def split_into_chunks(text, tokenizer, max_length=150):
    """Cut *text* into consecutive pieces of at most *max_length* tokens.

    The text is tokenized with ``tokenizer.tokenize``; tokens are grouped in
    order and each group is turned back into a string with
    ``tokenizer.convert_tokens_to_string``. Boundaries are purely
    token-count based, so a piece may end in the middle of a sentence.

    Args:
        text: The string to split.
        tokenizer: Object providing ``tokenize`` and
            ``convert_tokens_to_string``.
        max_length: Number of tokens after which a piece is closed.

    Returns:
        A list of strings, one per piece; empty when no tokens are produced.
    """
    pieces = []
    pending = []
    for piece_token in tokenizer.tokenize(text):
        pending.append(piece_token)
        if len(pending) >= max_length:
            # Piece is full: flush it and start collecting the next one.
            pieces.append(tokenizer.convert_tokens_to_string(pending))
            pending = []
    # Whatever is left over forms the final, shorter piece.
    if pending:
        pieces.append(tokenizer.convert_tokens_to_string(pending))
    return pieces

def translate(text, model, tokenizer, src_lang='uzb_Cyrl', tgt_lang='rus_Cyrl',
              max_input_tokens=128):
    """Translate *text* chunk by chunk and join the results with spaces.

    The text is split with ``split_into_chunks`` so that every chunk fits
    into the encoder input limit, each chunk is translated independently
    with ``model.generate``, and the decoded outputs are joined by a single
    space.

    Args:
        text: Source text to translate.
        model: Seq2seq model exposing ``generate``.
        tokenizer: Tokenizer matching *model*; its ``src_lang`` and
            ``tgt_lang`` attributes are overwritten by this call.
        src_lang: Source language code assigned to the tokenizer.
        tgt_lang: Target language code assigned to the tokenizer and, when
            the tokenizer knows it, forced as the first generated token.
        max_input_tokens: Truncation limit applied when encoding a chunk.

    Returns:
        The translated text; an empty string when *text* yields no tokens.
    """
    tokenizer.src_lang = src_lang
    tokenizer.tgt_lang = tgt_lang

    # Chunks must be smaller than the truncation limit: encoding adds special
    # tokens on top of the chunk's own tokens, and re-tokenizing the decoded
    # chunk text may not reproduce exactly the same token count. Without this
    # margin the tail of every full chunk would be silently truncated away.
    chunk_tokens = max(1, max_input_tokens - 8)

    # Pin the output language as recommended for NLLB models. Skip it when
    # the tokenizer does not know the code, so an <unk> is never forced.
    generate_kwargs = {}
    tgt_token_id = tokenizer.convert_tokens_to_ids(tgt_lang)
    if tgt_token_id is not None and tgt_token_id != tokenizer.unk_token_id:
        generate_kwargs['forced_bos_token_id'] = tgt_token_id

    translated_chunks = []
    for chunk in split_into_chunks(text, tokenizer, max_length=chunk_tokens):
        inputs = tokenizer(chunk, return_tensors='pt', padding=True,
                           truncation=True, max_length=max_input_tokens)
        # Pass the whole encoding so generate() also gets the attention mask.
        outputs = model.generate(**inputs, **generate_kwargs)
        translated_chunks.append(tokenizer.decode(outputs[0], skip_special_tokens=True))
    return ' '.join(translated_chunks)


# Inject page-wide CSS that defines the .big-font and .small-font helper
# classes. The markup is passed with unsafe_allow_html=True so the <style>
# element is emitted as HTML. Only .big-font is referenced by the headings
# rendered in the visible part of this script.
st.markdown("""
<style>
.big-font {
    font-size:30px !important;
    font-weight: bold;
}
.small-font {
    font-size:18px !important;
}
</style>
""", unsafe_allow_html=True)


# Sidebar: title, text-file uploader and, once a file is present, the
# button that triggers translation.
st.sidebar.markdown('## Навигация')
uploaded_file = st.sidebar.file_uploader(
    "Загрузите текстовый файл...",
    type=["txt"],
)

if uploaded_file:
    st.sidebar.text("Файл загружен")
    # True only on the rerun triggered by clicking the button.
    process_btn = st.sidebar.button("Перевести")
else:
    # No upload yet, so the translate button is not shown at all.
    process_btn = False


# Page header, styled through the .big-font CSS class.
st.markdown('<h1 class="big-font">Перевод текста</h1>', unsafe_allow_html=True)
st.markdown('<div class="big-font">Перевод с узбекского на русский</div>', unsafe_allow_html=True)

# Translate only on the rerun in which the button was clicked.
if process_btn and uploaded_file:
    try:
        # getvalue() returns the whole upload regardless of the current read
        # position; 'utf-8-sig' also strips a leading BOM so it does not leak
        # into the text sent to the model.
        uploaded_text = uploaded_file.getvalue().decode('utf-8-sig')
    except UnicodeDecodeError:
        # Show a readable message instead of a traceback for non-UTF-8 files.
        uploaded_text = None
        st.error("Не удалось прочитать файл: ожидается текст в кодировке UTF-8.")

    if uploaded_text is not None:
        st.text_area("Исходный текст", uploaded_text, height=150)

        with st.spinner('Переводим...'):
            translated_text = translate(uploaded_text, translation_model, translation_tokenizer)
            st.text_area("Переведенный текст (на русском):", value=translated_text, height=200)