In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="3"
import torch
torch.cuda.set_device(0)
torch.cuda.current_device()

0

In [2]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

from accelerate import Accelerator

class Summarizer:
    def __init__(self, device="cpu"):
        model_name = "sarahai/ruT5-base-summarizer"
        self.device = device
        self.tokenizer  = T5Tokenizer.from_pretrained(model_name, device_map=device)
        self.model = T5ForConditionalGeneration.from_pretrained(model_name, device_map=device)

    def summarize(self, text, max_length=100, min_length=50, num_beams=5):
        input_ids = self.tokenizer(text, return_tensors="pt").input_ids.to(self.device)
        # input_ids = torch.nn.utils.rnn.pad_sequence(input_ids.squeeze().chunk(chunk_num, 0), batch_first=True, padding_value=-100)
        outputs = self.model.generate(input_ids, max_length=max_length,
                                      min_length=min_length,
                                      length_penalty=2.0,
                                      num_beams=num_beams, early_stopping=True)

        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

    def generate(self, indexes, max_length=100, min_length=50, num_beams=5):
        return self.model.generate(indexes.unsqueeze(0), max_length=max_length,
                                      min_length=min_length,
                                      length_penalty=2.0,
                                      num_beams=num_beams, early_stopping=True).squeeze()

In [3]:
# summarizer = Summarizer("cuda:3")

In [51]:
from bs4 import BeautifulSoup
from pydantic import BaseModel
import requests
from typing import Optional

class Pager(BaseModel):
    title: str
    text: str
    original_tags: list[str]

def is_valid_page(url):
    return True

def get_pager(url)->Optional[Pager]:
    try:
        req = requests.get(url)
        soup = BeautifulSoup(req.text, 'lxml')
        query = soup.find("div", class_="article-formatted-body")
        title = soup.title.string
        tags = []
        for tag in soup.find_all("meta"):
            if tag.get("name", None) == "keywords":
                tags = [x for x in re.split(',| ', tag["content"]) if len(x) > 0]
        return Pager(title=title, text=query.get_text(), original_tags=tags)
    except:
        return None

In [52]:
import re

habr_prefix = "https://habr.com/ru/articles/"
article_prefix = "https://habr.com/ru/"

articles_urls = []


for page_suffix in [""] + [f"page{i}/" for i in range(50)]:
    page_url = habr_prefix + page_suffix
    src = requests.get(page_url).text
    for article_suffix in set(re.findall('articles/\d+/', src)):
        articles_urls.append(article_prefix + article_suffix)

articles_urls

['https://habr.com/ru/articles/897282/',
 'https://habr.com/ru/articles/897472/',
 'https://habr.com/ru/articles/891132/',
 'https://habr.com/ru/articles/897224/',
 'https://habr.com/ru/articles/897636/',
 'https://habr.com/ru/articles/897496/',
 'https://habr.com/ru/articles/897630/',
 'https://habr.com/ru/articles/897518/',
 'https://habr.com/ru/articles/897640/',
 'https://habr.com/ru/articles/897574/',
 'https://habr.com/ru/articles/897632/',
 'https://habr.com/ru/articles/891488/',
 'https://habr.com/ru/articles/896972/',
 'https://habr.com/ru/articles/897624/',
 'https://habr.com/ru/articles/897534/',
 'https://habr.com/ru/articles/897620/',
 'https://habr.com/ru/articles/897654/',
 'https://habr.com/ru/articles/897648/',
 'https://habr.com/ru/articles/897642/',
 'https://habr.com/ru/articles/897634/',
 'https://habr.com/ru/articles/897282/',
 'https://habr.com/ru/articles/897472/',
 'https://habr.com/ru/articles/891132/',
 'https://habr.com/ru/articles/897224/',
 'https://habr.c

In [53]:
len(articles_urls)

1000

In [54]:
import time
from tqdm import tqdm

pagers = []

for url in tqdm(articles_urls):
    pagers.append(get_pager(url))
    

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1000/1000 [09:35<00:00,  1.74it/s]


In [55]:
pagers = [x for x in pagers if x is not None]
len(pagers)

1000

In [56]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
stop_words = set(stopwords.words('russian'))

stop_words

for pager in pagers:
    pager.original_tags = list(set([x for x in pager.original_tags if x not in stop_words]))

In [57]:
# summarizer = Summarizer("cuda")

for pager in pagers:
    pager.text = pager.text[:3000]

In [58]:
import pandas as pd

df = pd.DataFrame([pager.dict() for pager in pagers])
df

/tmp/ipykernel_57433/3926614870.py:3: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  df = pd.DataFrame([pager.dict() for pager in pagers])


Unnamed: 0,title,text,original_tags
0,–†–∞—Å–∫—Ä–∞—Å–∫–∞ –ª–∏—Å—Ç–∏–Ω–≥–∞ –ø—Ä–æ—Ü–µ–¥—É—Ä—ã T-SQL –∑–Ω–∞—á–µ–Ω–∏—è–º–∏ ...,"–°—Ä–∞–∑—É –ø–æ–∫–∞–∂—É, –æ —á–µ–º –∏–¥–µ—Ç —Ä–µ—á—å, —á—Ç–æ–±—ã –≤—ã —Ä–µ—à–∏–ª–∏...","[sql, tsql, markup, profiler, performance]"
1,–ò—Å–∫—É—Å—Å—Ç–≤–µ–Ω–Ω—ã–π –∏–Ω—Ç–µ–ª–ª–µ–∫—Ç –∏ –∞–ª–≥–æ—Ä–∏—Ç–º—ã –≤ —ç–Ω–µ—Ä–≥–µ—Ç–∏...,–≠–Ω–µ—Ä–≥–µ—Ç–∏—á–µ—Å–∫–∏–µ —Å–∏—Å—Ç–µ–º—ã ‚Äî –æ–¥–Ω–∏ –∏–∑ —Å–∞–º—ã—Ö —Å–ª–æ–∂–Ω—ã—Ö...,"[–∞–ª–≥–æ—Ä–∏—Ç–º—ã, –∏–∏, –∏—Ç, –∏–Ω—Ç–µ–ª–ª–µ–∫—Ç, –ª—ç–ø, —ç–Ω–µ—Ä–≥–µ—Ç–∏–∫–∞..."
2,¬´–≠—Ç–∏ —Ñ–∏–ª—å–º—ã –±—ã–ª–∏ —É–∂–∞—Å–Ω—ã¬ª ‚Äî –∫–æ—Ä–æ—Ç–∫–∏–π —Ä–∞—Å—Å–∫–∞–∑ –æ ...,–ö–æ–Ω—Ü–µ–ø—Ç-–∞—Ä—Ç –∫ ¬´–ö—Å–µ–Ω–æ–≥–µ–Ω–µ–∑–∏—Å—É¬ª (1978) ‚Äî –∫ —Ç–∞–∫ –∏...,"[–∫–∏–Ω–æ, —Ä–∏—Å–æ–≤–∞–Ω–∏–µ, —Ñ–∏–ª—å–º—ã, —Ñ–∞–Ω—Ç–∞—Å—Ç–∏–∫–∞, —ç—Ñ—Ñ–µ–∫—Ç—ã,..."
3,"–ß—Ç–æ –±—É–¥–µ—Ç, –µ—Å–ª–∏ –Ω–µ –∏—Å–ø–æ–ª—å–∑–æ–≤–∞—Ç—å TCP –∏–ª–∏ UDP? /...","–ö–æ–º–º—É—Ç–∞—Ç–æ—Ä—ã, –º–∞—Ä—à—Ä—É—Ç–∏–∑–∞—Ç–æ—Ä—ã, –±—Ä–∞–Ω–¥–º–∞—É—ç—Ä—ã ‚Äî –≤—Å–µ...","[—ç–∫—Å–ø–µ—Ä–∏–º–µ–Ω—Ç, –¥–∞–Ω–Ω—ã—Ö, tcp, –ø–µ—Ä–µ–¥–∞—á–∞, –ø—Ä–æ—Ç–æ–∫–æ–ª—ã..."
4,–ü–æ—á–µ–º—É —Ç—Ä–µ–Ω–¥ –Ω–∞ –∞—ç—Ä–æ—à–æ—Å—Å–µ—Ä—ã –≤–æ–∑–≤—Ä–∞—â–∞—é—Ç—Å—è / –•–∞–±—Ä,–ü–æ—á–µ–º—É –∞—ç—Ä–æ—à–æ—Å—Å–µ—Ä—ã –≤–æ–∑–≤—Ä–∞—â–∞—é—Ç—Å—è –≤ 2025 –≥–æ–¥—É–ù–µ—Å...,"[—Å–∫–æ—Ä–æ—Å—Ç—å, –≤–µ—Å, —à–æ—Å—Å–µ–π–Ω—ã–π, –∞—ç—Ä–æ—Ç—Ä—É–±–∞, –≤–µ–ª–æ—Å–∏–ø–µ..."
...,...,...,...
995,Service Mesh –≤ –¥–∏–∫–æ–π –ø—Ä–∏—Ä–æ–¥–µ –∏–ª–∏ –∫–∞–∫ –Ω–µ —Å—Ç–∞—Ç—å ...,–í–≤–µ–¥–µ–Ω–∏–µ–£–≥—Ä–æ–∑—ã –±–µ–∑–æ–ø–∞—Å–Ω–æ—Å—Ç–∏ –≤ Service Mesh1. –û...,"[–±–µ–∑–æ–ø–∞—Å–Ω–æ—Å—Ç—å, –ø—Ä–∏–ª–æ–∂–µ–Ω–∏–π, –º–∏–∫—Ä–æ—Å–µ—Ä–≤–∏—Å—ã, –º–∏–∫—Ä–æ..."
996,Apple Pro Weekly News (17.03 ‚Äì 23.03.25) / –•–∞–±—Ä,"–ß—Ç–æ —Å–ª–æ–º–∞–ª–∏ –≤ –ø–æ—Å–ª–µ–¥–Ω–µ–º –æ–±–Ω–æ–≤–ª–µ–Ω–∏–∏ iOS, –æ—Ç —á–µ–≥...","[Apple, Siri, iMazing, iPhone, App, iPad, iOS,..."
997,–ö–∞–∫ –Ω–µ –∑–∞–≤—è–∑–Ω—É—Ç—å –≤ –±–æ–ª–æ—Ç–µ —Ä—É—Ç–∏–Ω—ã –∏ –æ—Å—Ç–∞–≤–∞—Ç—å—Å—è ...,"–° —Ç–æ–≥–æ –º–æ–º–µ–Ω—Ç–∞, –∫–∞–∫ —è –Ω–∞—á–∞–ª —Ä–∞–±–æ—Ç–∞—Ç—å IT –º–µ–Ω–µ–¥–∂...","[–∫–æ–º–∞–Ω–¥–æ–π, –ø—Ä–æ–µ–∫—Ç–æ–≤, —Ä—É–∫–æ–≤–æ–¥—Å—Ç–≤–æ, –ø—Ä–æ–µ–∫—Ç–∞–º–∏, –ª..."
998,–†–µ–ª–∏–∑ Linux 6.14 / –•–∞–±—Ä,24 –º–∞—Ä—Ç–∞ 2025 –≥–æ–¥–∞ –õ–∏–Ω—É—Å –¢–æ—Ä–≤–∞–ª—å–¥—Å –ø—Ä–µ–¥—Å—Ç–∞–≤–∏–ª ...,"[6.14, Linux, —Ç–æ—Ä–≤–∞–ª—å–¥—Å]"


In [62]:
df.to_csv("pagers.csv")

In [4]:
model_name = "sarahai/ruT5-base-summarizer"
tokenizer = T5Tokenizer.from_pretrained(model_name, device_map="cpu")

In [5]:
import pandas as pd
df = pd.read_csv("pagers.csv")

In [6]:
from torch.nn.utils.rnn import pad_sequence

class PagerDataset:
    def __init__(self, device="cpu", pagers=df):
        model_name = "sarahai/ruT5-base-summarizer"
        self.device = device
        self.tokenizer  = T5Tokenizer.from_pretrained(model_name, device_map=device)
        self.texts = [self.tokenizer(pager, return_tensors="pt").input_ids[0] for pager in pagers['text']]
        self.tags = [self.tokenizer(' '.join(eval(pager)), return_tensors="pt").input_ids[0] for pager in pagers['original_tags']]

    def __len__(self):
        return len(self.tags)

    def __getitem__(self, index):
        return self.texts[index], self.tags[index]

class PagerCollator:
    @staticmethod
    def collate_tokens(tokens_batch, padding_value):
        return pad_sequence(tokens_batch, batch_first=True, padding_value=padding_value).to(torch.long)

    def __call__(self, batch):
        values = [item[0] for item in batch]
        labels = [item[1] for item in batch]
        input_values = PagerCollator.collate_tokens(values, 0)
        mask = (input_values == PagerCollator.collate_tokens(values, 1)).to(torch.long)
        labels = PagerCollator.collate_tokens(labels, -100)

        return {
            'input_ids': input_values, 
            'labels': labels,
            'attention_mask': mask
        }


In [7]:
threshold = int(len(df) * 0.9)

train_set = df.sample(threshold)


train = PagerDataset(pagers=train_set)
test = PagerDataset(pagers=df.drop(index = train_set.index))

In [8]:
train_set['text']

45     –ü—Ä–∏–≤–µ—Ç! –ú–µ–Ω—è –∑–æ–≤—É—Ç –ò—Ä–∏–Ω–∞, —É–∂–µ –±–æ–ª–µ–µ –ø—è—Ç–∏ –ª–µ—Ç —è...
562    –í—Å–µ–º –ø—Ä–∏–≤–µ—Ç! –ù–∞ —Å–≤—è–∑–∏ –ù–∏–∫–æ–ª–∞–π –ï–¥–æ–º—Å–∫–∏–π, —Ä—É–∫–æ–≤–æ...
448    –í –æ–¥–Ω–æ–º –∏–∑ –±–æ–ª—å—à–∏—Ö –∫–ª–∞—Å—Ç–µ—Ä–æ–≤ S3 –≤ –¢–æ—á–∫–µ —Ö—Ä–∞–Ω–∏—Ç...
727    –ú–æ—è –∏—Å—Ç–æ—Ä–∏—è –ø—Ä–æ—Å—Ç–∞ –∫–∞–∫ —Ç—Ä–∏ –∫–æ–ø–µ–π–∫–∏. –í –∫–æ–Ω—Ü–µ 20...
282    –ù–µ–¥–∞–≤–Ω–æ –≤ –û–±—â–µ—Å—Ç–≤–µ–Ω–Ω–æ–π –ø–∞–ª–∞—Ç–µ –†–æ—Å—Å–∏–∏ —Å–æ—Å—Ç–æ—è–ª—Å—è...
                             ...                        
802    Arcana OPS ‚Äî —ç—Ç–æ —Å–æ–≤—Ä–µ–º–µ–Ω–Ω—ã–π –ø–æ–¥—Ö–æ–¥ –∫ —É–ø—Ä–∞–≤–ª–µ–Ω...
2      –ö–æ–Ω—Ü–µ–ø—Ç-–∞—Ä—Ç –∫ ¬´–ö—Å–µ–Ω–æ–≥–µ–Ω–µ–∑–∏—Å—É¬ª (1978) ‚Äî –∫ —Ç–∞–∫ –∏...
583    –ü–æ –ø—Ä–æ–≥–Ω–æ–∑—É Gartner, –∑–∞–ø—Ä–æ—Å—ã –Ω–∞ –µ—Å—Ç–µ—Å—Ç–≤–µ–Ω–Ω–æ–º —è...
100    –í—Å–µ —Å–æ–≤—Ä–µ–º–µ–Ω–Ω—ã–µ —Å—Ä–µ–¥—Å—Ç–≤–∞ —Ä–∞–∑—Ä–∞–±–æ—Ç–∫–∏¬†‚Äî –ø—Ä–∞–∫—Ç–∏—á–µ...
470    \n–Ø —Å–ª—É—à–∞—é –∞—É–¥–∏–æ–∫–Ω–∏–≥–∏ —Å 2014 –≥–æ–¥–∞. –ó–∞ —ç—Ç–æ –≤—Ä–µ–º...
Name: text, Lengt

In [9]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

model_name = "sarahai/ruT5-base-summarizer"
model = T5ForConditionalGeneration.from_pretrained(model_name, device_map="cuda")

model.resize_token_embeddings(len(tokenizer))

Embedding(32100, 768)

In [10]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="test",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    evaluation_strategy="steps",
    max_steps=4000,
    fp16=False,
    save_steps=2000,
    eval_steps=32,
    logging_steps=30,
    learning_rate=1e-3,
    weight_decay=0.007,
    warmup_steps=250,
    gradient_checkpointing=True,
    report_to=[],
)



In [11]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    data_collator=PagerCollator(),
    args=training_args,
    train_dataset=train,
    eval_dataset=test,
)

In [12]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss,Validation Loss
32,5.4164,4.127873
64,4.426,4.02282
96,4.4103,3.992565
128,4.2236,4.14772
160,4.01,4.239121
192,4.0416,4.379041
224,4.2064,4.391558
256,3.9877,4.585879
288,3.9088,4.618155
320,4.0041,4.647362


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


TrainOutput(global_step=4000, training_loss=0.8493731454242952, metrics={'train_runtime': 5394.6282, 'train_samples_per_second': 5.932, 'train_steps_per_second': 0.741, 'total_flos': 2.281669114761216e+16, 'train_loss': 0.8493731454242952, 'epoch': 35.39823008849557})

In [16]:
result = model.generate(test[2][0].unsqueeze(0).to("cuda"), max_length=20,
                                      min_length=20,
                                      length_penalty=2.0,
                                      num_beams=10, early_stopping=False)

In [17]:
tokenizer.decode(test[2][0], skip_special_tokens=True)

'–û–¥–Ω–∞–∂–¥—ã –∑–∏–º–Ω–∏–º –≤–µ—á–µ—Ä–æ–º —Ç—É—Å–∏–ª–∏ –≤ —Ö–æ—Ä–æ—à–µ–π –¥—Ä—É–∂–µ—Å–∫–æ–π –∫–æ–º–ø–∞–Ω–∏–∏ –∏ –º–µ–∂–¥—É –¥–µ–ª–æ–º –∑–∞—Ç—Ä–æ–Ω—É–ª–∏ —Ç–µ–º—É –ø—Ä–æ ¬´—Å–µ—Ä—ã–µ –±—É–¥–Ω–∏ –ò–¢-—à–Ω–∏–∫–∞¬ª. –ü—Ä–æ —Ç–æ, —á—Ç–æ –º–Ω–æ–≥–∏–µ –∫–æ–Ω—Ü–µ–ø—Ç—É–∞–ª—å–Ω—ã–µ –≤–µ—â–∏ (–∞ –Ω–µ —á–∏—Å—Ç–æ —Ä–µ–º–µ—Å–ª–µ–Ω–Ω—ã–µ) –ø–ª–æ—Ö–æ –≥—É–≥–ª—è—Ç—Å—è –∏ –≤—ã–∑—ã–≤–∞—é—Ç —Å—Ç—É–ø–æ—Ä —É –±–æ–ª—å—à–∏–Ω—Å—Ç–≤–∞ –Ω–∞—á–∏–Ω–∞—é—â–∏—Ö –ø—Ä–æ—Ü–µ—Å—Å–Ω—ã—Ö –∞–Ω–∞–ª–∏—Ç–∏–∫–æ–≤ –∏ —Ä–∞–∑—Ä–∞–±–æ—Ç—á–∏–∫–æ–≤. –ù—É –∏ –∫–∞–∫-—Ç–æ —Ç–∞–∫, —Å–ª–æ–≤–æ –∑–∞ —Å–ª–æ–≤–æ, –ø—Ä–∏—à–ª–∏ –∫ –∏–¥–µ–µ –æ —Ç–æ–º, —á—Ç–æ–±—ã –±—ã–ª–æ –±—ã –∑–¥–æ—Ä–æ–≤–æ —Ä–∞–∑–±–∞–≤–∏—Ç—å —è—Ä–∫–∏–º–∏ –∫—Ä–∞—Å–∫–∞–º–∏ –∏ —ç–º–æ—Ü–∏–æ–Ω–∞–ª—å–Ω—ã–º–∏ –ø–µ—Ä–µ–∂–∏–≤–∞–Ω–∏—è–º–∏ —Å—É—Ö–æ–π —è–∑—ã–∫ –∏ –≤–µ—Å—å–º–∞ —É—Å–ª–æ–≤–Ω—ã–µ –ø–∏–∫—Ç–æ–≥—Ä–∞–º–º—ã –ø—Ä–æ–º—ã—à–ª–µ–Ω–Ω—ã—Ö —Å–ø–µ—Ü–∏—Ñ–∏–∫–∞—Ü–∏–π. –¢–∞–∫ –ø–æ—è–≤–∏–ª—Å—è –Ω–∞—à –Ω–æ–≤—ã–π —É–Ω–∏–∫–∞–ª—å–Ω—ã–π –º–µ—Ä—á, –Ω—É –∏ –ø—Ä–∏–∫–æ–ª—å–Ω–∞—è –¥–∏–Ω–∞–º–∏—á–Ω–∞—è 

In [18]:
tokenizer.decode(result[0], skip_special_tokens=True)

'–ø—Ä–æ—Ü–µ—Å—Å–æ–≤ BPM Fab –î–æ—Ö–æ–¥–Ω–æ—Å—Ç—å –≥—Ä–∞—Ñ–∏–∫ gamedev —Å–≤–æ–±–æ–¥–Ω—ã–π unreal –†–∞–∑—Ä–∞–±–æ—Ç–∫–∞'

In [19]:
loaded_model = T5ForConditionalGeneration.from_pretrained("test/checkpoint-4000", device_map="cpu")

In [22]:
%time

result = loaded_model.generate(test[2][0].unsqueeze(0), max_length=20,
                                      min_length=20,
                                      length_penalty=2.0,
                                      num_beams=10, early_stopping=False)

CPU times: user 18 ¬µs, sys: 2 ¬µs, total: 20 ¬µs
Wall time: 477 ¬µs


In [21]:
result

tensor([[    0,  6830,   850,   856,   934,   897,  4877,   422,   986,   370,
          5293,     8,  4252, 20543,  6568, 17276,  5888,  2045,   700, 23894]])