ShaeNaZar committed on
Commit
bf399da
·
1 Parent(s): 36f9531
Files changed (7) hide show
  1. .streamlit/config.toml +4 -0
  2. Dockerfile +11 -0
  3. app.py +42 -0
  4. main.ipynb +83 -0
  5. requirements.txt +4 -0
  6. src/pager.py +22 -0
  7. src/summarizer.py +16 -0
.streamlit/config.toml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ [theme]
2
+ base="dark"
3
+ font="serif"
4
+ primaryColor="purple"
Dockerfile ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
# Container image for the Streamlit tag-generator app.
FROM python:3.12

# Keep the application in its own directory instead of the image root.
WORKDIR /app

# Install dependencies first so this layer stays cached when only sources change.
COPY requirements.txt ./
RUN pip install --no-cache-dir -r requirements.txt

COPY . .
EXPOSE 5000

# "streamlit run <script>" is the launch command ("streamlit app.py" is not a
# valid invocation). Bind to all interfaces on the exposed port so the app is
# reachable from outside the container.
CMD ["python", "-m", "streamlit", "run", "app.py", "--server.port=5000", "--server.address=0.0.0.0"]
app.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from src.pager import get_pager
3
+ from src.summarizer import Summarizer
4
+ import torch
5
+
6
+
7
# NOTE(review): presumably a workaround for Streamlit's source-file watcher
# failing when it inspects ``torch.classes.__path__`` - confirm before removing.
torch.classes.__path__ = []
8
+
9
@st.cache_resource
def GetSummarizer():
    """Build the ``Summarizer`` used by the button callbacks.

    The function is wrapped in ``st.cache_resource``, so the instance created
    here is handed to Streamlit's resource cache rather than rebuilt by hand.
    """
    summarizer = Summarizer()
    return summarizer
12
+
13
+
14
+ url = st.text_input("Please enter your habr article url...")
15
+ text = st.text_input("...or paste text here:strawberry:")
16
+
17
def handle_sum_text(sum_text):
    """Convert a whitespace-separated summary into hashtag strings.

    Args:
        sum_text: Summary text; it is split on any run of whitespace.

    Returns:
        A list with one ``#``-prefixed string per word, in the original
        order. An empty or all-whitespace summary gives an empty list.
    """
    words = sum_text.split()
    return [f"#{word}" for word in words]
19
+
20
+ def url_callback():
21
+ summarizer = GetSummarizer()
22
+ pager = get_pager(url)
23
+ if pager is not None:
24
+ st.title(pager.title)
25
+ sum_text = summarizer.summarize(pager.text[:1000])
26
+ st.write("Okay, there your tags :sunglasses:")
27
+ for chunk in handle_sum_text(sum_text):
28
+ st.badge(chunk, icon=":material/check:", color="green")
29
+ st.title(":shit: Слушай ну нормально же общались")
30
+
31
def generator_callback():
    """Show hashtag badges generated from the pasted text.

    Reads the module-level ``text`` value, summarizes its first 1000
    characters and renders every resulting tag as a green badge.
    """
    model = GetSummarizer()
    st.title("Your AWESOME:heart: article")
    summary = model.summarize(text[:1000])
    st.write("Okay, there your #tags :sunglasses:")
    tags = handle_sum_text(summary)
    for tag in tags:
        st.badge(tag, icon=":material/check:", color="green")
38
+
39
+
40
+ st.button("Describe Habr Article", on_click=url_callback)
41
+
42
+ st.button("Describe text", on_click=generator_callback)
main.ipynb ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stderr",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "/home/shaenazar/anaconda3/envs/dsenv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
13
+ " from .autonotebook import tqdm as notebook_tqdm\n"
14
+ ]
15
+ }
16
+ ],
17
+ "source": [
18
+ "import torch\n",
19
+ "from transformers import T5Tokenizer, T5ForConditionalGeneration\n",
20
+ "\n",
21
+ "model_name = \"sarahai/ruT5-base-summarizer\"\n",
22
+ "model_path = \"data/checkpoint\"\n",
23
+ "\n",
24
+ "model = T5ForConditionalGeneration.from_pretrained(model_path)"
25
+ ]
26
+ },
27
+ {
28
+ "cell_type": "code",
29
+ "execution_count": 5,
30
+ "metadata": {},
31
+ "outputs": [
32
+ {
33
+ "name": "stderr",
34
+ "output_type": "stream",
35
+ "text": [
36
+ "model.safetensors: 100%|██████████| 892M/892M [01:13<00:00, 12.2MB/s] \n"
37
+ ]
38
+ },
39
+ {
40
+ "data": {
41
+ "text/plain": [
42
+ "CommitInfo(commit_url='https://huggingface.co/ShaeNaZar/YsdaSummarizer/commit/fbd9cbe753c47653b6418165c948f69dc160954e', commit_message='Upload T5ForConditionalGeneration', commit_description='', oid='fbd9cbe753c47653b6418165c948f69dc160954e', pr_url=None, repo_url=RepoUrl('https://huggingface.co/ShaeNaZar/YsdaSummarizer', endpoint='https://huggingface.co', repo_type='model', repo_id='ShaeNaZar/YsdaSummarizer'), pr_revision=None, pr_num=None)"
43
+ ]
44
+ },
45
+ "execution_count": 5,
46
+ "metadata": {},
47
+ "output_type": "execute_result"
48
+ }
49
+ ],
50
+ "source": [
51
+ "model.push_to_hub(\"ShaeNaZar/YsdaSummarizer\")"
52
+ ]
53
+ },
54
+ {
55
+ "cell_type": "code",
56
+ "execution_count": null,
57
+ "metadata": {},
58
+ "outputs": [],
59
+ "source": []
60
+ }
61
+ ],
62
+ "metadata": {
63
+ "kernelspec": {
64
+ "display_name": "dsenv",
65
+ "language": "python",
66
+ "name": "python3"
67
+ },
68
+ "language_info": {
69
+ "codemirror_mode": {
70
+ "name": "ipython",
71
+ "version": 3
72
+ },
73
+ "file_extension": ".py",
74
+ "mimetype": "text/x-python",
75
+ "name": "python",
76
+ "nbconvert_exporter": "python",
77
+ "pygments_lexer": "ipython3",
78
+ "version": "3.12.3"
79
+ }
80
+ },
81
+ "nbformat": 4,
82
+ "nbformat_minor": 2
83
+ }
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
# HTML fetching and parsing (src/pager.py). "beautifulsoup4" is the real
# distribution behind the "bs4" import; lxml is the parser pager.py asks for.
beautifulsoup4
lxml
pydantic
requests
# UI
streamlit
# Model inference (src/summarizer.py). T5Tokenizer needs sentencepiece.
sentencepiece
torch
transformers
src/pager.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from bs4 import BeautifulSoup
2
+ from pydantic import BaseModel
3
+ import requests
4
+ from typing import Optional
5
+
6
class Pager(BaseModel):
    """Data extracted from one article page."""

    # Page title.
    title: str
    # Plain text of the article body.
    text: str
    # NOTE(review): presumably the tags published with the article - confirm.
    original_tags: list[str]
10
+
11
def is_valid_page(url):
    """Report whether ``url`` is acceptable.

    This is currently a stub: the argument is not inspected and every value
    is accepted.
    """
    return True
13
+
14
def get_pager(url) -> Optional[Pager]:
    """Download an article page and extract its title and body text.

    Args:
        url: Address of the page to fetch.

    Returns:
        A ``Pager`` holding the page title and the text of the first
        ``div.article-formatted-body`` element, or ``None`` when the page
        cannot be fetched or does not contain that element.
    """
    # Imported locally so the module-level import block stays untouched.
    from bs4 import FeatureNotFound

    try:
        # A timeout keeps the UI from hanging on an unresponsive host.
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.RequestException:
        # Malformed URLs, connection problems, timeouts and HTTP error codes.
        return None

    try:
        soup = BeautifulSoup(response.text, "lxml")
    except FeatureNotFound:
        # lxml is an optional extra; fall back to the parser bundled with Python.
        soup = BeautifulSoup(response.text, "html.parser")

    body = soup.find("div", class_="article-formatted-body")
    if body is None:
        # Not an article page (or the markup changed): nothing to summarize.
        return None

    title_tag = soup.title
    title = title_tag.get_text() if title_tag is not None else ""
    # NOTE(review): original_tags is still a hard-coded placeholder; the real
    # tags are not parsed from the page yet.
    return Pager(title=title, text=body.get_text(), original_tags=["govno"])
src/summarizer.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import T5Tokenizer, T5ForConditionalGeneration
3
+
4
class Summarizer:
    """Generate short summaries with a T5 sequence-to-sequence model.

    The tokenizer is loaded from ``sarahai/ruT5-base-summarizer`` and the
    model weights from ``ShaeNaZar/YsdaSummarizer``.
    """

    def __init__(self, device="cpu"):
        """Load the tokenizer and the model and place the model on ``device``.

        Args:
            device: Torch device identifier used for both the model and the
                encoded inputs.
        """
        model_name = "sarahai/ruT5-base-summarizer"
        model_path = "ShaeNaZar/YsdaSummarizer"
        self.device = device
        self.tokenizer = T5Tokenizer.from_pretrained(model_name)
        # summarize() moves the inputs to ``device``; the model has to live on
        # the same device or generation fails with a device-mismatch error.
        self.model = T5ForConditionalGeneration.from_pretrained(model_path).to(device)

    def summarize(self, text):
        """Return the generated summary of ``text`` as a plain string.

        Args:
            text: Input text to summarize.

        Returns:
            The decoded first generated sequence with special tokens removed.
        """
        encoded = self.tokenizer(text, return_tensors="pt")
        input_ids = encoded.input_ids.to(self.device)
        # min_length == max_length pins the output to a fixed token budget.
        outputs = self.model.generate(
            input_ids,
            max_length=20,
            min_length=20,
            length_penalty=2.0,
            num_beams=5,
            early_stopping=True,
        )

        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)