import transformers 
import datasets 
import torch 
import sentencepiece 
import evaluate


from datasets import load_dataset
from transformers import MT5ForConditionalGeneration, T5Tokenizer
import re

# Load dataset
ds = load_dataset("scillm/scientific_papers-archive", split="test")

# Select the first 1000 examples
small_ds = ds.select(range(1000))

# Preprocessing function to remove unwanted references
def preprocess_text(text):
    # Remove unwanted references like @xcite
    text = re.sub(r'@\w+', '', text)  # Remove anything that starts with @
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text
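
# Quick illustrative check (hypothetical input): citation markers like @xcite
# and runs of whitespace should both disappear.
assert preprocess_text("the model @xcite  performs   well") == "the model performs well"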

# Preprocessing function
def preprocess(examples):
    # Preprocess articles and summaries
    articles = [preprocess_text(article) for article in examples["input"]]
    outputs = [preprocess_text(output) for output in examples["output"]]

    # Add prefix to the articles
    inputs = ["summarize: " + article for article in articles]

    # Tokenize articles
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True, padding="max_length")

    # Tokenize summaries
    labels = tokenizer(outputs, max_length=128, truncation=True, padding="max_length")

    # Replace padding token ids in the labels with -100 so they are ignored by
    # the loss (compute_metrics below maps them back before decoding)
    model_inputs["labels"] = [
        [(t if t != tokenizer.pad_token_id else -100) for t in seq]
        for seq in labels["input_ids"]
    ]

    return model_inputs

# Load mT5 model and tokenizer
model_name = "google/mt5-small"  # You can also use other mT5 models
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = MT5ForConditionalGeneration.from_pretrained(model_name)

# Tokenize the smaller dataset
tokenized_small_ds = small_ds.map(preprocess, batched=True)

# Verify that the dataset is correctly tokenized
print(tokenized_small_ds[0])
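
# Optional sanity check (illustrative): decode one tokenized article back to
# text to confirm the "summarize: " prefix and truncation behave as expected.
print(tokenizer.decode(tokenized_small_ds[0]["input_ids"], skip_special_tokens=True)[:200])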

# Split the data into train and test set
small_ds = small_ds.train_test_split(test_size=0.2)

small_ds["train"][0]

print(small_ds['train'].features)

print(small_ds.column_names)

from transformers import T5Tokenizer

model_name = "google/mt5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Apply preprocessing function to dataset
tokenized_ds = small_ds.map(preprocess, batched=True)

from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

import torch
torch.cuda.empty_cache()



# pip install wandb
import os
import wandb

# Authenticate with WandB using an API key read from the environment
api_key = os.getenv("API_KEY")
wandb.login(key=api_key)
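
# Alternative (optional, not used in this run): skip WandB logging entirely,
# e.g. by setting os.environ["WANDB_DISABLED"] = "true", or by passing
# report_to="none" in the Seq2SeqTrainingArguments below.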

from transformers import MT5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
import torch

# Load the model
model_name = "google/mt5-small"
model = MT5ForConditionalGeneration.from_pretrained(model_name)

# Set the device
device = torch.device("cpu")
model.to(device)
# Ensure model parameters are contiguous
for name, param in model.named_parameters():
    if not param.is_contiguous():
        param.data = param.data.contiguous()  # Make the tensor contiguous
        print(f"Made {name} contiguous.")

training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=4,  
    per_device_eval_batch_size=4,
    evaluation_strategy='epoch',
    logging_dir='./logs',
    predict_with_generate=True
)
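
# Note: newer transformers releases renamed `evaluation_strategy` to
# `eval_strategy`; if the call above raises a TypeError about an unexpected
# keyword argument, switching to the new name is the likely fix
# (version-dependent, so treat this as a hint rather than a requirement).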

# Create trainer instance; shuffle once with a fixed seed so the train and
# eval subsets are disjoint
shuffled_ds = tokenized_small_ds.shuffle(seed=42)
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=shuffled_ds.select(range(80)),  # use 80 examples for training
    eval_dataset=shuffled_ds.select(range(80, 160)),  # use 80 examples for evaluation
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# train the model
trainer.train()

#pip install rouge_score
import evaluate
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred

    # Decode predictions and labels (remove special tokens)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Replace -100 in labels (ignore index) with the padding token id
    labels[labels == -100] = tokenizer.pad_token_id
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute ROUGE scores using the `evaluate` library
    rouge_output = rouge.compute(predictions=decoded_preds, references=decoded_labels)

    return {
        "rouge1": rouge_output["rouge1"],
        "rouge2": rouge_output["rouge2"],
        "rougeL": rouge_output["rougeL"],
    }
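
# Illustrative note: with recent versions of `evaluate`, rouge.compute returns
# plain floats (e.g. {"rouge1": 0.21, ...}); older datasets-based metrics
# returned aggregate objects, in which case you would read
# rouge_output["rouge1"].mid.fmeasure instead.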

# Update the trainer to use the custom metrics
trainer.compute_metrics = compute_metrics

# Evaluate the model
eval_result = trainer.evaluate()
print(eval_result)

# Save the fine-tuned model
trainer.save_model("fine-tuned-mt5")
tokenizer.save_pretrained("fine-tuned-mt5")

# Load required libraries
from transformers import T5Tokenizer, MT5ForConditionalGeneration

# Load the fine-tuned tokenizer and model
model_name = "fine-tuned-mt5"
new_tokenizer = T5Tokenizer.from_pretrained(model_name, clean_up_tokenization_spaces=True)
new_model = MT5ForConditionalGeneration.from_pretrained(model_name)

from transformers import pipeline
import torch


# Restructured input
text = (
    "Summarize the following information regarding psoriasis, its effects on skin health, and its potential health risks:\n\n"
    "1. Psoriasis is an autoimmune condition that leads to inflammation in the skin.\n"
    "2. Immune system dysfunction causes inflammatory cells to accumulate in the dermis, the middle layer of the skin.\n"
    "3. The condition accelerates skin cell growth, with skin cells shedding more quickly than usual.\n"
    "4. This abnormal shedding results in uncomfortable symptoms like raised plaques, scales, and redness.\n"
    "5. Psoriasis not only affects the skin but also increases the risk of serious health issues, including heart disease, cancer, and inflammatory bowel disease.\n\n"
    "Please provide a summary."
)


# Define the device (GPU or CPU)
device = 0 if torch.cuda.is_available() else -1

# Load the pipeline
summarizer = pipeline("summarization", model=new_model, tokenizer=new_tokenizer, device=device)

# Summarize the text
summary = summarizer(text,
                     max_length=120,
                     min_length=30,
                     do_sample=False,
                     num_beams=10,
                     repetition_penalty=5.0,
                     no_repeat_ngram_size=2,
                     length_penalty=1.0)[0]["summary_text"]

# Clean the summary by removing leftover sentinel tokens such as <extra_id_0>

import re

# Regular expression to match both <extra_id_N> and <id_N> tokens
pattern = r"<(extra_id_\d+|id_\d+)>"

# Replace all matches with a space
cleaned_summary = re.sub(pattern, " ", summary).strip()


print(cleaned_summary)




import gradio as gr
from transformers import T5Tokenizer, MT5ForConditionalGeneration
import fitz  # PyMuPDF

# Load the fine-tuned tokenizer and model
model_name = "fine-tuned-mt5"
new_tokenizer = T5Tokenizer.from_pretrained(model_name, clean_up_tokenization_spaces=True)
new_model = MT5ForConditionalGeneration.from_pretrained(model_name)

# Function to extract text from PDF using PyMuPDF
def extract_text_from_pdf(pdf_file):
    text = ""
    # Open the PDF file
    with fitz.open(pdf_file) as doc:
        for page in doc:
            text += page.get_text()  # Extract text from each page
    return text
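
# The model was fine-tuned with inputs truncated to 1024 tokens, so long PDFs
# are cut off below. A minimal chunking sketch (assumption: fixed-size token
# windows are acceptable for this use case) could split the text first and
# summarize each window separately:
def chunk_text(text, tokenizer, chunk_tokens=900):
    ids = tokenizer.encode(text)
    return [tokenizer.decode(ids[i:i + chunk_tokens], skip_special_tokens=True)
            for i in range(0, len(ids), chunk_tokens)]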

# Summarization function
def summarize_pdf(pdf_file, max_summary_length):
    # Extract text from the PDF
    input_text = extract_text_from_pdf(pdf_file)

    # Tokenize the input, truncating to the 1024-token limit used in fine-tuning
    tokenized_input = new_tokenizer.encode(input_text, return_tensors='pt',
                                           truncation=True, max_length=1024)



    try:
        # Generate the summary
        summary_ids = new_model.generate(
            tokenized_input,
            max_length=max_summary_length,
            min_length=30,
            num_beams=15,
            repetition_penalty=5.0,
            no_repeat_ngram_size=2
        )

        # Decode the generated summary
        summary = new_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

        # Clean up the summary to remove unwanted tokens
        cleaned_summary = ' '.join([token for token in summary.split() if not token.startswith('<extra_id_')]).strip()

        # Ensure the summary ends with a complete sentence
        if cleaned_summary:
            last_period_index = cleaned_summary.rfind('.')
            if last_period_index != -1 and last_period_index < len(cleaned_summary) - 1:
                cleaned_summary = cleaned_summary[:last_period_index + 1]
            else:
                cleaned_summary = cleaned_summary.strip()

        return cleaned_summary if cleaned_summary else "No valid summary generated."

    except Exception as e:
        return str(e)  # Return the error message for debugging
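
# Example usage outside the UI (hypothetical path):
# print(summarize_pdf("paper.pdf", max_summary_length=150))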

# Define the Gradio interface
interface = gr.Interface(
    fn=summarize_pdf,
    inputs=[
        gr.File(label="Upload PDF"),
        gr.Slider(50, 300, step=10, label="Max summary length")
    ],
    outputs="textbox",  # A textbox for the output summary
    title="PDF Text Summarizer",
    description="Upload a PDF file to summarize its content."
)


# Launch the interface with debug mode enabled
interface.launch(debug=True)