eevaw committed on
Commit
347b20d
·
verified ·
1 Parent(s): 348f4f3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +284 -284
app.py CHANGED
@@ -1,284 +1,284 @@
1
- import transformers
2
- import datasets
3
- import torch
4
- import sentencepiece
5
- import evaluate
6
-
7
-
8
- from datasets import load_dataset
9
- from transformers import MT5ForConditionalGeneration, T5Tokenizer
10
- import re
11
-
12
- # Load dataset
13
- ds = load_dataset("scillm/scientific_papers-archive", split="test")
14
-
15
- # Select the first 1000 examples
16
- small_ds = ds.select(range(1000))
17
-
18
- # Preprocessing function to remove unwanted references
19
- def preprocess_text(text):
20
- # Remove unwanted references like @xcite
21
- text = re.sub(r'@\w+', '', text) # Remove anything that starts with @
22
- text = re.sub(r'\s+', ' ', text).strip() # Remove extra spaces
23
- return text
24
-
25
- # Preprocessing function
26
- def preprocess(examples):
27
- # Preprocess articles and summaries
28
- articles = [preprocess_text(article) for article in examples["input"]]
29
- outputs = [preprocess_text(output) for output in examples["output"]]
30
-
31
- # Add prefix to the articles
32
- inputs = ["summarize: " + article for article in articles]
33
-
34
- # Tokenize articles
35
- model_inputs = tokenizer(inputs, max_length=1024, truncation=True, padding="max_length")
36
-
37
- # Tokenize summaries
38
- labels = tokenizer(outputs, max_length=128, truncation=True, padding="max_length")
39
-
40
- model_inputs["labels"] = labels["input_ids"]
41
-
42
- return model_inputs
43
-
44
- # Load mT5 model and tokenizer
45
- model_name = "google/mt5-small" # You can also use other mT5 models
46
- tokenizer = T5Tokenizer.from_pretrained(model_name)
47
- model = MT5ForConditionalGeneration.from_pretrained(model_name)
48
-
49
- # Tokenize the smaller dataset
50
- tokenized_small_ds = small_ds.map(preprocess, batched=True)
51
-
52
- # Verify that the dataset is correctly tokenized
53
- print(tokenized_small_ds[0])
54
-
55
- # Split the data into train and test set
56
- small_ds = ds.train_test_split(test_size=0.2)
57
-
58
- small_ds["train"][0]
59
-
60
- print(small_ds['train'].features)
61
-
62
- print(small_ds.column_names)
63
-
64
- from transformers import T5Tokenizer
65
-
66
- model_name = "google/mt5-small"
67
- tokenizer = T5Tokenizer.from_pretrained(model_name)
68
-
69
- # Apply preprocessing function to dataset
70
- tokenized_ds = small_ds.map(preprocess, batched=True)
71
-
72
- from transformers import DataCollatorForSeq2Seq
73
-
74
- data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model_name)
75
-
76
- import torch
77
- torch.cuda.empty_cache()
78
-
79
- nvidia-smi
80
-
81
- !pip install wandb
82
- import wandb
83
- wandb.login()
84
- from transformers import MT5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
85
- import torch
86
-
87
- # Load the model
88
- model_name = "google/mt5-small"
89
- model = MT5ForConditionalGeneration.from_pretrained(model_name)
90
-
91
- # Set the device
92
- device = torch.device("cpu")
93
- model.to(device)
94
- # Ensure model parameters are contiguous
95
- for name, param in model.named_parameters():
96
- if not param.is_contiguous():
97
- param.data = param.data.contiguous() # Make the tensor contiguous
98
- print(f"Made {name} contiguous.")
99
-
100
- training_args = Seq2SeqTrainingArguments(
101
- output_dir='./results',
102
- num_train_epochs=10,
103
- per_device_train_batch_size=4, # Pienennä batch-kokoa
104
- per_device_eval_batch_size=4,
105
- evaluation_strategy='epoch',
106
- logging_dir='./logs',
107
- predict_with_generate=True
108
- )
109
-
110
- # Create trainer instance
111
- trainer = Seq2SeqTrainer(
112
- model=model,
113
- args=training_args,
114
- train_dataset=tokenized_small_ds.shuffle().select(range(80)), # Käytetään 800 esimerkkiä koulutukseen
115
- eval_dataset=tokenized_small_ds.shuffle().select(range(20, 100)), # Käytetään 200 esimerkkiä arvioimiseen
116
- )
117
-
118
- # Kouluta malli
119
- trainer.train()
120
-
121
- pip install rouge_score
122
- import evaluate
123
- rouge = evaluate.load("rouge")
124
-
125
- def compute_metrics(eval_pred):
126
- predictions, labels = eval_pred
127
-
128
- # Decode predictions and labels (remove special tokens)
129
- decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
130
-
131
- # Replace -100 in labels (ignore index) with the padding token id
132
- labels[labels == -100] = tokenizer.pad_token_id
133
- decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
134
-
135
- # Compute ROUGE scores using the `evaluate` library
136
- rouge_output = rouge.compute(predictions=decoded_preds, references=decoded_labels)
137
-
138
- return {
139
- "rouge1": rouge_output["rouge1"],
140
- "rouge2": rouge_output["rouge2"],
141
- "rougeL": rouge_output["rougeL"],
142
- }
143
-
144
- # Update trainer to include costom metrics
145
- trainer.compute_metrics = compute_metrics
146
-
147
- # Evaluate the model
148
- eval_result = trainer.evaluate()
149
- print(eval_result)
150
-
151
- # Save the fine-tuned model
152
- trainer.save_model("fine-tuned-mt5")
153
- tokenizer.save_pretrained("fine-tuned-mt5")
154
-
155
- # Load required libraries
156
- from transformers import T5Tokenizer, MT5ForConditionalGeneration
157
-
158
- # Load the fine-tuned tokenizer and model
159
- model_name = "fine-tuned-mt5"
160
- new_tokenizer = T5Tokenizer.from_pretrained(model_name, clean_up_tokenization_spaces=True)
161
- new_model = MT5ForConditionalGeneration.from_pretrained(model_name)
162
-
163
- from transformers import pipeline
164
- import torch
165
-
166
- # Syötteesi
167
- # Restructured input
168
- text = (
169
- "Summarize the following information regarding psoriasis, its effects on skin health, and its potential health risks:\n\n"
170
- "1. Psoriasis is an autoimmune condition that leads to inflammation in the skin.\n"
171
- "2. Immune system dysfunction causes inflammatory cells to accumulate in the dermis, the middle layer of the skin.\n"
172
- "3. The condition accelerates skin cell growth, with skin cells shedding more quickly than usual.\n"
173
- "4. This abnormal shedding results in uncomfortable symptoms like raised plaques, scales, and redness.\n"
174
- "5. Psoriasis not only affects the skin but also increases the risk of serious health issues, including heart disease, cancer, and inflammatory bowel disease.\n\n"
175
- "Please provide a summary."
176
- )
177
-
178
-
179
- # Määrittele laite (GPU tai CPU)
180
- device = 0 if torch.cuda.is_available() else -1
181
-
182
- # Lataa tiivistämispipeline
183
- summarizer = pipeline("summarization", model=new_model, tokenizer=new_tokenizer, device=device)
184
-
185
- # Tiivistä teksti
186
- summary = summarizer(text,
187
- max_length=120,
188
- min_length=30,
189
- do_sample=False,
190
- num_beams=10,
191
- repetition_penalty=5.0,
192
- no_repeat_ngram_size=2,
193
- length_penalty=1.0)[0]["summary_text"]
194
-
195
- # Clean the summary by removing the <extra_id_0> token
196
-
197
- import re
198
-
199
- # Regular expression to match both <extra_id_X> and <id_XX>
200
- pattern = r"<(extra_id_\d+|id_\d+)>"
201
-
202
- # Replace all matches with a space
203
- cleaned_summary = re.sub(pattern, " ", summary).strip()
204
-
205
-
206
- print(cleaned_summary)
207
-
208
-
209
- # Niinan koodi
210
- !pip install gradio PyMuPDF
211
-
212
- import gradio as gr
213
- from transformers import T5Tokenizer, MT5ForConditionalGeneration
214
- import fitz # PyMuPDF
215
-
216
- # Load the fine-tuned tokenizer and model
217
- model_name = "fine-tuned-mt5"
218
- new_tokenizer = T5Tokenizer.from_pretrained(model_name, clean_up_tokenization_spaces=True)
219
- new_model = MT5ForConditionalGeneration.from_pretrained(model_name)
220
-
221
- # Function to extract text from PDF using PyMuPDF
222
- def extract_text_from_pdf(pdf_file):
223
- text = ""
224
- # Open the PDF file
225
- with fitz.open(pdf_file) as doc:
226
- for page in doc:
227
- text += page.get_text() # Extract text from each page
228
- return text
229
-
230
- # Summarization function
231
- def summarize_pdf(pdf_file, max_summary_length):
232
- # Extract text from the PDF
233
- input_text = extract_text_from_pdf(pdf_file)
234
-
235
- # Tokenize the input to check length
236
- tokenized_input = new_tokenizer.encode(input_text, return_tensors='pt')
237
-
238
-
239
-
240
- try:
241
- # Generate the summary
242
- summary_ids = new_model.generate(
243
- tokenized_input,
244
- max_length=max_summary_length,
245
- min_length=30,
246
- num_beams=15,
247
- repetition_penalty=5.0,
248
- no_repeat_ngram_size=2
249
- )
250
-
251
- # Decode the generated summary
252
- summary = new_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
253
-
254
- # Clean up the summary to remove unwanted tokens
255
- cleaned_summary = ' '.join([token for token in summary.split() if not token.startswith('<extra_id_')]).strip()
256
-
257
- # Ensure the summary ends with a complete sentence
258
- if cleaned_summary:
259
- last_period_index = cleaned_summary.rfind('.')
260
- if last_period_index != -1 and last_period_index < len(cleaned_summary) - 1:
261
- cleaned_summary = cleaned_summary[:last_period_index + 1]
262
- else:
263
- cleaned_summary = cleaned_summary.strip()
264
-
265
- return cleaned_summary if cleaned_summary else "No valid summary generated."
266
-
267
- except Exception as e:
268
- return str(e) # Return the error message for debugging
269
-
270
- # Define the Gradio interface
271
- interface = gr.Interface(
272
- fn=summarize_pdf,
273
- inputs=[
274
- gr.File(label="Upload PDF"),
275
- gr.Slider(50, 300, step=10, label="Max summary length")
276
- ],
277
- outputs="textbox", # A textbox for the output summary
278
- title="PDF Text Summarizer",
279
- description="Upload a PDF file to summarize its content."
280
- )
281
-
282
- # Launch the interface
283
- # Launch the interface with debug mode enabled
284
- interface.launch(debug=True)
 
1
+ import transformers
2
+ import datasets
3
+ import torch
4
+ import sentencepiece
5
+ import evaluate
6
+
7
+
8
+ from datasets import load_dataset
9
+ from transformers import MT5ForConditionalGeneration, T5Tokenizer
10
+ import re
11
+
12
+ # Load dataset
13
+ ds = load_dataset("scillm/scientific_papers-archive", split="test")
14
+
15
+ # Select the first 1000 examples
16
+ small_ds = ds.select(range(1000))
17
+
18
+ # Preprocessing function to remove unwanted references
19
def preprocess_text(text: str) -> str:
    """Strip "@"-prefixed placeholder tokens and normalise whitespace.

    Args:
        text: Raw article or summary text.

    Returns:
        The text with every "@" + word-characters token (for example
        "@xcite") removed, every run of whitespace collapsed to a single
        space, and leading/trailing whitespace stripped.
    """
    # Drop the placeholders first: removing them can leave double spaces
    # behind, which the whitespace pass below then collapses.
    without_placeholders = re.sub(r'@\w+', '', text)
    return re.sub(r'\s+', ' ', without_placeholders).strip()
24
+
25
+ # Preprocessing function
26
def preprocess(examples):
    """Tokenize a batch of article/summary pairs for seq2seq training.

    Args:
        examples: A batch with an "input" column (articles) and an "output"
            column (reference summaries), as passed by `Dataset.map` with
            `batched=True`.

    Returns:
        The tokenizer output for the prefixed articles, with an added
        "labels" entry holding the summary token ids. Padding positions in
        the labels are set to -100.
    """
    # Clean both sides the same way before tokenizing.
    articles = [preprocess_text(article) for article in examples["input"]]
    summaries = [preprocess_text(summary) for summary in examples["output"]]

    # Prefix every article with the task instruction.
    inputs = ["summarize: " + article for article in articles]

    # Articles: truncated/padded to 1024 tokens.
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True, padding="max_length")

    # Summaries: truncated/padded to 128 tokens.
    labels = tokenizer(summaries, max_length=128, truncation=True, padding="max_length")

    # Replace label padding with -100 (the ignore index) so padded positions
    # do not contribute to the training loss. compute_metrics maps -100 back
    # to the pad token before decoding.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in labels["input_ids"]
    ]

    return model_inputs
43
+
44
+ # Load mT5 model and tokenizer
45
+ model_name = "google/mt5-small" # You can also use other mT5 models
46
+ tokenizer = T5Tokenizer.from_pretrained(model_name)
47
+ model = MT5ForConditionalGeneration.from_pretrained(model_name)
48
+
49
+ # Tokenize the smaller dataset
50
+ tokenized_small_ds = small_ds.map(preprocess, batched=True)
51
+
52
+ # Verify that the dataset is correctly tokenized
53
+ print(tokenized_small_ds[0])
54
+
55
+ # Split the data into train and test set
56
+ small_ds = ds.train_test_split(test_size=0.2)
57
+
58
+ small_ds["train"][0]
59
+
60
+ print(small_ds['train'].features)
61
+
62
+ print(small_ds.column_names)
63
+
64
+ from transformers import T5Tokenizer
65
+
66
+ model_name = "google/mt5-small"
67
+ tokenizer = T5Tokenizer.from_pretrained(model_name)
68
+
69
+ # Apply preprocessing function to dataset
70
+ tokenized_ds = small_ds.map(preprocess, batched=True)
71
+
72
from transformers import DataCollatorForSeq2Seq

# Pass the loaded model object rather than its name: the collator uses `model`
# to build decoder input ids from the labels, which a plain string cannot do.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
75
+
76
+ import torch
77
+ torch.cuda.empty_cache()
78
+
79
# NOTE: `nvidia-smi` and `pip install wandb` are shell commands, not Python.
# As bare lines the `pip` command is a SyntaxError that stops this file from
# being parsed at all. Run `nvidia-smi` from a terminal when GPU status is
# needed, and install `wandb` ahead of time (for example by listing it in
# requirements.txt).
import wandb

wandb.login()
84
+ from transformers import MT5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
85
+ import torch
86
+
87
+ # Load the model
88
+ model_name = "google/mt5-small"
89
+ model = MT5ForConditionalGeneration.from_pretrained(model_name)
90
+
91
+ # Set the device
92
+ device = torch.device("cpu")
93
+ model.to(device)
94
+ # Ensure model parameters are contiguous
95
+ for name, param in model.named_parameters():
96
+ if not param.is_contiguous():
97
+ param.data = param.data.contiguous() # Make the tensor contiguous
98
+ print(f"Made {name} contiguous.")
99
+
100
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    num_train_epochs=10,
    per_device_train_batch_size=4,  # small batch size to limit memory use
    per_device_eval_batch_size=4,
    evaluation_strategy='epoch',
    logging_dir='./logs',
    predict_with_generate=True
)

# Shuffle once with a fixed seed and take two disjoint slices. Shuffling
# separately for each subset could put the same example in both the training
# and the evaluation data.
shuffled_small_ds = tokenized_small_ds.shuffle(seed=42)
train_subset = shuffled_small_ds.select(range(80))  # 80 examples for training
eval_subset = shuffled_small_ds.select(range(80, 160))  # 80 different examples for evaluation

# Create trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_subset,
    eval_dataset=eval_subset,
)

# Train the model
trainer.train()
120
+
121
# NOTE: `rouge_score` must be installed before this file runs (for example via
# requirements.txt); a bare `pip install ...` line is a SyntaxError in a .py file.
import evaluate

rouge = evaluate.load("rouge")
124
+
125
def compute_metrics(eval_pred):
    """Compute ROUGE scores for a batch of generated summaries.

    Args:
        eval_pred: A `(predictions, labels)` pair of token-id arrays.

    Returns:
        A dict with the "rouge1", "rouge2" and "rougeL" scores.
    """
    import numpy as np

    predictions, labels = eval_pred

    # Some models return a tuple whose first element holds the token ids.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is the ignore/padding index and is not a real token id, so it
    # cannot be decoded. Swap it for the pad token on both sides, building new
    # arrays instead of modifying the caller's data in place.
    pad_id = tokenizer.pad_token_id
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    # Decode to text, dropping special tokens (including the padding).
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute ROUGE scores using the `evaluate` library
    rouge_output = rouge.compute(predictions=decoded_preds, references=decoded_labels)

    return {
        "rouge1": rouge_output["rouge1"],
        "rouge2": rouge_output["rouge2"],
        "rougeL": rouge_output["rougeL"],
    }
143
+
144
+ # Update trainer to include custom metrics
145
+ trainer.compute_metrics = compute_metrics
146
+
147
+ # Evaluate the model
148
+ eval_result = trainer.evaluate()
149
+ print(eval_result)
150
+
151
+ # Save the fine-tuned model
152
+ trainer.save_model("fine-tuned-mt5")
153
+ tokenizer.save_pretrained("fine-tuned-mt5")
154
+
155
+ # Load required libraries
156
+ from transformers import T5Tokenizer, MT5ForConditionalGeneration
157
+
158
+ # Load the fine-tuned tokenizer and model
159
+ model_name = "fine-tuned-mt5"
160
+ new_tokenizer = T5Tokenizer.from_pretrained(model_name, clean_up_tokenization_spaces=True)
161
+ new_model = MT5ForConditionalGeneration.from_pretrained(model_name)
162
+
163
+ from transformers import pipeline
164
+ import torch
165
+
166
+ # Syötteesi
167
+ # Restructured input
168
+ text = (
169
+ "Summarize the following information regarding psoriasis, its effects on skin health, and its potential health risks:\n\n"
170
+ "1. Psoriasis is an autoimmune condition that leads to inflammation in the skin.\n"
171
+ "2. Immune system dysfunction causes inflammatory cells to accumulate in the dermis, the middle layer of the skin.\n"
172
+ "3. The condition accelerates skin cell growth, with skin cells shedding more quickly than usual.\n"
173
+ "4. This abnormal shedding results in uncomfortable symptoms like raised plaques, scales, and redness.\n"
174
+ "5. Psoriasis not only affects the skin but also increases the risk of serious health issues, including heart disease, cancer, and inflammatory bowel disease.\n\n"
175
+ "Please provide a summary."
176
+ )
177
+
178
+
179
+ # Määrittele laite (GPU tai CPU)
180
+ device = 0 if torch.cuda.is_available() else -1
181
+
182
+ # Lataa tiivistämispipeline
183
+ summarizer = pipeline("summarization", model=new_model, tokenizer=new_tokenizer, device=device)
184
+
185
+ # Tiivistä teksti
186
+ summary = summarizer(text,
187
+ max_length=120,
188
+ min_length=30,
189
+ do_sample=False,
190
+ num_beams=10,
191
+ repetition_penalty=5.0,
192
+ no_repeat_ngram_size=2,
193
+ length_penalty=1.0)[0]["summary_text"]
194
+
195
+ # Clean the summary by removing the <extra_id_0> token
196
+
197
+ import re
198
+
199
+ # Regular expression to match both <extra_id_X> and <id_XX>
200
+ pattern = r"<(extra_id_\d+|id_\d+)>"
201
+
202
+ # Replace all matches with a space
203
+ cleaned_summary = re.sub(pattern, " ", summary).strip()
204
+
205
+
206
+ print(cleaned_summary)
207
+
208
+
209
+ # Niinan koodi
210
# NOTE: `gradio` and `PyMuPDF` must be installed before this file runs (for
# example via requirements.txt); `!pip install ...` is notebook syntax and a
# SyntaxError in a plain .py file.
211
+
212
+ import gradio as gr
213
+ from transformers import T5Tokenizer, MT5ForConditionalGeneration
214
+ import fitz # PyMuPDF
215
+
216
+ # Load the fine-tuned tokenizer and model
217
+ model_name = "fine-tuned-mt5"
218
+ new_tokenizer = T5Tokenizer.from_pretrained(model_name, clean_up_tokenization_spaces=True)
219
+ new_model = MT5ForConditionalGeneration.from_pretrained(model_name)
220
+
221
+ # Function to extract text from PDF using PyMuPDF
222
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_file: A path to the PDF (string or path-like), or an uploaded-file
            object exposing the path as `.name`.

    Returns:
        The extracted text; an empty string if no page yields any text.
    """
    import os

    # Upload widgets may hand over a temp-file wrapper instead of a path.
    # Paths are checked first because path objects also have a `.name`
    # attribute, which holds only the base name.
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = pdf_file
    else:
        pdf_path = getattr(pdf_file, "name", pdf_file)

    # The context manager closes the document once all pages are read.
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)
229
+
230
+ # Summarization function
231
def _trim_to_last_sentence(summary):
    """Cut *summary* after its final period so it does not end mid-sentence."""
    last_period_index = summary.rfind('.')
    if last_period_index == -1:
        # No period at all: keep the text as it is.
        return summary
    return summary[:last_period_index + 1]


def summarize_pdf(pdf_file, max_summary_length):
    """Summarize the text of an uploaded PDF with the fine-tuned model.

    Args:
        pdf_file: The uploaded PDF, as accepted by `extract_text_from_pdf`.
        max_summary_length: Upper bound on the generated summary length in
            tokens; converted to `int` before use.

    Returns:
        The cleaned summary, "No valid summary generated." when nothing usable
        is produced, or the error message if any step raises.
    """
    try:
        # Extraction sits inside the try so that an unreadable upload is
        # reported the same way as a generation failure.
        input_text = extract_text_from_pdf(pdf_file)
        if not input_text.strip():
            # Nothing to summarize, e.g. a PDF with no extractable text.
            return "No valid summary generated."

        # Use the same "summarize: " prefix and 1024-token limit that
        # preprocess() applies to the training articles, so long documents
        # are truncated instead of being fed to the model at full length.
        tokenized_input = new_tokenizer.encode(
            "summarize: " + input_text,
            return_tensors='pt',
            max_length=1024,
            truncation=True,
        )

        # Generate the summary. The length is cast to int because a UI slider
        # may deliver its value as a float.
        summary_ids = new_model.generate(
            tokenized_input,
            max_length=int(max_summary_length),
            min_length=30,
            num_beams=15,
            repetition_penalty=5.0,
            no_repeat_ngram_size=2
        )

        # Decode the generated summary
        summary = new_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

        # Drop leftover sentinel tokens such as <extra_id_0>.
        kept_tokens = [token for token in summary.split() if not token.startswith('<extra_id_')]
        cleaned_summary = _trim_to_last_sentence(' '.join(kept_tokens))

        return cleaned_summary if cleaned_summary else "No valid summary generated."

    except Exception as e:
        return str(e)  # Return the error message for debugging
269
+
270
+ # Define the Gradio interface
271
# Input widgets: the PDF upload and the maximum summary length.
pdf_upload = gr.File(label="Upload PDF")
length_slider = gr.Slider(50, 300, step=10, label="Max summary length")

# Wire summarize_pdf into a web form; the summary is shown in a textbox.
interface = gr.Interface(
    fn=summarize_pdf,
    inputs=[pdf_upload, length_slider],
    outputs="textbox",
    title="PDF Text Summarizer",
    description="Upload a PDF file to summarize its content.",
)

# Launch the interface with debug mode enabled
interface.launch(debug=True)