Summarizer

Runtime error

App Files Files Community

eevaw committed on Oct 21, 2024

Commit

347b20d

verified ·

1 Parent(s): 348f4f3

Update app.py

Browse files

Files changed (1) hide show

app.py +284 -284

app.py CHANGED Viewed

@@ -1,284 +1,284 @@
-import transformers
-import datasets
-import torch
-import sentencepiece
-import evaluate
-from datasets import load_dataset
-from transformers import MT5ForConditionalGeneration, T5Tokenizer
-import re
-# Load dataset
-ds = load_dataset("scillm/scientific_papers-archive", split="test")
-# Select the first 1000 examples
-small_ds = ds.select(range(1000))
-# Preprocessing function to remove unwanted references
-def preprocess_text(text):
-    # Remove unwanted references like @xcite
-    text = re.sub(r'@\w+', '', text)  # Remove anything that starts with @
-    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
-    return text
-# Preprocessing function
-def preprocess(examples):
-    # Preprocess articles and summaries
-    articles = [preprocess_text(article) for article in examples["input"]]
-    outputs = [preprocess_text(output) for output in examples["output"]]
-    # Add prefix to the articles
-    inputs = ["summarize: " + article for article in articles]
-    # Tokenize articles
-    model_inputs = tokenizer(inputs, max_length=1024, truncation=True, padding="max_length")
-    # Tokenize summaries
-    labels = tokenizer(outputs, max_length=128, truncation=True, padding="max_length")
-    model_inputs["labels"] = labels["input_ids"]
-    return model_inputs
-# Load mT5 model and tokenizer
-model_name = "google/mt5-small"  # You can also use other mT5 models
-tokenizer = T5Tokenizer.from_pretrained(model_name)
-model = MT5ForConditionalGeneration.from_pretrained(model_name)
-# Tokenize the smaller dataset
-tokenized_small_ds = small_ds.map(preprocess, batched=True)
-# Verify that the dataset is correctly tokenized
-print(tokenized_small_ds[0])
-# Split the data into train and test set
-small_ds = ds.train_test_split(test_size=0.2)
-small_ds["train"][0]
-print(small_ds['train'].features)
-print(small_ds.column_names)
-from transformers import T5Tokenizer
-model_name = "google/mt5-small"
-tokenizer = T5Tokenizer.from_pretrained(model_name)
-# Apply preprocessing function to dataset
-tokenized_ds = small_ds.map(preprocess, batched=True)
-from transformers import DataCollatorForSeq2Seq
-data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model_name)
-import torch
-torch.cuda.empty_cache()
-nvidia-smi
-!pip install wandb
-import wandb
-wandb.login()
-from transformers import MT5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
-import torch
-# Load the model
-model_name = "google/mt5-small"
-model = MT5ForConditionalGeneration.from_pretrained(model_name)
-# Set the device
-device = torch.device("cpu")
-model.to(device)
-# Ensure model parameters are contiguous
-for name, param in model.named_parameters():
-    if not param.is_contiguous():
-        param.data = param.data.contiguous()  # Make the tensor contiguous
-        print(f"Made {name} contiguous.")
-training_args = Seq2SeqTrainingArguments(
-    output_dir='./results',
-    num_train_epochs=10,
-    per_device_train_batch_size=4,  # Pienennä batch-kokoa
-    per_device_eval_batch_size=4,
-    evaluation_strategy='epoch',
-    logging_dir='./logs',
-    predict_with_generate=True
-)
-# Create trainer instance
-trainer = Seq2SeqTrainer(
-    model=model,
-    args=training_args,
-    train_dataset=tokenized_small_ds.shuffle().select(range(80)),  # Käytetään 800 esimerkkiä koulutukseen
-    eval_dataset=tokenized_small_ds.shuffle().select(range(20, 100)),  # Käytetään 200 esimerkkiä arvioimiseen
-)
-# Kouluta malli
-trainer.train()
-pip install rouge_score
-import evaluate
-rouge = evaluate.load("rouge")
-def compute_metrics(eval_pred):
-    predictions, labels = eval_pred
-    # Decode predictions and labels (remove special tokens)
-    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
-    # Replace -100 in labels (ignore index) with the padding token id
-    labels[labels == -100] = tokenizer.pad_token_id
-    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
-    # Compute ROUGE scores using the `evaluate` library
-    rouge_output = rouge.compute(predictions=decoded_preds, references=decoded_labels)
-    return {
-        "rouge1": rouge_output["rouge1"],
-        "rouge2": rouge_output["rouge2"],
-        "rougeL": rouge_output["rougeL"],
-    }
-# Update trainer to include costom metrics
-trainer.compute_metrics = compute_metrics
-# Evaluate the model
-eval_result = trainer.evaluate()
-print(eval_result)
-# Save the fine-tuned model
-trainer.save_model("fine-tuned-mt5")
-tokenizer.save_pretrained("fine-tuned-mt5")
-# Load required libraries
-from transformers import T5Tokenizer, MT5ForConditionalGeneration
-# Load the fine-tuned tokenizer and model
-model_name = "fine-tuned-mt5"
-new_tokenizer = T5Tokenizer.from_pretrained(model_name, clean_up_tokenization_spaces=True)
-new_model = MT5ForConditionalGeneration.from_pretrained(model_name)
-from transformers import pipeline
-import torch
-# Syötteesi
-# Restructured input
-text = (
-    "Summarize the following information regarding psoriasis, its effects on skin health, and its potential health risks:\n\n"
-    "1. Psoriasis is an autoimmune condition that leads to inflammation in the skin.\n"
-    "2. Immune system dysfunction causes inflammatory cells to accumulate in the dermis, the middle layer of the skin.\n"
-    "3. The condition accelerates skin cell growth, with skin cells shedding more quickly than usual.\n"
-    "4. This abnormal shedding results in uncomfortable symptoms like raised plaques, scales, and redness.\n"
-    "5. Psoriasis not only affects the skin but also increases the risk of serious health issues, including heart disease, cancer, and inflammatory bowel disease.\n\n"
-    "Please provide a summary."
-)
-# Määrittele laite (GPU tai CPU)
-device = 0 if torch.cuda.is_available() else -1
-# Lataa tiivistämispipeline
-summarizer = pipeline("summarization", model=new_model, tokenizer=new_tokenizer, device=device)
-# Tiivistä teksti
-summary = summarizer(text,
-                     max_length=120,
-                     min_length=30,
-                     do_sample=False,
-                     num_beams=10,
-                     repetition_penalty=5.0,
-                     no_repeat_ngram_size=2,
-                     length_penalty=1.0)[0]["summary_text"]
-# Clean the summary by removing the <extra_id_0> token
-import re
-# Regular expression to match both <extra_id_X> and <id_XX>
-pattern = r"<(extra_id_\d+|id_\d+)>"
-# Replace all matches with a space
-cleaned_summary = re.sub(pattern, " ", summary).strip()
-print(cleaned_summary)
-# Niinan koodi
-!pip install gradio PyMuPDF
-import gradio as gr
-from transformers import T5Tokenizer, MT5ForConditionalGeneration
-import fitz  # PyMuPDF
-# Load the fine-tuned tokenizer and model
-model_name = "fine-tuned-mt5"
-new_tokenizer = T5Tokenizer.from_pretrained(model_name, clean_up_tokenization_spaces=True)
-new_model = MT5ForConditionalGeneration.from_pretrained(model_name)
-# Function to extract text from PDF using PyMuPDF
-def extract_text_from_pdf(pdf_file):
-    text = ""
-    # Open the PDF file
-    with fitz.open(pdf_file) as doc:
-        for page in doc:
-            text += page.get_text()  # Extract text from each page
-    return text
-# Summarization function
-def summarize_pdf(pdf_file, max_summary_length):
-    # Extract text from the PDF
-    input_text = extract_text_from_pdf(pdf_file)
-    # Tokenize the input to check length
-    tokenized_input = new_tokenizer.encode(input_text, return_tensors='pt')
-    try:
-        # Generate the summary
-        summary_ids = new_model.generate(
-            tokenized_input,
-            max_length=max_summary_length,
-            min_length=30,
-            num_beams=15,
-            repetition_penalty=5.0,
-            no_repeat_ngram_size=2
-        )
-        # Decode the generated summary
-        summary = new_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
-        # Clean up the summary to remove unwanted tokens
-        cleaned_summary = ' '.join([token for token in summary.split() if not token.startswith('<extra_id_')]).strip()
-        # Ensure the summary ends with a complete sentence
-        if cleaned_summary:
-            last_period_index = cleaned_summary.rfind('.')
-            if last_period_index != -1 and last_period_index < len(cleaned_summary) - 1:
-                cleaned_summary = cleaned_summary[:last_period_index + 1]
-            else:
-                cleaned_summary = cleaned_summary.strip()
-        return cleaned_summary if cleaned_summary else "No valid summary generated."
-    except Exception as e:
-        return str(e)  # Return the error message for debugging
-# Define the Gradio interface
-interface = gr.Interface(
-    fn=summarize_pdf,
-    inputs=[
-        gr.File(label="Upload PDF"),
-        gr.Slider(50, 300, step=10, label="Max summary length")
-    ],
-    outputs="textbox",  # A textbox for the output summary
-    title="PDF Text Summarizer",
-    description="Upload a PDF file to summarize its content."
-)
-# Launch the interface
-# Launch the interface with debug mode enabled
-interface.launch(debug=True)

+import transformers
+import datasets
+import torch
+import sentencepiece
+import evaluate
+from datasets import load_dataset
+from transformers import MT5ForConditionalGeneration, T5Tokenizer
+import re
+# Load dataset
+ds = load_dataset("scillm/scientific_papers-archive", split="test")
+# Select the first 1000 examples
+small_ds = ds.select(range(1000))
+# Preprocessing function to remove unwanted references
+def preprocess_text(text):
+    # Remove unwanted references like @xcite
+    text = re.sub(r'@\w+', '', text)  # Remove anything that starts with @
+    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
+    return text
+# Preprocessing function
+def preprocess(examples):
+    # Preprocess articles and summaries
+    articles = [preprocess_text(article) for article in examples["input"]]
+    outputs = [preprocess_text(output) for output in examples["output"]]
+    # Add prefix to the articles
+    inputs = ["summarize: " + article for article in articles]
+    # Tokenize articles
+    model_inputs = tokenizer(inputs, max_length=1024, truncation=True, padding="max_length")
+    # Tokenize summaries
+    labels = tokenizer(outputs, max_length=128, truncation=True, padding="max_length")
+    model_inputs["labels"] = labels["input_ids"]
+    return model_inputs
+# Load mT5 model and tokenizer
+model_name = "google/mt5-small"  # You can also use other mT5 models
+tokenizer = T5Tokenizer.from_pretrained(model_name)
+model = MT5ForConditionalGeneration.from_pretrained(model_name)
+# Tokenize the smaller dataset
+tokenized_small_ds = small_ds.map(preprocess, batched=True)
+# Verify that the dataset is correctly tokenized
+print(tokenized_small_ds[0])
+# Split the data into train and test set
+small_ds = ds.train_test_split(test_size=0.2)
+small_ds["train"][0]
+print(small_ds['train'].features)
+print(small_ds.column_names)
+from transformers import T5Tokenizer
+model_name = "google/mt5-small"
+tokenizer = T5Tokenizer.from_pretrained(model_name)
+# Apply preprocessing function to dataset
+tokenized_ds = small_ds.map(preprocess, batched=True)
+from transformers import DataCollatorForSeq2Seq
+data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model_name)
+import torch
+torch.cuda.empty_cache()
+nvidia-smi
+pip install wandb
+import wandb
+wandb.login()
+from transformers import MT5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
+import torch
+# Load the model
+model_name = "google/mt5-small"
+model = MT5ForConditionalGeneration.from_pretrained(model_name)
+# Set the device
+device = torch.device("cpu")
+model.to(device)
+# Ensure model parameters are contiguous
+for name, param in model.named_parameters():
+    if not param.is_contiguous():
+        param.data = param.data.contiguous()  # Make the tensor contiguous
+        print(f"Made {name} contiguous.")
+training_args = Seq2SeqTrainingArguments(
+    output_dir='./results',
+    num_train_epochs=10,
+    per_device_train_batch_size=4,  # Pienennä batch-kokoa
+    per_device_eval_batch_size=4,
+    evaluation_strategy='epoch',
+    logging_dir='./logs',
+    predict_with_generate=True
+)
+# Create trainer instance
+trainer = Seq2SeqTrainer(
+    model=model,
+    args=training_args,
+    train_dataset=tokenized_small_ds.shuffle().select(range(80)),  # Käytetään 800 esimerkkiä koulutukseen
+    eval_dataset=tokenized_small_ds.shuffle().select(range(20, 100)),  # Käytetään 200 esimerkkiä arvioimiseen
+)
+# Train the model
+trainer.train()
+pip install rouge_score
+import evaluate
+rouge = evaluate.load("rouge")
+def compute_metrics(eval_pred):
+    predictions, labels = eval_pred
+    # Decode predictions and labels (remove special tokens)
+    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
+    # Replace -100 in labels (ignore index) with the padding token id
+    labels[labels == -100] = tokenizer.pad_token_id
+    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
+    # Compute ROUGE scores using the `evaluate` library
+    rouge_output = rouge.compute(predictions=decoded_preds, references=decoded_labels)
+    return {
+        "rouge1": rouge_output["rouge1"],
+        "rouge2": rouge_output["rouge2"],
+        "rougeL": rouge_output["rougeL"],
+    }
+# Update trainer to include custom metrics
+trainer.compute_metrics = compute_metrics
+# Evaluate the model
+eval_result = trainer.evaluate()
+print(eval_result)
+# Save the fine-tuned model
+trainer.save_model("fine-tuned-mt5")
+tokenizer.save_pretrained("fine-tuned-mt5")
+# Load required libraries
+from transformers import T5Tokenizer, MT5ForConditionalGeneration
+# Load the fine-tuned tokenizer and model
+model_name = "fine-tuned-mt5"
+new_tokenizer = T5Tokenizer.from_pretrained(model_name, clean_up_tokenization_spaces=True)
+new_model = MT5ForConditionalGeneration.from_pretrained(model_name)
+from transformers import pipeline
+import torch
+# Your input
+# Restructured input
+text = (
+    "Summarize the following information regarding psoriasis, its effects on skin health, and its potential health risks:\n\n"
+    "1. Psoriasis is an autoimmune condition that leads to inflammation in the skin.\n"
+    "2. Immune system dysfunction causes inflammatory cells to accumulate in the dermis, the middle layer of the skin.\n"
+    "3. The condition accelerates skin cell growth, with skin cells shedding more quickly than usual.\n"
+    "4. This abnormal shedding results in uncomfortable symptoms like raised plaques, scales, and redness.\n"
+    "5. Psoriasis not only affects the skin but also increases the risk of serious health issues, including heart disease, cancer, and inflammatory bowel disease.\n\n"
+    "Please provide a summary."
+)
+# Select the device (GPU or CPU)
+device = 0 if torch.cuda.is_available() else -1
+# Load the summarization pipeline
+summarizer = pipeline("summarization", model=new_model, tokenizer=new_tokenizer, device=device)
+# Summarize the text
+summary = summarizer(text,
+                     max_length=120,
+                     min_length=30,
+                     do_sample=False,
+                     num_beams=10,
+                     repetition_penalty=5.0,
+                     no_repeat_ngram_size=2,
+                     length_penalty=1.0)[0]["summary_text"]
+# Clean the summary by removing leftover <extra_id_N> / <id_N> tokens
+import re
+# Regular expression to match both <extra_id_X> and <id_XX>
+pattern = r"<(extra_id_\d+|id_\d+)>"
+# Replace all matches with a space
+cleaned_summary = re.sub(pattern, " ", summary).strip()
+print(cleaned_summary)
+# Niina's code
+!pip install gradio PyMuPDF
+import gradio as gr
+from transformers import T5Tokenizer, MT5ForConditionalGeneration
+import fitz  # PyMuPDF
+# Load the fine-tuned tokenizer and model
+model_name = "fine-tuned-mt5"
+new_tokenizer = T5Tokenizer.from_pretrained(model_name, clean_up_tokenization_spaces=True)
+new_model = MT5ForConditionalGeneration.from_pretrained(model_name)
+# Function to extract text from PDF using PyMuPDF
+def extract_text_from_pdf(pdf_file):
+    text = ""
+    # Open the PDF file
+    with fitz.open(pdf_file) as doc:
+        for page in doc:
+            text += page.get_text()  # Extract text from each page
+    return text
+# Summarization function
+def summarize_pdf(pdf_file, max_summary_length):
+    # Extract text from the PDF
+    input_text = extract_text_from_pdf(pdf_file)
+    # Tokenize the input to check length
+    tokenized_input = new_tokenizer.encode(input_text, return_tensors='pt')
+    try:
+        # Generate the summary
+        summary_ids = new_model.generate(
+            tokenized_input,
+            max_length=max_summary_length,
+            min_length=30,
+            num_beams=15,
+            repetition_penalty=5.0,
+            no_repeat_ngram_size=2
+        )
+        # Decode the generated summary
+        summary = new_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+        # Clean up the summary to remove unwanted tokens
+        cleaned_summary = ' '.join([token for token in summary.split() if not token.startswith('<extra_id_')]).strip()
+        # Ensure the summary ends with a complete sentence
+        if cleaned_summary:
+            last_period_index = cleaned_summary.rfind('.')
+            if last_period_index != -1 and last_period_index < len(cleaned_summary) - 1:
+                cleaned_summary = cleaned_summary[:last_period_index + 1]
+            else:
+                cleaned_summary = cleaned_summary.strip()
+        return cleaned_summary if cleaned_summary else "No valid summary generated."
+    except Exception as e:
+        return str(e)  # Return the error message for debugging
+# Define the Gradio interface
+interface = gr.Interface(
+    fn=summarize_pdf,
+    inputs=[
+        gr.File(label="Upload PDF"),
+        gr.Slider(50, 300, step=10, label="Max summary length")
+    ],
+    outputs="textbox",  # A textbox for the output summary
+    title="PDF Text Summarizer",
+    description="Upload a PDF file to summarize its content."
+)
+# Launch the interface
+# Launch the interface with debug mode enabled
+interface.launch(debug=True)