import transformers 
import datasets 
import torch 
import sentencepiece 
import evaluate


from datasets import load_dataset
from transformers import MT5ForConditionalGeneration, T5Tokenizer
import re

# Load dataset
ds = load_dataset("scillm/scientific_papers-archive", split="test")

# Select the first 1000 examples
small_ds = ds.select(range(1000))

# Preprocessing function to remove unwanted references
def preprocess_text(text):
    # Remove unwanted references like @xcite
    text = re.sub(r'@\w+', '', text)  # Remove anything that starts with @
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text
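
# Quick illustrative check (hypothetical input): citation markers like @xcite
# and runs of whitespace should both disappear.
assert preprocess_text("the model @xcite  performs   well") == "the model performs well"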

# Preprocessing function
def preprocess(examples):
    # Preprocess articles and summaries
    articles = [preprocess_text(article) for article in examples["input"]]
    outputs = [preprocess_text(output) for output in examples["output"]]

    # Add prefix to the articles
    inputs = ["summarize: " + article for article in articles]

    # Tokenize articles
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True, padding="max_length")

    # Tokenize summaries
    labels = tokenizer(outputs, max_length=128, truncation=True, padding="max_length")

    # Replace padding token ids in the labels with -100 so they are ignored by
    # the loss (compute_metrics below maps them back before decoding)
    model_inputs["labels"] = [
        [(t if t != tokenizer.pad_token_id else -100) for t in seq]
        for seq in labels["input_ids"]
    ]

    return model_inputs

# Load mT5 model and tokenizer
model_name = "google/mt5-small"  # You can also use other mT5 models
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = MT5ForConditionalGeneration.from_pretrained(model_name)

# Tokenize the smaller dataset
tokenized_small_ds = small_ds.map(preprocess, batched=True)

# Verify that the dataset is correctly tokenized
print(tokenized_small_ds[0])
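
# Optional sanity check (illustrative): decode one tokenized article back to
# text to confirm the "summarize: " prefix and truncation behave as expected.
print(tokenizer.decode(tokenized_small_ds[0]["input_ids"], skip_special_tokens=True)[:200])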

# Split the data into train and test set
small_ds = small_ds.train_test_split(test_size=0.2)

small_ds["train"][0]

print(small_ds['train'].features)

print(small_ds.column_names)

from transformers import T5Tokenizer

model_name = "google/mt5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Apply preprocessing function to dataset
tokenized_ds = small_ds.map(preprocess, batched=True)

from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

import torch
torch.cuda.empty_cache()



# pip install wandb
import os
import wandb

# Authenticate with WandB using an API key read from the environment
api_key = os.getenv("API_KEY")
wandb.login(key=api_key)
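
# Alternative (optional, not used in this run): skip WandB logging entirely,
# e.g. by setting os.environ["WANDB_DISABLED"] = "true", or by passing
# report_to="none" in the Seq2SeqTrainingArguments below.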

from transformers import MT5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
import torch

# Load the model
model_name = "google/mt5-small"
model = MT5ForConditionalGeneration.from_pretrained(model_name)

# Set the device
device = torch.device("cpu")
model.to(device)
# Ensure model parameters are contiguous
for name, param in model.named_parameters():
    if not param.is_contiguous():
        param.data = param.data.contiguous()  # Make the tensor contiguous
        print(f"Made {name} contiguous.")

training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=4,  
    per_device_eval_batch_size=4,
    evaluation_strategy='epoch',
    logging_dir='./logs',
    predict_with_generate=True
)
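
# Note: newer transformers releases renamed `evaluation_strategy` to
# `eval_strategy`; if the call above raises a TypeError about an unexpected
# keyword argument, switching to the new name is the likely fix
# (version-dependent, so treat this as a hint rather than a requirement).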

# Create trainer instance; shuffle once with a fixed seed so the train and
# eval subsets are disjoint
shuffled_ds = tokenized_small_ds.shuffle(seed=42)
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=shuffled_ds.select(range(80)),  # use 80 examples for training
    eval_dataset=shuffled_ds.select(range(80, 160)),  # use 80 examples for evaluation
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# train the model
trainer.train()

#pip install rouge_score
import evaluate
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred

    # Decode predictions and labels (remove special tokens)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Replace -100 in labels (ignore index) with the padding token id
    labels[labels == -100] = tokenizer.pad_token_id
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute ROUGE scores using the `evaluate` library
    rouge_output = rouge.compute(predictions=decoded_preds, references=decoded_labels)

    return {
        "rouge1": rouge_output["rouge1"],
        "rouge2": rouge_output["rouge2"],
        "rougeL": rouge_output["rougeL"],
    }
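
# Illustrative note: with recent versions of `evaluate`, rouge.compute returns
# plain floats (e.g. {"rouge1": 0.21, ...}); older datasets-based metrics
# returned aggregate objects, in which case you would read
# rouge_output["rouge1"].mid.fmeasure instead.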

# Update the trainer to use the custom metrics
trainer.compute_metrics = compute_metrics

# Evaluate the model
eval_result = trainer.evaluate()
print(eval_result)

# Save the fine-tuned model
trainer.save_model("fine-tuned-mt5")
tokenizer.save_pretrained("fine-tuned-mt5")

# Load required libraries
from transformers import T5Tokenizer, MT5ForConditionalGeneration

# Load the fine-tuned tokenizer and model
model_name = "fine-tuned-mt5"
new_tokenizer = T5Tokenizer.from_pretrained(model_name, clean_up_tokenization_spaces=True)
new_model = MT5ForConditionalGeneration.from_pretrained(model_name)

from transformers import pipeline
import torch


# Restructured input
text = (
    "Summarize the following information regarding psoriasis, its effects on skin health, and its potential health risks:\n\n"
    "1. Psoriasis is an autoimmune condition that leads to inflammation in the skin.\n"
    "2. Immune system dysfunction causes inflammatory cells to accumulate in the dermis, the middle layer of the skin.\n"
    "3. The condition accelerates skin cell growth, with skin cells shedding more quickly than usual.\n"
    "4. This abnormal shedding results in uncomfortable symptoms like raised plaques, scales, and redness.\n"
    "5. Psoriasis not only affects the skin but also increases the risk of serious health issues, including heart disease, cancer, and inflammatory bowel disease.\n\n"
    "Please provide a summary."
)


# Define the device (GPU or CPU)
device = 0 if torch.cuda.is_available() else -1

# Load the pipeline
summarizer = pipeline("summarization", model=new_model, tokenizer=new_tokenizer, device=device)

# Summarize the text
summary = summarizer(text,
                     max_length=120,
                     min_length=30,
                     do_sample=False,
                     num_beams=10,
                     repetition_penalty=5.0,
                     no_repeat_ngram_size=2,
                     length_penalty=1.0)[0]["summary_text"]

# Clean the summary by removing leftover sentinel tokens such as <extra_id_0>

import re

# Regular expression to match both <extra_id_N> and <id_N> tokens
pattern = r"<(extra_id_\d+|id_\d+)>"

# Replace all matches with a space
cleaned_summary = re.sub(pattern, " ", summary).strip()


print(cleaned_summary)




import gradio as gr
from transformers import T5Tokenizer, MT5ForConditionalGeneration
import fitz  # PyMuPDF

# Load the fine-tuned tokenizer and model
model_name = "fine-tuned-mt5"
new_tokenizer = T5Tokenizer.from_pretrained(model_name, clean_up_tokenization_spaces=True)
new_model = MT5ForConditionalGeneration.from_pretrained(model_name)

# Function to extract text from PDF using PyMuPDF
def extract_text_from_pdf(pdf_file):
    text = ""
    # Open the PDF file
    with fitz.open(pdf_file) as doc:
        for page in doc:
            text += page.get_text()  # Extract text from each page
    return text
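
# The model was fine-tuned with inputs truncated to 1024 tokens, so long PDFs
# are cut off below. A minimal chunking sketch (assumption: fixed-size token
# windows are acceptable for this use case) could split the text first and
# summarize each window separately:
def chunk_text(text, tokenizer, chunk_tokens=900):
    ids = tokenizer.encode(text)
    return [tokenizer.decode(ids[i:i + chunk_tokens], skip_special_tokens=True)
            for i in range(0, len(ids), chunk_tokens)]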

# Summarization function
def summarize_pdf(pdf_file, max_summary_length):
    # Extract text from the PDF
    input_text = extract_text_from_pdf(pdf_file)

    # Tokenize the input, truncating to the 1024-token limit used in fine-tuning
    tokenized_input = new_tokenizer.encode(input_text, return_tensors='pt',
                                           truncation=True, max_length=1024)



    try:
        # Generate the summary
        summary_ids = new_model.generate(
            tokenized_input,
            max_length=max_summary_length,
            min_length=30,
            num_beams=15,
            repetition_penalty=5.0,
            no_repeat_ngram_size=2
        )

        # Decode the generated summary
        summary = new_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

        # Clean up the summary to remove unwanted tokens
        cleaned_summary = ' '.join([token for token in summary.split() if not token.startswith('<extra_id_')]).strip()

        # Ensure the summary ends with a complete sentence
        if cleaned_summary:
            last_period_index = cleaned_summary.rfind('.')
            if last_period_index != -1 and last_period_index < len(cleaned_summary) - 1:
                cleaned_summary = cleaned_summary[:last_period_index + 1]
            else:
                cleaned_summary = cleaned_summary.strip()

        return cleaned_summary if cleaned_summary else "No valid summary generated."

    except Exception as e:
        return str(e)  # Return the error message for debugging
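
# Example usage outside the UI (hypothetical path):
# print(summarize_pdf("paper.pdf", max_summary_length=150))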

# Define the Gradio interface
interface = gr.Interface(
    fn=summarize_pdf,
    inputs=[
        gr.File(label="Upload PDF"),
        gr.Slider(50, 300, step=10, label="Max summary length")
    ],
    outputs="textbox",  # A textbox for the output summary
    title="PDF Text Summarizer",
    description="Upload a PDF file to summarize its content."
)


# Launch the interface with debug mode enabled
interface.launch(debug=True)