import json
from io import BytesIO

import gradio as gr
import pytesseract
import spacy
from PIL import Image
from PyPDF2 import PdfReader
from spacy import displacy
from transformers import AutoTokenizer, pipeline

# Initialise the NER and summarization models
ner_model = pipeline("token-classification", model="dslim/bert-large-NER")
ner_model_bio = pipeline("token-classification", model="d4data/biomedical-ner-all")
summarization_pipeline = pipeline("summarization", model="facebook/bart-large-cnn")

ner_models = {
    "bert-large-NER": "dslim/bert-large-NER",
    "bioNER": "d4data/biomedical-ner-all",
    "SpaCy English NER": "en_core_web_trf",
}
spacy_ner_model = spacy.load(ner_models["SpaCy English NER"])

tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")


# Extract text from a PDF file
def extract_text_from_pdf(pdf_bytes):
    text = ""
    pdf_file = BytesIO(pdf_bytes)
    pdf_reader = PdfReader(pdf_file)
    for page in pdf_reader.pages:
        # extract_text() can return None for pages without a text layer
        text += page.extract_text() or ""
    return text


# Extract text from a PDF or an image file
def extract_text_from_image_or_pdf(file_bytes):
    try:
        if file_bytes.startswith(b"%PDF"):
            text = extract_text_from_pdf(file_bytes)
        else:
            image = Image.open(BytesIO(file_bytes))
            text = pytesseract.image_to_string(image)
        return text
    except Exception as e:
        return f"Error extracting file: {str(e)}"


# Perform Named Entity Recognition (NER) on the given text
def perform_ner(text, model_name):
    try:
        if model_name == "SpaCy English NER":
            doc = spacy_ner_model(text)
            extracted_entities = [
                {
                    "text": ent.text,
                    "type": ent.label_,
                    "start_index": ent.start_char,
                    "end_index": ent.end_char,
                }
                for ent in doc.ents
            ]
        else:
            model = ner_model if model_name == "bert-large-NER" else ner_model_bio
            entities = model(text)
            extracted_entities = [
                {
                    "text": entity["word"],
                    "type": entity["entity"],
                    "start_index": entity["start"],
                    "end_index": entity["end"],
                }
                for entity in entities
            ]
        return extracted_entities
    except Exception as e:
        return f"Error performing NER: {str(e)}"


# Takes raw text and a list of entities with their start/end indices and wraps each
# entity in a coloured, labelled span
def highlight_entities_with_colors_and_label_tokenized(text, entities, color_mapping, tokenizer):
    highlighted_text = ""
    current_pos = 0
    for ent in entities:
        start, end, label = (
            ent.get("start_index", 0),
            ent.get("end_index", 0),
            ent.get("type", "O"),
        )
        entity_text = text[start:end]

        # Tokenize the entity text
        encoded_entity = tokenizer.encode(entity_text, add_special_tokens=False)
        tokenized_entity_text = tokenizer.convert_ids_to_tokens(encoded_entity)
        tokenized_entity_length = len(tokenized_entity_text)

        # Add the non-entity text preceding this entity
        highlighted_text += text[current_pos:start]

        # Add the entity text highlighted with its colour and label
        color = color_mapping.get(label, "#4D94FF")
        highlighted_text += (
            f'<span style="background-color: {color};">{entity_text} ({label})</span>'
        )

        # Update the current position
        current_pos = end

    # Add any remaining non-entity text
    highlighted_text += text[current_pos:]
    return highlighted_text


# Highlight named entities using the given colour mapping
def highlight_entities(text, entities, model_name):
    try:
        if model_name == "SpaCy English NER":
            doc = spacy_ner_model(text)
            color_mapping = {
                "DATE": "#4D94FF",         # Blue
                "PERSON": "#4CAF50",       # Green
                "EVENT": "#FF6666",        # Salmon
                "FAC": "#66B2FF",          # Sky Blue
                "GPE": "#FFCC99",          # Light Apricot
                "LANGUAGE": "#FF80BF",     # Pink
                "LAW": "#66FF99",          # Mint
                "LOC": "#809FFF",          # Lavender Blue
                "MONEY": "#FFFF99",        # Light Yellow
                "NORP": "#808000",         # Olive Green
                "ORDINAL": "#FF9999",      # Misty Rose
                "ORG": "#FFB366",          # Light Peach
                "PERCENT": "#FF99FF",      # Orchid
                "PRODUCT": "#FF6666",      # Salmon
                "QUANTITY": "#CC99FF",     # Pastel Purple
                "TIME": "#FFD54F",         # Amber
                "WORK_OF_ART": "#FFC266",  # Light Orange
                "CARDINAL": "#008080",     # Teal
            }
            options = {
                "ents": [entity["type"] for entity in entities],
                "colors": color_mapping,
            }
            html = displacy.render(doc, style="ent", options=options, page=True)
            return html
        else:
            color_mapping = {
                "O": "pink",
                "B-MISC": "red",
                "I-MISC": "brown",
                "B-PER": "green",
                "I-PER": "#FFD54F",
                "B-ORG": "orange",
                "I-ORG": "#FF6666",
                "B-LOC": "purple",
                "I-LOC": "#FFCC99",
            }
            highlighted_example = highlight_entities_with_colors_and_label_tokenized(
                text, entities, color_mapping, tokenizer
            )
            return highlighted_example
    except Exception as e:
        return f"Error highlighting entities: {str(e)}"


# Summarize text
def summarized_text(input_text):
    summary = summarization_pipeline(
        input_text,
        max_length=150,
        min_length=50,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    return summary[0]["summary_text"]


# Run the full pipeline: text extraction, NER, highlighting and summarization
def image_ner_tool(file, model_name):
    reformatted_ner_output = ""
    try:
        if isinstance(file, str):
            with open(file, "rb") as file_stream:
                file_bytes = file_stream.read()
        else:
            file_bytes = file.getvalue()
        text = extract_text_from_image_or_pdf(file_bytes)
        entities = perform_ner(text, model_name)
        highlighted_text = highlight_entities(text, entities, model_name)
        reformatted_ner_output = json.dumps(entities, indent=2)
        summary = summarized_text(text)
        return text, highlighted_text, reformatted_ner_output, summary
    except Exception as e:
        error_message = f"Error processing file: {str(e)}"
        # Return a value for every output component so Gradio does not fail
        return error_message, "", reformatted_ner_output, ""


# Gradio UI
# Custom CSS/JS can be passed to gr.Blocks here if needed
with gr.Blocks(theme="shivi/calm_seafoam") as demo:
    gr.Markdown(
        """

# Intelligent Document Processing

Upload a PDF or an image file to extract text and identify named entities

""" ) with gr.Row() as row: with gr.Column(): text1 = gr.File(label="Upload File") model = gr.Dropdown(list(ner_models.keys()), label="Select NER Model") btn = gr.Button("Submit") with gr.Column(): with gr.Tab("Extract Text"): output1 = gr.Textbox(label="Extracted Text", container=True) with gr.Tab("Highlighted Entitiled"): output2 = gr.HTML("Summarize Text") with gr.Tab("Summarized Text"): output3 = gr.HTML("Summarize Text") with gr.Tab("Named Entities Extracted"): output4 = gr.HTML(label="Named Entities") btn.click( image_ner_tool, [text1, model], [output1, output2, output4, output3], ) demo.launch(share=True)