# app.py (Enhanced for Medical Models, Image Analysis, & OSINT)

import os
import re
import json
import torch
import fitz  # PyMuPDF
# CLIPProcessor/CLIPModel are imported for image analysis but not yet used below
from transformers import AutoTokenizer, AutoModelForCausalLM, CLIPProcessor, CLIPModel, DataCollatorForLanguageModeling
from sklearn.model_selection import train_test_split
from io import BytesIO
from PIL import Image
import gradio as gr
from torch.utils.data import Dataset, DataLoader
import pytesseract  # OCR; requires the Tesseract binary installed on the system
from PIL.ExifTags import TAGS


# Custom Dataset Class for Better Handling
class ConversationDataset(Dataset):
    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = labels

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return {
            'input_ids': self.inputs[idx][0],
            'labels': self.labels[idx][0]
        }

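
# The dataset class above implies a training loop, and the otherwise-unused
# imports (train_test_split, DataLoader, DataCollatorForLanguageModeling)
# suggest its shape. The sketch below is illustrative only: the name
# `fine_tune` and all hyperparameters are assumptions, not part of the
# original app, and everything is held in memory for simplicity.
def fine_tune(texts, model, tokenizer, epochs=1, lr=5e-5, batch_size=2):
    from torch.optim import AdamW

    train_texts, _val_texts = train_test_split(texts, test_size=0.1)
    # Fixed-length padding keeps every example the same shape, so the collator
    # only has to stack tensors and mask pad tokens out of the labels.
    encodings = [
        tokenizer(t, return_tensors='pt', truncation=True,
                  padding='max_length', max_length=256)
        for t in train_texts
    ]
    input_ids = [e.input_ids for e in encodings]
    dataset = ConversationDataset(input_ids, input_ids)  # causal LM: labels mirror inputs
    collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    loader = DataLoader(dataset, batch_size=batch_size, collate_fn=collator)

    optimizer = AdamW(model.parameters(), lr=lr)
    model.train()
    for _ in range(epochs):
        for batch in loader:
            optimizer.zero_grad()
            loss = model(**batch).loss
            loss.backward()
            optimizer.step()
    return model
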

def extract_metadata(image):
    """
    Extract EXIF metadata from an image. Note that images embedded in PDFs
    are usually re-encoded and carry no EXIF data, so the result is often
    empty.
    """
    metadata = {}
    try:
        # getexif() is the public Pillow API; _getexif() is private and JPEG-only
        exif_data = image.getexif()
        for tag, value in exif_data.items():
            tag_name = TAGS.get(tag, tag)
            metadata[tag_name] = value
    except Exception as e:
        metadata['error'] = str(e)
    return metadata


def extract_text_and_images_from_pdf(pdf_stream):
    doc = fitz.open(stream=pdf_stream, filetype='pdf')
    text = ''
    images = []

    for page_index in range(len(doc)):
        page = doc.load_page(page_index)
        text += page.get_text() + '\n'

        for img in page.get_images(full=True):
            xref = img[0]  # xref number of the embedded image object
            base_image = doc.extract_image(xref)
            image_bytes = base_image['image']
            image = Image.open(BytesIO(image_bytes))
            images.append(image)

    doc.close()
    return text, images


def analyze_images(images):
    results = {}

    for index, image in enumerate(images):
        metadata = extract_metadata(image)
        ocr_text = pytesseract.image_to_string(image)
        results[f"Image_{index+1}"] = {
            'metadata': metadata,
            'ocr_text': ocr_text
        }
    return results

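# Standalone usage sketch for the two helpers above ('report.pdf' is an
# illustrative path, not a file shipped with the app):
#
#     with open('report.pdf', 'rb') as f:
#         text, images = extract_text_and_images_from_pdf(BytesIO(f.read()))
#     print(analyze_images(images))
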

def generate_response(prompt, model, tokenizer, max_length=150):
    model.eval()
    with torch.no_grad():
        # keep the attention mask so generate() does not warn about padding
        inputs = tokenizer(prompt, return_tensors='pt')
        output = model.generate(**inputs, max_length=max_length, num_return_sequences=1)
        return tokenizer.decode(output[0], skip_special_tokens=True)

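# Example call, assuming the model and tokenizer loaded below (the prompt text
# is illustrative):
#
#     generate_response("COVID-19 is", model, tokenizer, max_length=100)
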

# Load BioGPT-Large once at startup; weights are downloaded on first run
tokenizer = AutoTokenizer.from_pretrained('microsoft/BioGPT-Large')
model = AutoModelForCausalLM.from_pretrained('microsoft/BioGPT-Large')


with gr.Blocks() as demo:
    gr.Markdown("# Medical LLM Model Training, PDF Image Analysis & OSINT")

    with gr.Tab("Generate Text"):
        prompt_input = gr.Textbox(label="Enter Medical Prompt")
        generate_output = gr.Textbox(label="Generated Medical Text")
        def generate(prompt):
            # model/tokenizer are not Gradio components, so close over them
            # instead of passing them through `inputs`
            return generate_response(prompt, model, tokenizer)

        # .submit runs when the user presses Enter; .change would fire on
        # every keystroke
        prompt_input.submit(generate, inputs=prompt_input, outputs=generate_output)

    with gr.Tab("PDF Analysis"):
        pdf_file = gr.File(label="Upload PDF")
        pdf_output = gr.Textbox(label="Extracted Text")
        image_analysis_output = gr.Textbox(label="Image Metadata & OCR")

        def process_pdf(file):
            # gr.File yields a temp-file handle; open it by path instead of
            # assuming the object itself is readable
            with open(file.name, 'rb') as f:
                pdf_content, images = extract_text_and_images_from_pdf(BytesIO(f.read()))
            image_analysis_results = analyze_images(images)
            # default=str guards against EXIF values that are not JSON-serializable
            return pdf_content, json.dumps(image_analysis_results, indent=4, default=str)

        pdf_file.upload(process_pdf, inputs=pdf_file, outputs=[pdf_output, image_analysis_output])


demo.launch()