|
|
|
|
|
import json
import os
import re
from io import BytesIO
from pathlib import Path

import fitz
import gradio as gr
import pytesseract
import torch
from PIL import Image
from PIL.ExifTags import TAGS
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    CLIPModel,
    CLIPProcessor,
    DataCollatorForLanguageModeling,
)
|
|
|
|
|
|
|
class ConversationDataset(Dataset):
    """Map-style dataset that pairs each input sequence with its label sequence.

    Both containers are indexed in lockstep; the dataset length is taken from
    ``inputs`` alone, so ``labels`` must provide at least as many entries.
    """

    def __init__(self, inputs, labels):
        """Keep references to the (already prepared) inputs and labels."""
        self.inputs, self.labels = inputs, labels

    def __len__(self):
        """Return the number of input samples."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return the first element of the ``idx``-th input and label.

        NOTE(review): taking ``[0]`` presumably strips a leading batch
        dimension added by the tokenizer -- confirm against the code that
        builds ``inputs`` and ``labels``.
        """
        first_input = self.inputs[idx][0]
        first_label = self.labels[idx][0]
        return dict(input_ids=first_input, labels=first_label)
|
|
|
|
|
def extract_metadata(image):
    """Extract EXIF metadata from an image object.

    Args:
        image: An image object exposing ``_getexif()`` and/or ``getexif()``
            (e.g. a Pillow image).

    Returns:
        dict: EXIF values keyed by tag name (or by the raw tag id when the
        tag is not listed in ``TAGS``). Empty when the image carries no EXIF
        data. Extraction is best-effort: if anything fails, the message of
        the exception is stored under the ``'error'`` key instead of being
        raised.
    """
    metadata = {}
    try:
        # ``_getexif`` is a private helper that only some image classes
        # define; fall back to the public ``getexif`` so other images are
        # not reported as an "has no attribute" error.
        read_exif = getattr(image, '_getexif', None)
        if read_exif is None:
            read_exif = image.getexif
        exif_data = read_exif()
        if exif_data:
            for tag, value in exif_data.items():
                metadata[TAGS.get(tag, tag)] = value
    except Exception as e:
        # Deliberate best-effort: metadata problems must not abort the caller.
        metadata['error'] = str(e)
    return metadata
|
|
|
|
|
def extract_text_and_images_from_pdf(pdf_stream):
    """Extract the text and the embedded images of a PDF.

    Args:
        pdf_stream: In-memory PDF content handed to ``fitz.open(stream=...)``.

    Returns:
        tuple: ``(text, images)`` where ``text`` is the text of every page,
        each page followed by a newline, and ``images`` is a list of PIL
        images in page order (one entry per image reference on each page).

    Raises:
        Whatever ``fitz`` or ``PIL`` raise for unreadable documents or
        images; the document is closed before the exception propagates.
    """
    doc = fitz.open(stream=pdf_stream, filetype='pdf')
    page_texts = []
    images = []
    try:
        for page in doc:
            page_texts.append(page.get_text())
            for img in page.get_images(full=True):
                xref = img[0]  # first field of the image entry is its xref
                image_bytes = doc.extract_image(xref)['image']
                images.append(Image.open(BytesIO(image_bytes)))
    finally:
        # Release the document even when a page or image fails to decode.
        doc.close()

    text = ''.join(f'{page_text}\n' for page_text in page_texts)
    return text, images
|
|
|
|
|
def analyze_images(images):
    """Run metadata extraction and OCR on every image.

    Args:
        images: Iterable of images accepted by ``extract_metadata`` and
            ``pytesseract.image_to_string``.

    Returns:
        dict: One entry per image, keyed ``"Image_1"``, ``"Image_2"``, ...
        in input order; each value holds the image's ``'metadata'`` dict and
        its ``'ocr_text'`` string.
    """
    def describe(image):
        # Metadata first, then OCR -- same order as the results are reported.
        exif = extract_metadata(image)
        recognized = pytesseract.image_to_string(image)
        return {'metadata': exif, 'ocr_text': recognized}

    return {
        f"Image_{index+1}": describe(image)
        for index, image in enumerate(images)
    }
|
|
|
|
|
def generate_response(prompt, model, tokenizer, max_length=150):
    """Generate a text continuation for ``prompt``.

    Args:
        prompt: Text to feed to the model.
        model: Causal language model exposing ``eval()`` and ``generate()``.
        tokenizer: Tokenizer matching ``model``; called with
            ``return_tensors='pt'`` and used to decode the output.
        max_length: Value forwarded to ``model.generate`` as ``max_length``.

    Returns:
        str: The first generated sequence, decoded with special tokens
        skipped.
    """
    model.eval()
    with torch.no_grad():
        encoded = tokenizer(prompt, return_tensors='pt')
        input_ids = encoded['input_ids']
        attention_mask = encoded.get('attention_mask')

        # Inputs must live on the same device as the model, otherwise
        # generation fails for a model that was moved off the CPU.
        device = getattr(model, 'device', None)
        if device is not None:
            input_ids = input_ids.to(device)
            if attention_mask is not None:
                attention_mask = attention_mask.to(device)

        # Forward the tokenizer's attention mask instead of making
        # ``generate`` guess it from the input ids.
        extra_inputs = {}
        if attention_mask is not None:
            extra_inputs['attention_mask'] = attention_mask

        output = model.generate(
            input_ids,
            max_length=max_length,
            num_return_sequences=1,
            **extra_inputs,
        )
        return tokenizer.decode(output[0], skip_special_tokens=True)
|
|
|
|
|
# Checkpoint shared by the tokenizer and the causal language model, so the
# two can never drift apart.
_MODEL_CHECKPOINT = 'microsoft/BioGPT-Large'

# Loaded once at import time; both objects are reused by the UI callbacks.
tokenizer = AutoTokenizer.from_pretrained(_MODEL_CHECKPOINT)
model = AutoModelForCausalLM.from_pretrained(_MODEL_CHECKPOINT)
|
|
|
|
|
# Gradio UI: one tab for text generation, one for PDF text/image analysis.
with gr.Blocks() as demo:
    gr.Markdown("# Medical LLM Model Training, PDF Image Analysis & OSINT")

    with gr.Tab("Generate Text"):
        prompt_input = gr.Textbox(label="Enter Medical Prompt")
        generate_output = gr.Textbox(label="Generated Medical Text")

        def respond(prompt):
            """Generate text for ``prompt`` with the module-level model and tokenizer."""
            return generate_response(prompt, model, tokenizer)

        # Event ``inputs`` must be Gradio components, so the model and the
        # tokenizer are bound inside ``respond`` instead of being listed here.
        # NOTE(review): ``change`` fires on every edit of the textbox, which
        # means one generation per keystroke -- consider a submit trigger.
        prompt_input.change(respond, inputs=prompt_input, outputs=generate_output)

    with gr.Tab("PDF Analysis"):
        pdf_file = gr.File(label="Upload PDF")
        pdf_output = gr.Textbox(label="Extracted Text")
        image_analysis_output = gr.Textbox(label="Image Metadata & OCR")

        def process_pdf(file):
            """Return the PDF's text and a JSON report about its images.

            ``file`` is whatever the ``gr.File`` component hands over: a path
            (string or path-like), raw bytes, or a file-like object with
            ``read()``. ``None`` (nothing uploaded) yields two empty strings.
            """
            if file is None:
                return '', ''

            if isinstance(file, (bytes, bytearray)):
                pdf_bytes = bytes(file)
            elif isinstance(file, (str, os.PathLike)):
                pdf_bytes = Path(file).read_bytes()
            else:
                pdf_bytes = file.read()

            pdf_content, images = extract_text_and_images_from_pdf(BytesIO(pdf_bytes))
            image_analysis_results = analyze_images(images)
            # EXIF values can be bytes or other non-JSON types; this report is
            # only displayed, so stringify anything json cannot encode.
            report = json.dumps(image_analysis_results, indent=4, default=str)
            return pdf_content, report

        pdf_file.upload(process_pdf, inputs=pdf_file, outputs=[pdf_output, image_analysis_output])

demo.launch()
|
|