# app.py (Enhanced for Medical Models, Image Analysis, & OSINT)
import json
import torch
import fitz  # PyMuPDF
import gradio as gr
import pytesseract
from io import BytesIO
from PIL import Image
from PIL.ExifTags import TAGS
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
# CLIPProcessor/CLIPModel and DataCollatorForLanguageModeling are retained
# for planned multimodal/training features; the current app does not use them.
from transformers import (AutoTokenizer, AutoModelForCausalLM,
                          CLIPProcessor, CLIPModel,
                          DataCollatorForLanguageModeling)


# Custom Dataset class for better handling of tokenized conversations.
class ConversationDataset(Dataset):
    def __init__(self, inputs, labels):
        # `inputs` and `labels` are lists of (1, seq_len) tensors, as
        # produced by tokenizer(..., return_tensors='pt').
        self.inputs = inputs
        self.labels = labels

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        # Strip the leading batch dimension; the DataLoader re-adds it.
        return {
            'input_ids': self.inputs[idx][0],
            'labels': self.labels[idx][0],
        }


def extract_metadata(image):
    """Extract EXIF metadata from a PIL image, mapping tag IDs to names."""
    metadata = {}
    try:
        # getexif() is the public Pillow API; the private _getexif() is
        # missing on non-JPEG images and would raise AttributeError.
        exif_data = image.getexif()
        for tag, value in exif_data.items():
            tag_name = TAGS.get(tag, tag)
            metadata[tag_name] = value
    except Exception as e:
        metadata['error'] = str(e)
    return metadata


def extract_text_and_images_from_pdf(pdf_bytes):
    """Return the concatenated page text and a list of embedded PIL images."""
    doc = fitz.open(stream=pdf_bytes, filetype='pdf')
    text = ''
    images = []
    for page_index in range(len(doc)):
        page = doc.load_page(page_index)
        text += page.get_text() + '\n'
        for img in page.get_images(full=True):
            xref = img[0]  # cross-reference number of the image object
            base_image = doc.extract_image(xref)
            images.append(Image.open(BytesIO(base_image['image'])))
    doc.close()
    return text, images


def analyze_images(images):
    """Run EXIF extraction and OCR over each extracted image."""
    results = {}
    for index, image in enumerate(images):
        results[f"Image_{index + 1}"] = {
            'metadata': extract_metadata(image),
            'ocr_text': pytesseract.image_to_string(image),
        }
    return results


def generate_response(prompt, model, tokenizer, max_length=150):
    model.eval()
    with torch.no_grad():
        input_ids = tokenizer(prompt, return_tensors='pt').input_ids
        output = model.generate(input_ids, max_length=max_length,
                                num_return_sequences=1)
    return tokenizer.decode(output[0], skip_special_tokens=True)


tokenizer = AutoTokenizer.from_pretrained('microsoft/BioGPT-Large')
model = AutoModelForCausalLM.from_pretrained('microsoft/BioGPT-Large')

with gr.Blocks() as demo:
    gr.Markdown("# Medical LLM Model Training, PDF Image Analysis & OSINT")

    with gr.Tab("Generate Text"):
        prompt_input = gr.Textbox(label="Enter Medical Prompt")
        generate_output = gr.Textbox(label="Generated Medical Text")

        def medical_generate(prompt):
            # Gradio event inputs must be components, so the model and
            # tokenizer are captured from the enclosing scope instead of
            # being passed in the `inputs` list.
            return generate_response(prompt, model, tokenizer)

        # Generate when the user presses Enter; `.change` would fire an
        # expensive model call on every keystroke.
        prompt_input.submit(medical_generate, inputs=prompt_input,
                            outputs=generate_output)

    with gr.Tab("PDF Analysis"):
        pdf_file = gr.File(label="Upload PDF")
        pdf_output = gr.Textbox(label="Extracted Text")
        image_analysis_output = gr.Textbox(label="Image Metadata & OCR")

        def process_pdf(file):
            # Depending on the Gradio version, the upload arrives as a
            # path string or a temp-file object; handle both.
            path = file if isinstance(file, str) else file.name
            with open(path, 'rb') as f:
                pdf_bytes = f.read()
            pdf_text, images = extract_text_and_images_from_pdf(pdf_bytes)
            image_analysis_results = analyze_images(images)
            # default=str keeps non-JSON-serializable EXIF values (bytes,
            # rationals) from crashing the dump.
            return pdf_text, json.dumps(image_analysis_results, indent=4,
                                        default=str)

        pdf_file.upload(process_pdf, inputs=pdf_file,
                        outputs=[pdf_output, image_analysis_output])

demo.launch()
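
# --- Illustrative fine-tuning sketch (not wired into the app) -----------
# ConversationDataset, DataLoader, and train_test_split above are never
# invoked by the Gradio app itself. The function below is a minimal sketch
# of how they could be combined to fine-tune the model on a hypothetical
# list of conversation strings; the batch size, learning rate, and epoch
# count are illustrative assumptions, not part of the original app. In
# practice it would be defined above demo.launch(), which blocks.
def fine_tune(conversations, epochs=1, lr=5e-5):
    # Tokenize; each input_ids tensor has shape (1, seq_len), which
    # ConversationDataset.__getitem__ strips back to (seq_len,).
    encodings = [tokenizer(text, return_tensors='pt', truncation=True).input_ids
                 for text in conversations]
    train_ids, _ = train_test_split(encodings, test_size=0.1)
    # Causal language modeling: the labels are the inputs themselves.
    dataset = ConversationDataset(train_ids, train_ids)
    loader = DataLoader(dataset, batch_size=1, shuffle=True)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    model.train()
    for _ in range(epochs):
        for batch in loader:  # tensors arrive re-batched as (1, seq_len)
            outputs = model(input_ids=batch['input_ids'],
                            labels=batch['labels'])
            outputs.loss.backward()
            optimizer.step()
            optimizer.zero_grad()
    model.eval()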