# app.py (Enhanced for Medical Models, Image Analysis, & OSINT)
import json
import torch
import fitz  # PyMuPDF
from transformers import AutoTokenizer, AutoModelForCausalLM
from io import BytesIO
from PIL import Image
from PIL.ExifTags import TAGS
import gradio as gr
from torch.utils.data import Dataset, DataLoader
import pytesseract
# Dataset wrapping pre-tokenized conversations for causal-LM fine-tuning.
class ConversationDataset(Dataset):
    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = labels

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        # Each entry is a [1, seq_len] tensor from the tokenizer; [0] drops the batch dim.
        return {
            'input_ids': self.inputs[idx][0],
            'labels': self.labels[idx][0]
        }
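
# A minimal fine-tuning sketch (not wired into the UI) showing how
# ConversationDataset is meant to be used; the `conversations` list of strings
# and the hyperparameters are illustrative assumptions, not part of the app.
def fine_tune(model, tokenizer, conversations, epochs=1, lr=5e-5):
    encodings = [tokenizer(text, return_tensors='pt', truncation=True).input_ids
                 for text in conversations]
    # For causal-LM fine-tuning the labels are the inputs themselves.
    dataset = ConversationDataset(encodings, encodings)
    # batch_size=1 sidesteps padding/collation of variable-length sequences.
    loader = DataLoader(dataset, batch_size=1, shuffle=True)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    model.train()
    for _ in range(epochs):
        for batch in loader:
            loss = model(input_ids=batch['input_ids'], labels=batch['labels']).loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()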
def extract_metadata(image):
    """Extract EXIF metadata from a PIL image, mapping numeric tag ids to names."""
    metadata = {}
    try:
        # getexif() is the public API and works across formats;
        # the private _getexif() only exists on JPEG images.
        exif_data = image.getexif()
        if exif_data:
            for tag, value in exif_data.items():
                tag_name = TAGS.get(tag, tag)
                metadata[tag_name] = value
    except Exception as e:
        metadata['error'] = str(e)
    return metadata
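
# OSINT note: geolocation, when present, sits in the GPSInfo IFD rather than the
# base tag set. A hedged sketch for recent Pillow versions (0x8825 is the
# GPSInfo tag id; GPSTAGS maps its numeric keys to names):
#     from PIL.ExifTags import GPSTAGS
#     gps_ifd = image.getexif().get_ifd(0x8825)
#     gps_named = {GPSTAGS.get(k, k): v for k, v in gps_ifd.items()}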
def extract_text_and_images_from_pdf(pdf_stream):
    """Extract all page text and embedded images from an in-memory PDF."""
    doc = fitz.open(stream=pdf_stream, filetype='pdf')
    text = ''
    images = []
    for page_index in range(len(doc)):
        page = doc.load_page(page_index)
        text += page.get_text() + '\n'
        for img in page.get_images(full=True):
            xref = img[0]  # first element is the image's cross-reference id
            base_image = doc.extract_image(xref)
            images.append(Image.open(BytesIO(base_image['image'])))
    doc.close()
    return text, images
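
# Standalone usage sketch (hypothetical file name), outside the Gradio flow;
# fitz.open(stream=...) accepts raw bytes as well as a BytesIO object:
#     with open('report.pdf', 'rb') as f:
#         text, images = extract_text_and_images_from_pdf(f.read())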
def analyze_images(images):
    """Run EXIF extraction and OCR over a list of PIL images."""
    results = {}
    for index, image in enumerate(images):
        results[f"Image_{index + 1}"] = {
            'metadata': extract_metadata(image),
            'ocr_text': pytesseract.image_to_string(image)
        }
    return results
def generate_response(prompt, model, tokenizer, max_length=150):
    """Generate a completion; note that max_length counts the prompt tokens too."""
    model.eval()
    with torch.no_grad():
        input_ids = tokenizer(prompt, return_tensors='pt').input_ids
        output = model.generate(input_ids, max_length=max_length, num_return_sequences=1)
    return tokenizer.decode(output[0], skip_special_tokens=True)
tokenizer = AutoTokenizer.from_pretrained('microsoft/BioGPT-Large')
model = AutoModelForCausalLM.from_pretrained('microsoft/BioGPT-Large')
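
# Optional GPU placement, as a sketch; generate_response would then also need
# to move input_ids onto the same device before calling model.generate:
#     device = 'cuda' if torch.cuda.is_available() else 'cpu'
#     model = model.to(device)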
with gr.Blocks() as demo:
    gr.Markdown("# Medical LLM Model Training, PDF Image Analysis & OSINT")

    with gr.Tab("Generate Text"):
        prompt_input = gr.Textbox(label="Enter Medical Prompt")
        generate_output = gr.Textbox(label="Generated Medical Text")

        # Gradio `inputs` must be UI components, so model/tokenizer are closed over.
        def generate(prompt):
            return generate_response(prompt, model, tokenizer)

        # submit fires on Enter; change would rerun generation on every keystroke.
        prompt_input.submit(generate, inputs=prompt_input, outputs=generate_output)

    with gr.Tab("PDF Analysis"):
        pdf_file = gr.File(label="Upload PDF")
        pdf_output = gr.Textbox(label="Extracted Text")
        image_analysis_output = gr.Textbox(label="Image Metadata & OCR")

        def process_pdf(file):
            # gr.File yields a filepath string or a temp-file wrapper, depending
            # on the Gradio version; resolve the path and read the raw bytes.
            path = file if isinstance(file, str) else file.name
            with open(path, 'rb') as f:
                pdf_bytes = f.read()
            pdf_content, images = extract_text_and_images_from_pdf(pdf_bytes)
            # default=str keeps non-serializable EXIF values (bytes, rationals) printable.
            return pdf_content, json.dumps(analyze_images(images), indent=4, default=str)

        pdf_file.upload(process_pdf, inputs=pdf_file, outputs=[pdf_output, image_analysis_output])

demo.launch()