# app.py (Enhanced for Medical Models, Image Analysis, & OSINT)
import os
import re
import json
import torch
import fitz # PyMuPDF
from transformers import AutoTokenizer, AutoModelForCausalLM, CLIPProcessor, CLIPModel, DataCollatorForLanguageModeling
from sklearn.model_selection import train_test_split
from io import BytesIO
from PIL import Image
import gradio as gr
from torch.utils.data import Dataset, DataLoader
import pytesseract
from PIL.ExifTags import TAGS
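
# Note: pytesseract is only a wrapper; the Tesseract OCR binary must be
# installed on the host (e.g. apt-get install tesseract-ocr) for
# pytesseract.image_to_string() to work.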

# Custom Dataset class for handling pre-tokenized conversation pairs
class ConversationDataset(Dataset):
    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = labels

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return {
            'input_ids': self.inputs[idx][0],
            'labels': self.labels[idx][0]
        }
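
# Hypothetical usage sketch for ConversationDataset (it is defined here
# but not wired up elsewhere in this app). It expects pre-tokenized
# tensors with a leading batch dimension, e.g. built with the BioGPT
# tokenizer loaded below:
#   pairs = [("What is hypertension?", "Hypertension is ...")]
#   inputs = [tokenizer(q, return_tensors='pt').input_ids for q, _ in pairs]
#   labels = [tokenizer(a, return_tensors='pt').input_ids for _, a in pairs]
#   train_ds = ConversationDataset(inputs, labels)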

def extract_metadata(image):
    """
    Extract EXIF metadata from an image, mapping numeric tag IDs to
    human-readable names via PIL.ExifTags.TAGS.
    """
    metadata = {}
    try:
        exif_data = image.getexif()  # public API, unlike the JPEG-only _getexif()
        if exif_data:
            for tag, value in exif_data.items():
                tag_name = TAGS.get(tag, tag)
                metadata[tag_name] = value
    except Exception as e:
        metadata['error'] = str(e)
    return metadata
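
# Note: images extracted from PDFs are typically re-encoded streams and
# often carry no EXIF block, so an empty metadata dict here is normal.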

def extract_text_and_images_from_pdf(pdf_stream):
    doc = fitz.open(stream=pdf_stream, filetype='pdf')
    text = ''
    images = []
    for page_index in range(len(doc)):
        page = doc.load_page(page_index)
        text += page.get_text() + '\n'
        for img_index, img in enumerate(page.get_images(full=True)):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image['image']
            image = Image.open(BytesIO(image_bytes))
            images.append(image)
    doc.close()
    return text, images
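
# Quick local sanity check (assumes a sample.pdf next to this script):
#   with open('sample.pdf', 'rb') as f:
#       text, imgs = extract_text_and_images_from_pdf(BytesIO(f.read()))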

def analyze_images(images):
    results = {}
    for index, image in enumerate(images):
        metadata = extract_metadata(image)
        ocr_text = pytesseract.image_to_string(image)
        results[f"Image_{index+1}"] = {
            'metadata': metadata,
            'ocr_text': ocr_text
        }
    return results

def generate_response(prompt, model, tokenizer, max_length=150):
    model.eval()
    with torch.no_grad():
        input_ids = tokenizer(prompt, return_tensors='pt').input_ids
        output = model.generate(input_ids, max_length=max_length, num_return_sequences=1)
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response
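
# Greedy decoding (the default above) tends to repeat itself; one option
# is to enable sampling via standard transformers generate() kwargs, e.g.:
#   output = model.generate(input_ids, max_length=max_length,
#                           do_sample=True, top_p=0.9, temperature=0.7)
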
tokenizer = AutoTokenizer.from_pretrained('microsoft/BioGPT-Large')
model = AutoModelForCausalLM.from_pretrained('microsoft/BioGPT-Large')
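
# BioGPT-Large is a ~1.5B-parameter model; on a GPU host you could move
# inference there with model.to('cuda') (and place input_ids on the same
# device inside generate_response).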

with gr.Blocks() as demo:
    gr.Markdown("# Medical LLM Model Training, PDF Image Analysis & OSINT")
    with gr.Tab("Generate Text"):
        prompt_input = gr.Textbox(label="Enter Medical Prompt")
        generate_output = gr.Textbox(label="Generated Medical Text")

        # Gradio event inputs must be components, so the model and
        # tokenizer are closed over instead of passed as inputs.
        def generate_text(prompt):
            return generate_response(prompt, model, tokenizer)

        # .submit fires on Enter; .change would call the model on every keystroke.
        prompt_input.submit(generate_text, inputs=prompt_input, outputs=generate_output)
    with gr.Tab("PDF Analysis"):
        pdf_file = gr.File(label="Upload PDF")
        pdf_output = gr.Textbox(label="Extracted Text")
        image_analysis_output = gr.Textbox(label="Image Metadata & OCR")

        def process_pdf(file):
            # gr.File hands the callback a tempfile wrapper (Gradio 3.x),
            # not raw bytes; read the uploaded PDF from its path on disk.
            with open(file.name, 'rb') as f:
                pdf_bytes = f.read()
            pdf_content, images = extract_text_and_images_from_pdf(BytesIO(pdf_bytes))
            image_analysis_results = analyze_images(images)
            # default=str keeps non-serializable EXIF values (bytes, rationals)
            # from crashing json.dumps.
            return pdf_content, json.dumps(image_analysis_results, indent=4, default=str)

        pdf_file.upload(process_pdf, inputs=pdf_file, outputs=[pdf_output, image_analysis_output])

demo.launch()