Spaces:

arohcx
/

colab

Runtime error

App Files Files Community

colab / app.py

arohcx

Update app.py

fec6c47 verified 2 months ago

raw

history blame

3.62 kB

	# app.py (Enhanced for Medical Models, Image Analysis, & OSINT)

	import os
	import re
	import json
	import torch
	import fitz # PyMuPDF
	from transformers import AutoTokenizer, AutoModelForCausalLM, CLIPProcessor, CLIPModel, DataCollatorForLanguageModeling
	from sklearn.model_selection import train_test_split
	from io import BytesIO
	from PIL import Image
	import gradio as gr
	from torch.utils.data import Dataset, DataLoader
	import pytesseract
	from PIL.ExifTags import TAGS


	# Custom Dataset Class for Better Handling
	class ConversationDataset(Dataset):
	    """Map-style dataset pairing tokenized inputs with their labels.

	    The two sequences are indexed in parallel and only element ``[0]`` of
	    each entry is exposed to the consumer.
	    NOTE(review): presumably every entry is a batch-of-one tensor of shape
	    (1, seq_len) as produced by a tokenizer -- confirm against the caller.
	    """

	    def __init__(self, inputs, labels):
	        self.inputs, self.labels = inputs, labels

	    def __len__(self):
	        # The input sequence alone defines the dataset size.
	        return len(self.inputs)

	    def __getitem__(self, idx):
	        first_input = self.inputs[idx][0]
	        first_label = self.labels[idx][0]
	        return {'input_ids': first_input, 'labels': first_label}


	def extract_metadata(image):
	    """
	    Collect the EXIF entries of an image into a plain dictionary.

	    Numeric EXIF tag ids are translated to their names through ``TAGS``;
	    ids missing from that table are kept as keys unchanged. Any exception
	    raised along the way is not propagated: its message is stored under
	    the ``'error'`` key, next to whatever entries were already collected.
	    """
	    collected = {}
	    try:
	        raw_exif = image._getexif()
	        # ``None`` or an empty mapping means there is nothing to translate.
	        for tag_id, tag_value in (raw_exif or {}).items():
	            collected[TAGS.get(tag_id, tag_id)] = tag_value
	    except Exception as exc:
	        collected['error'] = str(exc)
	    return collected


	def extract_text_and_images_from_pdf(pdf_stream):
	    """
	    Extract the text and the embedded images of a PDF.

	    Args:
	        pdf_stream: In-memory PDF content handed to ``fitz.open`` as its
	            ``stream`` argument.

	    Returns:
	        A ``(text, images)`` tuple. ``text`` is the text of every page in
	        page order, each page followed by a newline. ``images`` is a list
	        of PIL images, one per image reference found on the pages.

	    Raises:
	        Whatever ``fitz`` or ``PIL`` raise for unreadable input; the
	        document is closed before the exception propagates.
	    """
	    page_texts = []
	    images = []

	    doc = fitz.open(stream=pdf_stream, filetype='pdf')
	    try:
	        for page in doc:
	            page_texts.append(page.get_text())

	            # The first field of each get_images() entry is the image xref.
	            for image_info in page.get_images(full=True):
	                embedded = doc.extract_image(image_info[0])
	                images.append(Image.open(BytesIO(embedded['image'])))
	    finally:
	        # Release the document even when a page or an image fails to decode.
	        doc.close()

	    # Join once instead of growing a string page by page.
	    return ''.join(f"{page_text}\n" for page_text in page_texts), images


	def analyze_images(images):
	    """
	    Run metadata extraction and OCR over a sequence of images.

	    Returns a dictionary keyed ``"Image_1"``, ``"Image_2"``, ... in input
	    order; each value holds the image's ``'metadata'`` (from
	    ``extract_metadata``) and its ``'ocr_text'`` (from
	    ``pytesseract.image_to_string``).
	    """
	    return {
	        f"Image_{position}": {
	            'metadata': extract_metadata(picture),
	            'ocr_text': pytesseract.image_to_string(picture),
	        }
	        for position, picture in enumerate(images, start=1)
	    }


	def generate_response(prompt, model, tokenizer, max_length=150):
	    """
	    Generate a single text continuation for ``prompt``.

	    The model is put in evaluation mode, the prompt is tokenized to
	    PyTorch tensors, and one sequence is generated with gradient tracking
	    disabled. ``max_length`` is forwarded to ``model.generate`` unchanged.
	    The first generated sequence is returned decoded, with special tokens
	    stripped.
	    """
	    model.eval()
	    with torch.no_grad():
	        encoded_prompt = tokenizer(prompt, return_tensors='pt')
	        generated = model.generate(
	            encoded_prompt.input_ids,
	            max_length=max_length,
	            num_return_sequences=1,
	        )
	    return tokenizer.decode(generated[0], skip_special_tokens=True)


	# Checkpoint shared by the tokenizer and the causal language model.
	MODEL_CHECKPOINT = 'microsoft/BioGPT-Large'

	# Both objects are created once, when the module is executed.
	tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
	model = AutoModelForCausalLM.from_pretrained(MODEL_CHECKPOINT)


	def generate_text(prompt):
	    """Gradio callback: generate text for ``prompt`` with the module-level model."""
	    # Gradio can only pass component values to a callback, so the model and
	    # tokenizer are bound here instead of being listed as event inputs.
	    if not prompt or not prompt.strip():
	        return ''
	    return generate_response(prompt, model, tokenizer)


	def process_pdf(file):
	    """
	    Gradio callback: extract text, image metadata and OCR text from a PDF.

	    Args:
	        file: Value delivered by the ``gr.File`` component. Raw bytes, a
	            file path, or an object exposing the path as ``.name`` are
	            accepted, because the shape of the value depends on the
	            Gradio version and the component's ``type`` setting.

	    Returns:
	        ``(text, analysis_json)`` for the two output textboxes; both are
	        empty strings when no file is present.
	    """
	    if file is None:
	        return '', ''

	    if isinstance(file, (bytes, bytearray)):
	        pdf_bytes = bytes(file)
	    else:
	        with open(getattr(file, 'name', file), 'rb') as pdf_handle:
	            pdf_bytes = pdf_handle.read()

	    pdf_content, images = extract_text_and_images_from_pdf(BytesIO(pdf_bytes))
	    image_analysis_results = analyze_images(images)
	    # EXIF values may be bytes or rational objects that json cannot encode;
	    # default=str renders them as text instead of raising TypeError.
	    return pdf_content, json.dumps(image_analysis_results, indent=4, default=str)


	with gr.Blocks() as demo:
	    gr.Markdown("# Medical LLM Model Training, PDF Image Analysis & OSINT")

	    with gr.Tab("Generate Text"):
	        prompt_input = gr.Textbox(label="Enter Medical Prompt")
	        generate_button = gr.Button("Generate")
	        generate_output = gr.Textbox(label="Generated Medical Text")
	        # Generation is expensive, so it runs on an explicit request (Enter
	        # or the button) rather than on every edit of the prompt.
	        prompt_input.submit(generate_text, inputs=prompt_input, outputs=generate_output)
	        generate_button.click(generate_text, inputs=prompt_input, outputs=generate_output)

	    with gr.Tab("PDF Analysis"):
	        pdf_file = gr.File(label="Upload PDF")
	        pdf_output = gr.Textbox(label="Extracted Text")
	        image_analysis_output = gr.Textbox(label="Image Metadata & OCR")

	        pdf_file.upload(process_pdf, inputs=pdf_file, outputs=[pdf_output, image_analysis_output])


	demo.launch()