Update app.py
Browse files
app.py
CHANGED
@@ -1,22 +1,18 @@
|
|
1 |
-
# app.py (Enhanced for Medical Models
|
2 |
|
3 |
import os
|
4 |
import re
|
5 |
import json
|
6 |
import torch
|
7 |
import fitz # PyMuPDF
|
8 |
-
from transformers import
|
9 |
from sklearn.model_selection import train_test_split
|
10 |
from io import BytesIO
|
11 |
from PIL import Image
|
12 |
import gradio as gr
|
13 |
from torch.utils.data import Dataset, DataLoader
|
14 |
-
|
15 |
-
from
|
16 |
-
|
17 |
-
tokenizer = AutoTokenizer.from_pretrained('microsoft/BioGPT-Large')
|
18 |
-
model = AutoModelForCausalLM.from_pretrained('microsoft/BioGPT-Large')
|
19 |
-
|
20 |
|
21 |
|
22 |
# Custom Dataset Class for Better Handling
|
@@ -35,6 +31,22 @@ class ConversationDataset(Dataset):
|
|
35 |
}
|
36 |
|
37 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
def extract_text_and_images_from_pdf(pdf_stream):
|
39 |
doc = fitz.open(stream=pdf_stream, filetype='pdf')
|
40 |
text = ''
|
@@ -44,7 +56,6 @@ def extract_text_and_images_from_pdf(pdf_stream):
|
|
44 |
page = doc.load_page(page_index)
|
45 |
text += page.get_text() + '\n'
|
46 |
|
47 |
-
# Extracting Images
|
48 |
for img_index, img in enumerate(page.get_images(full=True)):
|
49 |
xref = img[0]
|
50 |
base_image = doc.extract_image(xref)
|
@@ -56,72 +67,17 @@ def extract_text_and_images_from_pdf(pdf_stream):
|
|
56 |
return text, images
|
57 |
|
58 |
|
59 |
-
def
|
60 |
-
|
61 |
-
if file.name.endswith('.pdf'):
|
62 |
-
pdf_stream = BytesIO(content)
|
63 |
-
text, images = extract_text_and_images_from_pdf(pdf_stream)
|
64 |
-
else:
|
65 |
-
text = content.decode('utf-8')
|
66 |
-
images = []
|
67 |
-
|
68 |
-
lines = text.split('\n')
|
69 |
-
conversations = []
|
70 |
-
conversation = {'prompt': '', 'response': ''}
|
71 |
-
user_turn = True
|
72 |
-
|
73 |
-
for line in lines:
|
74 |
-
line = line.strip()
|
75 |
-
if line == '':
|
76 |
-
continue
|
77 |
-
if user_turn:
|
78 |
-
conversation['prompt'] += line + ' '
|
79 |
-
else:
|
80 |
-
conversation['response'] += line + ' '
|
81 |
-
|
82 |
-
user_turn = not user_turn
|
83 |
-
|
84 |
-
if not user_turn:
|
85 |
-
conversations.append(conversation)
|
86 |
-
conversation = {'prompt': '', 'response': ''}
|
87 |
-
|
88 |
-
return conversations, images
|
89 |
-
|
90 |
-
|
91 |
-
def analyze_images_in_pdf(images):
    """Run every image through the CLIP image encoder and report the result.

    Args:
        images: Iterable of images accepted by the module-level
            ``clip_processor``.

    Returns:
        A list with one status string per image, in input order, each
        stating the shape of the feature tensor returned by
        ``clip_model.get_image_features``.
    """

    def _describe(picture):
        # Preprocess a single image into PyTorch tensors, then embed it.
        encoded = clip_processor(images=picture, return_tensors="pt")
        features = clip_model.get_image_features(**encoded)
        return f"Image processed successfully. Feature vector shape: {features.shape}"

    return [_describe(picture) for picture in images]
|
100 |
-
|
101 |
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
save_total_limit=2,
|
111 |
-
logging_dir='./logs',
|
112 |
-
)
|
113 |
-
|
114 |
-
trainer = Trainer(
|
115 |
-
model=model,
|
116 |
-
args=training_args,
|
117 |
-
train_dataset=dataset,
|
118 |
-
data_collator=data_collator,
|
119 |
-
)
|
120 |
-
|
121 |
-
trainer.train()
|
122 |
-
|
123 |
-
model.save_pretrained('./trained_model')
|
124 |
-
tokenizer.save_pretrained('./trained_model')
|
125 |
|
126 |
|
127 |
def generate_response(prompt, model, tokenizer, max_length=150):
|
@@ -133,47 +89,29 @@ def generate_response(prompt, model, tokenizer, max_length=150):
|
|
133 |
return response
|
134 |
|
135 |
|
136 |
-
|
137 |
-
|
138 |
-
conversations, images = load_and_preprocess_data(file)
|
139 |
-
inputs, labels = tokenize_data(conversations, tokenizer)
|
140 |
-
train_model(inputs, labels, model, tokenizer)
|
141 |
-
|
142 |
-
if images:
|
143 |
-
image_analysis = analyze_images_in_pdf(images)
|
144 |
-
return f"✅ Model training completed successfully. {len(images)} images processed."
|
145 |
-
else:
|
146 |
-
return "✅ Model training completed successfully. No images found."
|
147 |
-
except Exception as e:
|
148 |
-
return f"❌ Error during training: {str(e)}"
|
149 |
-
|
150 |
-
|
151 |
-
def generate_interface(prompt):
    """Gradio callback: generate text for ``prompt`` without ever raising.

    Delegates to ``generate_response`` using the module-level ``model`` and
    ``tokenizer``.

    Args:
        prompt: Prompt text passed through to ``generate_response``.

    Returns:
        The generated text, or a human-readable error message when
        generation fails for any reason.
    """
    try:
        return generate_response(prompt, model, tokenizer)
    except Exception as e:
        # UI boundary: surface the failure as text instead of crashing the
        # callback. The status marker was previously mis-encoded ("β").
        return f"❌ Error during generation: {str(e)}"
|
156 |
-
|
157 |
-
|
158 |
-
tokenizer = AutoTokenizer.from_pretrained('microsoft/BioGPT')
|
159 |
-
model = AutoModelForSequenceClassification.from_pretrained('microsoft/BioGPT')
|
160 |
-
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
|
161 |
|
162 |
-
clip_model = CLIPModel.from_pretrained('openai/clip-vit-base-patch32')
|
163 |
-
clip_processor = CLIPProcessor.from_pretrained('openai/clip-vit-base-patch32')
|
164 |
|
165 |
with gr.Blocks() as demo:
|
166 |
-
gr.Markdown("# Medical LLM Model Training
|
167 |
-
|
168 |
-
with gr.Tab("Train LLM"):
|
169 |
-
file_input = gr.File(label="Upload PDF or Text File")
|
170 |
-
train_output = gr.Textbox(label="Training Status")
|
171 |
-
file_input.change(train_interface, inputs=file_input, outputs=train_output)
|
172 |
|
173 |
with gr.Tab("Generate Text"):
|
174 |
prompt_input = gr.Textbox(label="Enter Medical Prompt")
|
175 |
generate_output = gr.Textbox(label="Generated Medical Text")
|
176 |
-
prompt_input.change(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
177 |
|
178 |
|
179 |
demo.launch()
|
|
|
1 |
+
# app.py (Enhanced for Medical Models, Image Analysis, & OSINT)
|
2 |
|
3 |
import os
|
4 |
import re
|
5 |
import json
|
6 |
import torch
|
7 |
import fitz # PyMuPDF
|
8 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM, CLIPProcessor, CLIPModel, DataCollatorForLanguageModeling
|
9 |
from sklearn.model_selection import train_test_split
|
10 |
from io import BytesIO
|
11 |
from PIL import Image
|
12 |
import gradio as gr
|
13 |
from torch.utils.data import Dataset, DataLoader
|
14 |
+
import pytesseract
|
15 |
+
from PIL.ExifTags import TAGS
|
|
|
|
|
|
|
|
|
16 |
|
17 |
|
18 |
# Custom Dataset Class for Better Handling
|
|
|
31 |
}
|
32 |
|
33 |
|
34 |
+
def extract_metadata(image):
    """Collect the EXIF tags of an image into a plain dictionary.

    Args:
        image: Object exposing ``_getexif()``.
            NOTE(review): presumably a PIL image; ``_getexif`` is
            underscore-prefixed, so confirm it exists for every image type
            passed in.

    Returns:
        A dict mapping each tag's name from ``TAGS`` (or the raw tag id
        when ``TAGS`` has no entry) to its value. The dict is empty when
        the image reports no EXIF data. If anything raises, the entries
        gathered so far are kept and the message is stored under ``'error'``.
    """
    collected = {}
    try:
        raw_exif = image._getexif()
        entries = raw_exif.items() if raw_exif else ()
        for tag_id, tag_value in entries:
            collected[TAGS.get(tag_id, tag_id)] = tag_value
    except Exception as exc:
        # Best effort: report the failure in-band rather than propagating.
        collected['error'] = str(exc)
    return collected
|
48 |
+
|
49 |
+
|
50 |
def extract_text_and_images_from_pdf(pdf_stream):
|
51 |
doc = fitz.open(stream=pdf_stream, filetype='pdf')
|
52 |
text = ''
|
|
|
56 |
page = doc.load_page(page_index)
|
57 |
text += page.get_text() + '\n'
|
58 |
|
|
|
59 |
for img_index, img in enumerate(page.get_images(full=True)):
|
60 |
xref = img[0]
|
61 |
base_image = doc.extract_image(xref)
|
|
|
67 |
return text, images
|
68 |
|
69 |
|
70 |
+
def analyze_images(images):
    """Build a per-image report of EXIF metadata and OCR text.

    Args:
        images: Iterable of images; each one is passed to
            ``extract_metadata`` and ``pytesseract.image_to_string``.

    Returns:
        A dict keyed ``"Image_1"``, ``"Image_2"``, ... in input order. Each
        value is a dict with the image's ``'metadata'`` and ``'ocr_text'``.
    """
    report = {}
    for position, picture in enumerate(images, start=1):
        exif_info = extract_metadata(picture)
        recognized_text = pytesseract.image_to_string(picture)
        report[f"Image_{position}"] = {
            'metadata': exif_info,
            'ocr_text': recognized_text,
        }
    return report
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
81 |
|
82 |
|
83 |
def generate_response(prompt, model, tokenizer, max_length=150):
|
|
|
89 |
return response
|
90 |
|
91 |
|
92 |
+
# Load the tokenizer and causal language model once at import time so every
# UI callback below reuses the same instances.
tokenizer = AutoTokenizer.from_pretrained('microsoft/BioGPT-Large')
model = AutoModelForCausalLM.from_pretrained('microsoft/BioGPT-Large')


with gr.Blocks() as demo:
    gr.Markdown("# Medical LLM Model Training, PDF Image Analysis & OSINT")

    with gr.Tab("Generate Text"):
        prompt_input = gr.Textbox(label="Enter Medical Prompt")
        generate_output = gr.Textbox(label="Generated Medical Text")

        def generate_interface(prompt):
            """Generate text for ``prompt`` with the module-level model.

            Gradio event ``inputs`` must be UI components, so the model and
            tokenizer are bound here through the closure instead of being
            listed as inputs (which fails when the event is registered).
            """
            if not prompt or not prompt.strip():
                # Clearing the textbox also fires ``change``; nothing to do.
                return ""
            return generate_response(prompt, model, tokenizer)

        prompt_input.change(generate_interface, inputs=prompt_input, outputs=generate_output)

    with gr.Tab("PDF Analysis"):
        pdf_file = gr.File(label="Upload PDF")
        pdf_output = gr.Textbox(label="Extracted Text")
        image_analysis_output = gr.Textbox(label="Image Metadata & OCR")

        def process_pdf(file):
            """Extract text plus per-image metadata/OCR from an uploaded PDF.

            Args:
                file: Value delivered by ``gr.File``. Depending on the
                    Gradio version and the component's ``type`` setting this
                    may be a path string, raw bytes, or a temp-file object
                    exposing ``name``/``read`` -- TODO confirm against the
                    installed Gradio version.

            Returns:
                A ``(text, analysis_json)`` tuple: the text extracted from
                the PDF and the image analysis serialized as indented JSON.
                Both are empty strings when no file is supplied.
            """
            if file is None:
                return "", ""

            if isinstance(file, (bytes, bytearray)):
                pdf_bytes = bytes(file)
            else:
                if isinstance(file, (str, os.PathLike)):
                    pdf_path = file
                else:
                    pdf_path = getattr(file, "name", None)
                if pdf_path:
                    with open(pdf_path, "rb") as handle:
                        pdf_bytes = handle.read()
                else:
                    # Last resort: a plain file-like object without a path.
                    pdf_bytes = file.read()

            pdf_content, images = extract_text_and_images_from_pdf(BytesIO(pdf_bytes))
            image_analysis_results = analyze_images(images)
            # EXIF values are not guaranteed to be JSON-native (e.g. bytes),
            # so anything json cannot encode is deliberately stringified
            # for display instead of raising TypeError.
            return pdf_content, json.dumps(image_analysis_results, indent=4, default=str)

        pdf_file.upload(process_pdf, inputs=pdf_file, outputs=[pdf_output, image_analysis_output])


demo.launch()
|