arohcx committed
Commit fec6c47 · verified · 1 Parent(s): e117acb

Update app.py

Files changed (1):
  1. app.py +46 -108
app.py CHANGED
@@ -1,22 +1,18 @@
-# app.py (Enhanced for Medical Models & Image Analysis in PDFs)
+# app.py (Enhanced for Medical Models, Image Analysis, & OSINT)
 
 import os
 import re
 import json
 import torch
 import fitz  # PyMuPDF
-from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling, CLIPProcessor, CLIPModel, AutoModelForSequenceClassification, AutoTokenizer
+from transformers import AutoTokenizer, AutoModelForCausalLM, CLIPProcessor, CLIPModel, DataCollatorForLanguageModeling
 from sklearn.model_selection import train_test_split
 from io import BytesIO
 from PIL import Image
 import gradio as gr
 from torch.utils.data import Dataset, DataLoader
-
-from transformers import AutoTokenizer, AutoModelForCausalLM
-
-tokenizer = AutoTokenizer.from_pretrained('microsoft/BioGPT-Large')
-model = AutoModelForCausalLM.from_pretrained('microsoft/BioGPT-Large')
-
+import pytesseract
+from PIL.ExifTags import TAGS
 
 
  # Custom Dataset Class for Better Handling
@@ -35,6 +31,22 @@ class ConversationDataset(Dataset):
         }
 
 
+def extract_metadata(image):
+    """
+    Extract metadata from an image file.
+    """
+    metadata = {}
+    try:
+        exif_data = image._getexif()
+        if exif_data:
+            for tag, value in exif_data.items():
+                tag_name = TAGS.get(tag, tag)
+                metadata[tag_name] = value
+    except Exception as e:
+        metadata['error'] = str(e)
+    return metadata
+
+
 def extract_text_and_images_from_pdf(pdf_stream):
     doc = fitz.open(stream=pdf_stream, filetype='pdf')
     text = ''
@@ -44,7 +56,6 @@ def extract_text_and_images_from_pdf(pdf_stream):
         page = doc.load_page(page_index)
         text += page.get_text() + '\n'
 
-        # Extracting Images
         for img_index, img in enumerate(page.get_images(full=True)):
             xref = img[0]
             base_image = doc.extract_image(xref)
@@ -56,72 +67,17 @@ def extract_text_and_images_from_pdf(pdf_stream):
     return text, images
 
 
-def load_and_preprocess_data(file):
-    content = file.read()
-    if file.name.endswith('.pdf'):
-        pdf_stream = BytesIO(content)
-        text, images = extract_text_and_images_from_pdf(pdf_stream)
-    else:
-        text = content.decode('utf-8')
-        images = []
-
-    lines = text.split('\n')
-    conversations = []
-    conversation = {'prompt': '', 'response': ''}
-    user_turn = True
-
-    for line in lines:
-        line = line.strip()
-        if line == '':
-            continue
-        if user_turn:
-            conversation['prompt'] += line + ' '
-        else:
-            conversation['response'] += line + ' '
-
-        user_turn = not user_turn
-
-        if not user_turn:
-            conversations.append(conversation)
-            conversation = {'prompt': '', 'response': ''}
-
-    return conversations, images
-
-
-def analyze_images_in_pdf(images):
-    descriptions = []
-
-    for image in images:
-        inputs = clip_processor(images=image, return_tensors="pt")
-        outputs = clip_model.get_image_features(**inputs)
-        descriptions.append(f"Image processed successfully. Feature vector shape: {outputs.shape}")
-
-    return descriptions
-
-
-def train_model(inputs, labels, model, tokenizer):
-    dataset = ConversationDataset(inputs, labels)
-    training_args = TrainingArguments(
-        output_dir='./results',
-        overwrite_output_dir=True,
-        num_train_epochs=1,
-        per_device_train_batch_size=2,
-        save_steps=500,
-        save_total_limit=2,
-        logging_dir='./logs',
-    )
-
-    trainer = Trainer(
-        model=model,
-        args=training_args,
-        train_dataset=dataset,
-        data_collator=data_collator,
-    )
-
-    trainer.train()
-
-    model.save_pretrained('./trained_model')
-    tokenizer.save_pretrained('./trained_model')
+def analyze_images(images):
+    results = {}
+
+    for index, image in enumerate(images):
+        metadata = extract_metadata(image)
+        ocr_text = pytesseract.image_to_string(image)
+        results[f"Image_{index+1}"] = {
+            'metadata': metadata,
+            'ocr_text': ocr_text
+        }
+    return results
 
 
 def generate_response(prompt, model, tokenizer, max_length=150):
@@ -133,47 +89,29 @@ def generate_response(prompt, model, tokenizer, max_length=150):
     return response
 
 
-def train_interface(file):
-    try:
-        conversations, images = load_and_preprocess_data(file)
-        inputs, labels = tokenize_data(conversations, tokenizer)
-        train_model(inputs, labels, model, tokenizer)
-
-        if images:
-            image_analysis = analyze_images_in_pdf(images)
-            return f"✅ Model training completed successfully. {len(images)} images processed."
-        else:
-            return "✅ Model training completed successfully. No images found."
-    except Exception as e:
-        return f"❌ Error during training: {str(e)}"
-
-
-def generate_interface(prompt):
-    try:
-        return generate_response(prompt, model, tokenizer)
-    except Exception as e:
-        return f"❌ Error during generation: {str(e)}"
-
-
-tokenizer = AutoTokenizer.from_pretrained('microsoft/BioGPT')
-model = AutoModelForSequenceClassification.from_pretrained('microsoft/BioGPT')
-data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
+tokenizer = AutoTokenizer.from_pretrained('microsoft/BioGPT-Large')
+model = AutoModelForCausalLM.from_pretrained('microsoft/BioGPT-Large')
 
-clip_model = CLIPModel.from_pretrained('openai/clip-vit-base-patch32')
-clip_processor = CLIPProcessor.from_pretrained('openai/clip-vit-base-patch32')
 
 with gr.Blocks() as demo:
-    gr.Markdown("# Medical LLM Model Training & PDF Image Analysis")
-
-    with gr.Tab("Train LLM"):
-        file_input = gr.File(label="Upload PDF or Text File")
-        train_output = gr.Textbox(label="Training Status")
-        file_input.change(train_interface, inputs=file_input, outputs=train_output)
+    gr.Markdown("# Medical LLM Model Training, PDF Image Analysis & OSINT")
 
     with gr.Tab("Generate Text"):
         prompt_input = gr.Textbox(label="Enter Medical Prompt")
         generate_output = gr.Textbox(label="Generated Medical Text")
-        prompt_input.change(generate_interface, inputs=prompt_input, outputs=generate_output)
+        prompt_input.change(generate_response, inputs=[prompt_input, model, tokenizer], outputs=generate_output)
+
+    with gr.Tab("PDF Analysis"):
+        pdf_file = gr.File(label="Upload PDF")
+        pdf_output = gr.Textbox(label="Extracted Text")
+        image_analysis_output = gr.Textbox(label="Image Metadata & OCR")
+
+        def process_pdf(file):
+            pdf_content, images = extract_text_and_images_from_pdf(BytesIO(file.read()))
+            image_analysis_results = analyze_images(images)
+            return pdf_content, json.dumps(image_analysis_results, indent=4)
+
+        pdf_file.upload(process_pdf, inputs=pdf_file, outputs=[pdf_output, image_analysis_output])
 
 
 demo.launch()
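Note on the rewired Generate Text tab: the new `prompt_input.change(generate_response, inputs=[prompt_input, model, tokenizer], outputs=generate_output)` call hands `model` and `tokenizer` to Gradio as if they were input components, but Gradio only routes values from UI components listed in `inputs`. A minimal sketch of working wiring, restoring the wrapper pattern this commit removes (`generate_interface`) so the model and tokenizer are captured from module scope:

    def generate_interface(prompt):
        # model and tokenizer come from module scope; Gradio only passes
        # the textbox value through `inputs`.
        return generate_response(prompt, model, tokenizer)

    prompt_input.change(generate_interface, inputs=prompt_input, outputs=generate_output)

With a Textbox, `.change` fires on every keystroke; `.submit` (triggered on Enter) may be the better event for a generation call.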
 
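The new PDF Analysis tab chains `extract_text_and_images_from_pdf` into `analyze_images`, which runs `extract_metadata` (EXIF) and pytesseract OCR on each embedded image. A standalone sketch of the same pipeline outside Gradio, with those functions from app.py in scope; `sample.pdf` is a hypothetical local file, and pytesseract requires the Tesseract binary to be installed:

    from io import BytesIO

    # Hypothetical input file. Images embedded in PDFs often lack EXIF data,
    # in which case extract_metadata returns an empty dict or an 'error' entry.
    with open('sample.pdf', 'rb') as f:
        text, images = extract_text_and_images_from_pdf(BytesIO(f.read()))

    print(text[:500])  # preview of the extracted text
    for name, result in analyze_images(images).items():
        print(name, result['metadata'], result['ocr_text'][:80])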