weihongliang committed on
Commit 5f1aa2d · verified · 1 Parent(s): 555d804

Update app.py

Files changed (1)
  1. app.py +17 -33
app.py CHANGED
@@ -10,7 +10,9 @@ import os
 from PIL import Image, ImageDraw, ImageFont
 import numpy as np
 import hashlib
-import spaces
+import spaces  # Make sure to import spaces
+
+
 
 #First output the thinking process in <think> </think> tags and then output the final answer in <answer> </answer> tags.
 #Answer the following question based on the information above and the given image, and provide citations for your response.
@@ -133,8 +135,8 @@ def fetch_wikipedia_info(url):
 def recognize_celebrities(image_path, confidence_threshold=90):
     client = boto3.client(
         "rekognition",
-        aws_access_key_id='[REDACTED-AWS-ACCESS-KEY]',
-        aws_secret_access_key='[REDACTED-AWS-SECRET-KEY]',
+        aws_access_key_id=os.getenv('aws_access_key_id'),
+        aws_secret_access_key=os.getenv('aws_secret_access_key'),
         region_name='us-east-1'
     )
 
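This hunk swaps hardcoded AWS keys for environment variables. As a sanity check, a minimal sketch of the same pattern (env var names as in the commit; the image path is a placeholder):

import os
import boto3

# Read credentials from the environment instead of hardcoding them. boto3 would
# also pick up the standard AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY names on its own.
client = boto3.client(
    "rekognition",
    aws_access_key_id=os.getenv('aws_access_key_id'),
    aws_secret_access_key=os.getenv('aws_secret_access_key'),
    region_name='us-east-1',
)

with open("photo.jpg", "rb") as f:  # placeholder image path
    response = client.recognize_celebrities(Image={"Bytes": f.read()})
for face in response.get("CelebrityFaces", []):
    print(face["Name"], face["MatchConfidence"])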
@@ -169,7 +171,7 @@ def draw_bounding_boxes(image_path, bounding_boxes, names):
 
         # Add name label
         text = f"[{i+1}]: {names[i]}"
-        font = ImageFont.truetype("arial.ttf", 20)  # Adjust font and size as needed
+        font = ImageFont.truetype("./Arial.ttf", 20)  # Adjust font and size as needed
         text_bbox = draw.textbbox((left, top - 20), text, font=font)
         draw.rectangle(text_bbox, fill="white")
         draw.text((left, top - 20), text, fill="red", font=font)
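The new font path still assumes ./Arial.ttf is bundled with the Space. A hedged fallback sketch (the helper name is illustrative, not from this repo) that avoids an OSError when the file is missing:

from PIL import ImageFont

def load_label_font(size=20):
    # Hypothetical helper: try the bundled TTF first, then fall back to
    # PIL's built-in bitmap font so label drawing never crashes.
    try:
        return ImageFont.truetype("./Arial.ttf", size)
    except OSError:
        return ImageFont.load_default()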
@@ -190,7 +192,7 @@ model = Qwen2VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16,
     device_map="auto"
 )
-processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
+processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
 
 # Use Qwen model for Q&A
 def qwen_qa(image_path, question, names, bounding_boxes, en_wiki_pedia_links, en_intros):
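The processor is bumped from the 2B to the 7B preprocessing config while the model weights are loaded separately above. One way to keep the two from drifting is to load both from a single checkpoint id; a sketch, assuming the RC-Qwen2VL-7b repo id taken from the model links added later in this commit:

import torch
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor

CHECKPOINT = "weihongliang/RC-Qwen2VL-7b"  # assumed repo id, from the links below

# Loading model and processor from the same id keeps image preprocessing
# aligned with what the weights were trained on.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    CHECKPOINT, torch_dtype=torch.float16, device_map="auto"
)
processor = AutoProcessor.from_pretrained(CHECKPOINT)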
@@ -261,6 +263,7 @@ def is_example_image(image, examples):
     return False, None
 
 # Main processing function
+# This is the function that will run on the GPU
 @spaces.GPU
 def process_image(image, question, confidence_threshold, examples=None):
     if image is None:
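On ZeroGPU Spaces, @spaces.GPU attaches a GPU only for the duration of the decorated call, which is why process_image carries the decorator while the model is loaded at import time. A minimal sketch (the duration argument is optional):

import spaces
import torch

@spaces.GPU(duration=120)  # request a GPU for up to 120 s per call
def run_on_gpu(batch):
    # CUDA is only available inside the decorated call on a ZeroGPU Space.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    return {k: v.to(device) for k, v in batch.items()}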
@@ -335,7 +338,6 @@ def process_image(image, question, confidence_threshold, examples=None):
             print(links)
             en_wiki_pedia_links.append("No English Wikipedia link found")
             en_intros.append(f"This person is {name}.")
-
     if not names:
         if os.path.exists(temp_image_path):
             os.remove(temp_image_path)
@@ -403,19 +405,6 @@ def create_interface():
          {'Width': 0.07422236353158951, 'Height': 0.15943190455436707, 'Left': 0.4633428454399109, 'Top': 0.07901764661073685},
          {'Width': 0.07562466710805893, 'Height': 0.13936467468738556, 'Left': 0.025178398936986923, 'Top': 0.4953641891479492}],
         ['https://www.wikidata.org/wiki/Q3572699', 'https://www.wikidata.org/wiki/Q92894', 'https://www.wikidata.org/wiki/Q3571662']],
-        """["./clinton.jpg", "Who are the people in the picture, and what is the relationship between them?",
-        ['Bill Clinton', 'Monica Lewinsky'],
-        [{'Width': 0.07620880007743835, 'Height': 0.16198107600212097, 'Left': 0.5074607729911804, 'Top': 0.14220821857452393},
-         {'Width': 0.0722670778632164, 'Height': 0.1512720286846161, 'Left': 0.3914872407913208, 'Top': 0.24376636743545532}],
-        ['https://www.wikidata.org/wiki/Q1124', 'https://www.wikidata.org/wiki/Q212659']],
-        ["./epst.jpeg", "Provide image description.",
-        ['Lisa Randall', 'Kip S. Thorne', 'David Gross', 'Stephen Hawking', 'Brenda Chapman'],
-        [{'Width': 0.09916354715824127, 'Height': 0.166521355509758, 'Left': 0.7962431311607361, 'Top': 0.4121580123901367},
-         {'Width': 0.07940348237752914, 'Height': 0.1626593917608261, 'Left': 0.6891748905181885, 'Top': 0.33117005228996277},
-         {'Width': 0.06350294500589371, 'Height': 0.12757645547389984, 'Left': 0.544218122959137, 'Top': 0.1575603038072586},
-         {'Width': 0.06830617040395737, 'Height': 0.1128319799900055, 'Left': 0.2937725782394409, 'Top': 0.30404558777809143},
-         {'Width': 0.03966952860355377, 'Height': 0.11658532917499542, 'Left': 0.18093101680278778, 'Top': 0.31299835443496704}],
-        ['https://www.wikidata.org/wiki/Q450404', 'https://www.wikidata.org/wiki/Q323320', 'https://www.wikidata.org/wiki/Q40262', 'https://www.wikidata.org/wiki/Q17714', 'https://www.wikidata.org/wiki/Q429715']]"""
     ]
 
     # Filter examples to only include files that exist
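The example entries above store Rekognition-style boxes: Width/Height/Left/Top are fractions of the image dimensions, not pixels. A short sketch (the helper name is illustrative) of converting one to pixel corners for drawing:

def box_to_pixels(box, img_width, img_height):
    # Rekognition bounding boxes are relative (0.0-1.0); scale to pixels.
    left = int(box['Left'] * img_width)
    top = int(box['Top'] * img_height)
    right = int((box['Left'] + box['Width']) * img_width)
    bottom = int((box['Top'] + box['Height']) * img_height)
    return left, top, right, bottom

# e.g. box_to_pixels({'Width': 0.074, 'Height': 0.159, 'Left': 0.463, 'Top': 0.079}, 1280, 720)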
@@ -424,19 +413,19 @@ def create_interface():
         if os.path.exists(example[0]):
             existing_examples.append(example)
 
-    with gr.Blocks(title="Celebrity Recognition and Q&A System") as app:
-        gr.Markdown("<div style='text-align: center;'><h1 style=' font-size: 28px; '>Celebrity Recognition and Q&A System</h1></div>")
+    with gr.Blocks(title="Celebrity Recognition and VQA System") as app:
+        gr.Markdown("<div style='text-align: center;'><h1 style=' font-size: 28px; '>Celebrity Recognition and VQA</h1></div>")
         gr.Markdown("**RC-MLLM** model is developed based on the Qwen2-VL model through a novel method called **RCVIT (Region-level Context-aware Visual Instruction Tuning)**, using the specially constructed **RCMU dataset** for training. Its core feature is the capability for **Region-level Context-aware Multimodal Understanding (RCMU)**. This means it can simultaneously understand both the visual content of specific regions/objects within an image and their associated textual information (utilizing bounding boxes coordinates), allowing it to respond to user instructions in a more context-aware manner. Simply put, RC-MLLM not only understands images but can also integrate the textual information linked to specific objects within the image for understanding. It achieves outstanding performance on RCMU tasks and is suitable for applications like personalized conversation.")
 
         markdown_content = """
-        📑 [Arxiv](https://arxiv.org/abs/your-paper-id) |
-        🤗 [Checkpoint]() |
+        📑 [Region-Level Context-Aware Multimodal Understanding](https://arxiv.org/abs/2508.12263) |
+        🤗 Models: [RC-Qwen2VL-2b](https://huggingface.co/weihongliang/RC-Qwen2VL-2b/blob/main/README.md) [RC-Qwen2VL-7b](https://huggingface.co/weihongliang/RC-Qwen2VL-7b/blob/main/README.md) |
         📁 [Dataset](https://huggingface.co/your-model-name) |
-        [Github](https://github.com/your-username/your-repo) |
-        🚀 [Personalized Conversation](https://your-project-url.com)
+        [Github](https://github.com/hongliang-wei/RC-MLLM) |
+        🚀 [Personalized Conversation Demo](https://1684c5f6e1c5a19b2c.gradio.live/)
         """
         gr.Markdown(markdown_content)
-        gr.Markdown("📌 Upload an image containing celebrities, the system will recognize them and provide Wikipedia-based Q&A using the RC-Qwen2-VL model.")
+        gr.Markdown("📌 Upload an image containing celebrities, the system will recognize them and provide Wikipedia-based VQA using the RC-Qwen2-VL model.")
 
         with gr.Row():
             with gr.Column(scale=1):
@@ -450,7 +439,7 @@ def create_interface():
                     label="Confidence Threshold (%)",
                     info="Adjust the minimum confidence level for celebrity recognition"
                 )
-                submit_button = gr.Button("Ask RC-Qwen2-VL")
+                submit_button = gr.Button("Ask RC-Qwen2VL-7B")
 
                 # Add examples section
                 if existing_examples:
@@ -512,9 +501,6 @@ def create_interface():
            - "What is the relationship between these people?"
         3. Adjust the confidence threshold slider if needed (lower values will recognize more faces but might be less accurate)
         4. Click the submit button to get the answer
-        5. Or try one of the examples below
-        6. The system caches recognition results for each image and confidence threshold combination
-        7. Cache is automatically cleared when you upload a new image
         """)
 
     return app
@@ -522,6 +508,4 @@ def create_interface():
 # Launch the application
 if __name__ == "__main__":
     app = create_interface()
-    app.launch(share=False)
-
-    #bash /mnt/14T-disk/code/instance-detection/run_script.sh
+    app.launch(share=True)
 