Show inference time for both models
Added an extra output that shows inference time for both models. I removed the `@GPU` annotation on the `detect` function because it was causing a ~400 ms hit to the first model's inference time (presumably from ZeroGPU initialization that the second model was then able to take advantage of). With the annotation removed, both models take that initialization hit, giving a more apples-to-apples comparison.
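The shape of the change, as a minimal sketch (with `qwen_infer` and `moondream_infer` as hypothetical stand-ins for the real inference code, and `GPU` assumed to come from the `spaces` package, as in the app):

```python
import time
from spaces import GPU  # ZeroGPU decorator (assumed import; the app uses @GPU)

def qwen_infer(image, prompt):       # hypothetical stand-in for the real Qwen call
    ...

def moondream_infer(image, prompt):  # hypothetical stand-in for the real Moondream call
    ...

@GPU
def detect_model_1(image, prompt):
    t0 = time.perf_counter()
    output = qwen_infer(image, prompt)
    elapsed_ms = (time.perf_counter() - t0) * 1_000
    return output, f"**Inference time:** {elapsed_ms:.0f} ms"

@GPU
def detect_model_2(image, prompt):
    t0 = time.perf_counter()
    output = moondream_infer(image, prompt)
    elapsed_ms = (time.perf_counter() - t0) * 1_000
    return output, f"**Inference time:** {elapsed_ms:.0f} ms"

def detect(image, prompt_1, prompt_2):
    # No @GPU on the wrapper: with it, the ~400 ms ZeroGPU startup cost landed on
    # the first model only; without it, each decorated function pays its own cost,
    # keeping the comparison apples-to-apples.
    out_1, time_1 = detect_model_1(image, prompt_1)
    out_2, time_2 = detect_model_2(image, prompt_2)
    return out_1, time_1, out_2, time_2
```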
Here's what it looks like:

app.py CHANGED

@@ -2,6 +2,7 @@ import random
 import requests
 import json
 import ast
+import time
 
 import matplotlib.pyplot as plt
 from PIL import Image, ImageDraw, ImageFont
@@ -156,6 +157,7 @@ def detect_qwen(image, prompt):
         }
     ]
 
+    t0 = time.perf_counter()
    text = processor_qwen.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor_qwen(
@@ -173,37 +175,41 @@ def detect_qwen(image, prompt):
    output_text = processor_qwen.batch_decode(
        generated_ids_trimmed, do_sample=True, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
+    elapsed_ms = (time.perf_counter() - t0) * 1_000
 
    input_height = inputs['image_grid_thw'][0][1] * 14
    input_width = inputs['image_grid_thw'][0][2] * 14
 
    annotated_image = create_annotated_image(image, output_text, input_height, input_width)
 
-    return annotated_image, output_text
+    time_taken = f"**Inference time ({model_qwen_name}):** {elapsed_ms:.0f} ms"
+    return annotated_image, output_text, time_taken
 
 
 @GPU
 def detect_moondream(image, prompt, category_input):
+    t0 = time.perf_counter()
    if category_input in ["Object Detection", "Visual Grounding + Object Detection"]:
        output_text = model_moondream.detect(image=image, object=prompt)
    elif category_input == "Visual Grounding + Keypoint Detection":
        output_text = model_moondream.point(image=image, object=prompt)
    else:
        output_text = model_moondream.query(image=image, question=prompt, reasoning=True)
+    elapsed_ms = (time.perf_counter() - t0) * 1_000
 
    annotated_image = create_annotated_image_normalized(image=image, json_data=output_text, label="object", explicit_color=None)
-
-    return annotated_image, output_text
 
-@GPU
+    time_taken = f"**Inference time ({model_moondream_name}):** {elapsed_ms:.0f} ms"
+    return annotated_image, output_text, time_taken
+
 def detect(image, prompt_model_1, prompt_model_2, category_input):
    STANDARD_SIZE = (1024, 1024)
    image.thumbnail(STANDARD_SIZE)
 
-    annotated_image_model_1, output_text_model_1 = detect_qwen(image, prompt_model_1)
-    annotated_image_model_2, output_text_model_2 = detect_moondream(image, prompt_model_2, category_input)
+    annotated_image_model_1, output_text_model_1, timing_1 = detect_qwen(image, prompt_model_1)
+    annotated_image_model_2, output_text_model_2, timing_2 = detect_moondream(image, prompt_model_2, category_input)
 
-    return annotated_image_model_1, output_text_model_1, annotated_image_model_2, output_text_model_2
+    return annotated_image_model_1, output_text_model_1, timing_1, annotated_image_model_2, output_text_model_2, timing_2
 
 css_hide_share = """
 button#gradio-share-link-button-0 {
@@ -253,10 +259,12 @@ with gr.Blocks(theme=Ocean(), css=css_hide_share) as demo:
        with gr.Column(scale=1):
            output_image_model_1 = gr.Image(type="pil", label=f"Annotated image for {model_qwen_name}", height=400)
            output_textbox_model_1 = gr.Textbox(label=f"Model response for {model_qwen_name}", lines=10)
+            output_time_model_1 = gr.Markdown()
 
        with gr.Column(scale=1):
            output_image_model_2 = gr.Image(type="pil", label=f"Annotated image for {model_moondream_name}", height=400)
            output_textbox_model_2 = gr.Textbox(label=f"Model response for {model_moondream_name}", lines=10)
+            output_time_model_2 = gr.Markdown()
 
    gr.Markdown("### Examples")
    example_prompts = [
@@ -276,8 +284,15 @@ with gr.Blocks(theme=Ocean(), css=css_hide_share) as demo:
        label="Click an example to populate the input"
    )
 
-    generate_btn.click(
-
+    generate_btn.click(
+        fn=detect,
+        inputs=[image_input, prompt_input_model_1, prompt_input_model_2, category_input],
+        outputs=[
+            output_image_model_1, output_textbox_model_1, output_time_model_1,
+            output_image_model_2, output_textbox_model_2, output_time_model_2
+        ]
+    )
+
 if __name__ == "__main__":
    demo.launch()
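On the Gradio side, the wiring comes down to `detect` returning one extra markdown string per model and the `outputs=` list mirroring that return order, with a `gr.Markdown` component displaying each timing string. A self-contained toy of that pattern (not the Space's actual code; `fake_model` stands in for inference):

```python
import time
import gradio as gr

def fake_model(prompt):
    # Stand-in for a real inference call.
    time.sleep(0.25)
    return f"echo: {prompt}"

def run(prompt):
    t0 = time.perf_counter()
    text = fake_model(prompt)
    elapsed_ms = (time.perf_counter() - t0) * 1_000
    # Return order must match the outputs= list below.
    return text, f"**Inference time:** {elapsed_ms:.0f} ms"

with gr.Blocks() as demo:
    prompt = gr.Textbox(label="Prompt")
    btn = gr.Button("Run")
    out_text = gr.Textbox(label="Model response")
    out_time = gr.Markdown()  # renders the bolded timing string

    btn.click(fn=run, inputs=[prompt], outputs=[out_text, out_time])

if __name__ == "__main__":
    demo.launch()
```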