sergiopaniego (HF Staff) committed
Commit cc25f28 · 1 Parent(s): 010d97b
Files changed (2)
  1. app.py +138 -4
  2. requirements.txt +6 -0
app.py CHANGED
@@ -1,7 +1,141 @@
import gradio as gr

- def greet(name):
-     return "Hello " + name + "!!"

- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
- demo.launch()
+ from datasets import load_dataset
+ import numpy as np
+ import torch
+ import random
+ import time
+ import spaces
+ from transformers import AutoProcessor, AutoModelForImageTextToText, AutoModelForVision2Seq


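+ # nuScenes-QA mini ("day" split): each sample pairs six camera views with a question and ground-truth answer.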
+ dataset_vqa = load_dataset(
+     path="KevinNotSmile/nuscenes-qa-mini",
+     name="day",
+     split="train",
+     data_files="day-train/*.arrow",
+ )
+ 
+ 
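+ # Maps UI labels to Hub repo ids; note the 2.2B v1 checkpoint is plain "HuggingFaceTB/SmolVLM-Instruct".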
+ MODEL_VERSIONS = {
+     "SmolVLM-256M-Instruct": "HuggingFaceTB/SmolVLM-256M-Instruct",
+     "SmolVLM-500M-Instruct": "HuggingFaceTB/SmolVLM-500M-Instruct",
+     "SmolVLM-2.2B-Instruct": "HuggingFaceTB/SmolVLM-Instruct",
+     "SmolVLM2-256M-Instruct": "HuggingFaceTB/SmolVLM2-256M-Instruct",
+     "SmolVLM2-500M-Instruct": "HuggingFaceTB/SmolVLM2-500M-Instruct",
+     "SmolVLM2-2.2B-Instruct": "HuggingFaceTB/SmolVLM2-2.2B-Instruct",
+ }
+ 
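+ # SmolVLM (v1) checkpoints load through AutoModelForVision2Seq; SmolVLM2 through AutoModelForImageTextToText.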
+ def load_model_and_processor(version):
+     model_name = MODEL_VERSIONS[version]
+     if version.startswith("SmolVLM-"):
+         model = AutoModelForVision2Seq.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")
+     else:
+         model = AutoModelForImageTextToText.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")
+     processor = AutoProcessor.from_pretrained(model_name)
+     return model, processor
+ 
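+ # ZeroGPU decorator: a GPU is allocated for each call and released afterwards.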
+ @spaces.GPU
+ def predict(model_version):
+     sample = random.choice(dataset_vqa)
+ 
+     model, processor = load_model_and_processor(model_version)
+ 
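+     # Build a chat with six image placeholders, one per camera, filled in order by the images list below.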
+     messages = [
+         {
+             "role": "system",
+             "content": "You are analyzing real-time camera feed from a self-driving car's multi-camera setup. "
+             + "The position of the cameras with respect to the car is: "
+             + "CAM_FRONT_LEFT, CAM_FRONT, CAM_FRONT_RIGHT, CAM_BACK_LEFT, CAM_BACK, CAM_BACK_RIGHT. "
+             + "Your task is to perform precise visual analysis and answer questions about the scene."
+         },
+         {
+             "role": "user",
+             "content": [
+                 {"type": "image"},
+                 {"type": "image"},
+                 {"type": "image"},
+                 {"type": "image"},
+                 {"type": "image"},
+                 {"type": "image"},
+                 {"type": "text", "text": f"Answer the following question. {sample['question']}."},
+             ],
+         },
+         {
+             "role": "assistant",
+             "content": "Answer: "
+         }
+     ]
+ 
+     prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
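+ 
+     # Camera frames in the same order as the image placeholders above.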
+     images = [
+         np.array(sample["CAM_FRONT_LEFT"]),
+         np.array(sample["CAM_FRONT"]),
+         np.array(sample["CAM_FRONT_RIGHT"]),
+         np.array(sample["CAM_BACK_LEFT"]),
+         np.array(sample["CAM_BACK"]),
+         np.array(sample["CAM_BACK_RIGHT"]),
+     ]
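+ 
+     # Move tensors to the model's device; the fp16 cast touches only floating-point
+     # tensors (pixel_values), leaving integer input_ids intact.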
+     inputs = processor(text=prompt, images=images, return_tensors="pt").to(device=model.device).to(torch.float16)
+ 
+     start = time.time()
+     generated_ids = model.generate(**inputs, max_new_tokens=1000)
+     end = time.time()
+ 
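+     # The decoded text is the full chat transcript; the model's reply follows the final "Assistant: ".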
+     generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+     predicted_answer = generated_text.split("Assistant: ")[-1].strip()
+     expected_answer = sample["answer"].strip()
+     question = sample["question"].strip()
+ 
+     is_correct = predicted_answer.lower() == expected_answer.lower()
+     inference_time = round(end - start, 2)
+ 
+     return (
+         images[0], images[1], images[2], images[3], images[4], images[5],
+         question, expected_answer, predicted_answer,
+         "✅ Correct" if is_correct else "❌ Incorrect",
+         f"{inference_time:.2f} seconds"
+     )
+ 
+ 
+ theme = gr.themes.Soft(primary_hue="indigo", secondary_hue="emerald")
+ with gr.Blocks(theme=theme, title="🔍 SmolVLM VQA Demo (NuScenes Dataset)") as demo:
+     gr.Markdown("# SmolVLM VQA Demo (NuScenes Dataset)")
+     model_selector = gr.Dropdown(
+         choices=list(MODEL_VERSIONS.keys()),
+         value="SmolVLM2-2.2B-Instruct",
+         label="Select Model Version"
+     )
+ 
+     predict_button = gr.Button("Predict on Random Sample")
+ 
+     with gr.Row():
+         cam_images_front = [
+             gr.Image(label=cam) for cam in [
+                 "CAM_FRONT_LEFT", "CAM_FRONT", "CAM_FRONT_RIGHT"
+             ]
+         ]
+ 
+     with gr.Row():
+         cam_images_back = [
+             gr.Image(label=cam) for cam in [
+                 "CAM_BACK_LEFT", "CAM_BACK", "CAM_BACK_RIGHT"
+             ]
+         ]
+ 
+     cam_images = cam_images_front + cam_images_back
+ 
+     question_text = gr.Textbox(label="Question")
+     expected_text = gr.Textbox(label="Expected Answer")
+     predicted_text = gr.Textbox(label="Predicted Answer")
+     correctness = gr.Textbox(label="Correct?")
+     timing = gr.Textbox(label="Inference Time")
+ 
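+     # Output order must match predict()'s return tuple: six camera images, then the five text fields.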
+     predict_button.click(
+         fn=predict,
+         inputs=[model_selector],
+         outputs=cam_images + [question_text, expected_text, predicted_text, correctness, timing]
+     )
+ 
+ 
+ demo.launch()
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ matplotlib
+ torch
+ transformers
+ datasets
+ num2words
+ accelerate