import gradio as gr
from transformers import AutoProcessor, AutoModelForVision2Seq, TextIteratorStreamer
from transformers.image_utils import load_image
from threading import Thread
import time
import torch

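# Load the SmolVLM processor and model once at startup. bfloat16 weights and
# device_map="auto" place the model on a GPU when one is available, otherwise CPU.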
print("π§ Loading SmolVLM model...") |
|
processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-Instruct-250M") |
|
model = AutoModelForVision2Seq.from_pretrained( |
|
"HuggingFaceTB/SmolVLM-Instruct-250M", |
|
torch_dtype=torch.bfloat16, |
|
device_map="auto" |
|
) |
|
print("β
Model loaded successfully!") |
|
|
|
def model_inference(input_dict, history):
    """Process multimodal input and stream back a generated response."""
    text = input_dict["text"]

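    # Load any attached files (local paths or URLs) into PIL images.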
    images = [load_image(image) for image in input_dict["files"]]

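    # A text query is required; images are optional.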
if text == "" and not images: |
|
raise gr.Error("Please input a query and optionally image(s).") |
|
|
|
if text == "" and images: |
|
raise gr.Error("Please input a text query along with the image(s).") |
|
|
|
|
|
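    # Build a single user turn: one image placeholder per attached image, followed
    # by the text prompt. The chat template expands the placeholders into the
    # image tokens the model expects.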
    resulting_messages = [
        {
            "role": "user",
            "content": [{"type": "image"} for _ in range(len(images))] + [
                {"type": "text", "text": text}
            ]
        }
    ]

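    # Tokenize the prompt and run generation in a background thread so partial
    # output can be streamed back to the UI while the model is still generating.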
    try:
        prompt = processor.apply_chat_template(resulting_messages, add_generation_prompt=True)
        inputs = processor(text=prompt, images=images if images else None, return_tensors="pt")

        device = next(model.parameters()).device
        inputs = {k: v.to(device) if v is not None else v for k, v in inputs.items()}

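        # TextIteratorStreamer yields decoded text chunks as generate() produces
        # tokens; skip_prompt keeps the input prompt out of the streamed output.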
        streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
        generation_kwargs = dict(
            inputs,
            streamer=streamer,
            max_new_tokens=500,
            min_new_tokens=10,
            no_repeat_ngram_size=2,
            do_sample=True,
            temperature=0.7,
            top_p=0.9
        )

        thread = Thread(target=model.generate, kwargs=generation_kwargs)
        thread.start()

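        # Show an immediate placeholder, then replace it with the growing answer.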
yield "Thinking..." |
|
buffer = "" |
|
|
|
for new_text in streamer: |
|
buffer += new_text |
|
time.sleep(0.02) |
|
yield buffer |
|
|
|
    except Exception as e:
        yield f"❌ Error generating response: {str(e)}"


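# Text-only example prompts; users attach their own images at runtime.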
examples = [
    [{"text": "What do you see in this image?", "files": []}],
    [{"text": "Describe the colors and objects in this image in detail.", "files": []}],
    [{"text": "What is the mood or atmosphere of this image?", "files": []}],
    [{"text": "Are there any people in this image? What are they doing?", "files": []}],
    [{"text": "What text can you read in this image?", "files": []}],
    [{"text": "Count the number of objects you can see.", "files": []}],
]

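# ChatInterface in multimodal mode passes a dict with "text" and "files" keys
# to model_inference and renders each yielded string as the assistant reply.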
demo = gr.ChatInterface(
    fn=model_inference,
    title="🚀 SmolVLM Vision Chat",
    description="""
    Chat with **SmolVLM-256M**, a compact but powerful vision-language model!

    **How to use:**
    1. Upload one or more images using the 📎 button
    2. Ask questions about the images
    3. Get detailed AI-generated descriptions and answers

    **Example questions:**
    - "What do you see in this image?"
    - "Describe the colors and composition"
    - "What text is visible in this image?"
    - "Count the objects in this image"

    This model can analyze photos, diagrams, documents, artwork, and more!
    """,
    examples=examples,
    textbox=gr.MultimodalTextbox(
        label="💬 Ask about your images...",
        file_types=["image"],
        file_count="multiple",
        placeholder="Upload images and ask questions about them!"
    ),
    stop_btn="⏹️ Stop Generation",
    multimodal=True,
    cache_examples=False,
    theme=gr.themes.Soft(),
    css="""
    .gradio-container {
        max-width: 1000px !important;
    }
    .chat-message {
        border-radius: 10px !important;
    }
    """
)

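# When run directly, bind to 0.0.0.0 so the app is reachable from other machines
# or from inside a container.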
if __name__ == "__main__":
    print("🚀 Launching SmolVLM Chat Interface...")
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True
    )