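"""SmolVLM Vision Chat: a Gradio ChatInterface app for asking questions
about one or more uploaded images with the SmolVLM vision-language model."""
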
import gradio as gr
from transformers import AutoProcessor, AutoModelForVision2Seq, TextIteratorStreamer
from transformers.image_utils import load_image
from threading import Thread
import time
import torch
# Load the SmolVLM model and processor
MODEL_ID = "HuggingFaceTB/SmolVLM-256M-Instruct"  # 256M-parameter SmolVLM checkpoint
print("🔧 Loading SmolVLM model...")
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForVision2Seq.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map="auto",  # automatic CPU/GPU placement (requires `accelerate`)
)
print("✅ Model loaded successfully!")
def model_inference(input_dict, history):
    """Process multimodal input and stream a generated response."""
    text = input_dict["text"]

    # Load any attached images (the multimodal textbox passes file paths)
    images = [load_image(image) for image in input_dict["files"]]

    # Validation: a text query is always required
    if text == "" and not images:
        raise gr.Error("Please input a query and optionally image(s).")
    if text == "":
        raise gr.Error("Please input a text query along with the image(s).")
    # Prepare the conversation in the chat-template format:
    # one image placeholder per attached image, followed by the text query
    resulting_messages = [
        {
            "role": "user",
            "content": [{"type": "image"} for _ in range(len(images))]
            + [{"type": "text", "text": text}],
        }
    ]
    try:
        # Apply the chat template and tokenize text + images together
        prompt = processor.apply_chat_template(resulting_messages, add_generation_prompt=True)
        inputs = processor(text=prompt, images=images if images else None, return_tensors="pt")

        # Move input tensors to the same device as the model
        device = next(model.parameters()).device
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Set up token-by-token streaming: the streamer decodes tokens as
        # generate() produces them and exposes them as an iterator
        streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
        generation_kwargs = dict(
            inputs,
            streamer=streamer,
            max_new_tokens=500,
            min_new_tokens=10,
            no_repeat_ngram_size=2,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
        )

        # Run generation in a background thread so this generator can
        # yield partial output to the UI while tokens are still arriving
        thread = Thread(target=model.generate, kwargs=generation_kwargs)
        thread.start()

        # Stream the response, accumulating decoded text as it comes in
        yield "Thinking..."
        buffer = ""
        for new_text in streamer:
            buffer += new_text
            time.sleep(0.02)  # small delay for smooth streaming
            yield buffer
        thread.join()
    except Exception as e:
        yield f"❌ Error generating response: {str(e)}"
# Example prompts for demonstration (attach your own images via the 📎 button)
examples = [
    [{"text": "What do you see in this image?", "files": []}],
    [{"text": "Describe the colors and objects in this image in detail.", "files": []}],
    [{"text": "What is the mood or atmosphere of this image?", "files": []}],
    [{"text": "Are there any people in this image? What are they doing?", "files": []}],
    [{"text": "What text can you read in this image?", "files": []}],
    [{"text": "Count the number of objects you can see.", "files": []}],
]
# Create the Gradio interface using ChatInterface
demo = gr.ChatInterface(
    fn=model_inference,
    title="🔍 SmolVLM Vision Chat",
    description="""
Chat with **SmolVLM-256M**, a compact but powerful vision-language model!

**How to use:**
1. Upload one or more images using the 📎 button
2. Ask questions about the images
3. Get detailed AI-generated descriptions and answers

**Example questions:**
- "What do you see in this image?"
- "Describe the colors and composition"
- "What text is visible in this image?"
- "Count the objects in this image"

This model can analyze photos, diagrams, documents, artwork, and more!
    """,
    examples=examples,
    textbox=gr.MultimodalTextbox(
        label="💬 Ask about your images...",
        file_types=["image"],
        file_count="multiple",
        placeholder="Upload images and ask questions about them!",
    ),
    stop_btn="⏹️ Stop Generation",
    multimodal=True,
    cache_examples=False,
    theme=gr.themes.Soft(),
    css="""
    .gradio-container {
        max-width: 1000px !important;
    }
    .chat-message {
        border-radius: 10px !important;
    }
    """,
)
if __name__ == "__main__":
    print("🚀 Launching SmolVLM Chat Interface...")
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True,
    )
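
# To run locally: `python app.py`, then open http://localhost:7860 in a browser.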