# Import required libraries
import gradio as gr
import os
import torch
from transformers import AutoProcessor, MllamaForConditionalGeneration
from PIL import Image

# Set up Hugging Face authentication
hf_token = os.getenv("HF_KEY")  # Get token from environment variable
if not hf_token:
    raise ValueError("HF_KEY environment variable not set. Please set your Hugging Face token.")

# Model configuration and loading
model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
model = MllamaForConditionalGeneration.from_pretrained(
    model_name,
    token=hf_token,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(model_name, token=hf_token)

# Define prediction function for image and text processing
def predict(image, text):
    # Prepare the chat messages: a single user turn containing the image and the text prompt
    messages = [
        {"role": "user", "content": [
            {"type": "image"},
            {"type": "text", "text": text}
        ]}
    ]

    # Build the prompt string from the chat template
    input_text = processor.apply_chat_template(messages, add_generation_prompt=True)

    # Process inputs and move them to the model's device
    inputs = processor(image, input_text, return_tensors="pt").to(model.device)

    # Generate the model's response
    outputs = model.generate(**inputs, max_new_tokens=100)

    # Decode the generated tokens into text
    response = processor.decode(outputs[0], skip_special_tokens=True)
    return response

# Set up the Gradio interface
interface = gr.Interface(
    fn=predict,
    inputs=[
        gr.Image(type="pil", label="Image Input"),
        gr.Textbox(label="Text Input")
    ],
    outputs=gr.Textbox(label="Output"),
    title="Llama 3.2 11B Vision Instruct Demo",
    description="Meta's Llama 3.2 Vision model generates a response based on an image and a text prompt."
)

# Launch the interface
interface.launch()
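
# Usage note (assumptions: the script is saved as app.py, and you have accepted Meta's
# license for the gated meta-llama/Llama-3.2-11B-Vision-Instruct repo on Hugging Face):
#   export HF_KEY=hf_xxx...   # your Hugging Face access token
#   python app.py             # then open the local Gradio URL printed in the console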