"""Gradio demo: image captioning with Salesforce BLIP (blip-image-captioning-large)."""
import gradio as gr
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image  # noqa: F401  (Gradio hands the callback a PIL.Image)
import requests  # noqa: F401  (unused here; kept to avoid breaking unseen users)

# Fall back to CPU when no GPU is present — a hardcoded .to("cuda") crashes
# on CPU-only machines.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Load once at module import; the checkpoint is large and downloads on first run.
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-large"
).to(DEVICE)


def generate_caption(image):
    """Return a text caption for one image using BLIP.

    Parameters
    ----------
    image : PIL.Image.Image
        Image supplied by the Gradio ``Image`` component (``type="pil"``).

    Returns
    -------
    str
        Decoded caption with special tokens stripped.
    """
    # Tokenize/normalize the image and move tensors to the model's device.
    inputs = processor(image, return_tensors="pt").to(DEVICE)
    output_ids = model.generate(**inputs)
    # generate() returns a batch; decode the single (first) sequence.
    return processor.decode(output_ids[0], skip_special_tokens=True)


# NOTE: gr.inputs.* was removed in Gradio 3/4 — use the top-level gr.Image
# component instead of the deprecated gr.inputs.Image.
iface = gr.Interface(
    fn=generate_caption,
    inputs=gr.Image(type="pil"),
    outputs="text",
    title="BLIP Image Captioning",
)

# Guard the launch so importing this module does not start a server.
if __name__ == "__main__":
    iface.launch()