Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| from PIL import Image | |
| from transformers import PaliGemmaForConditionalGeneration, PaliGemmaProcessor | |
# Heading rendered at the top of the main page.
app_title = "Coffe machine captioning app"
st.title(app_title)
@st.cache_resource(show_spinner=False)
def _load_pretrained(model_id):
    """Load and cache the model/processor pair for ``model_id``.

    Streamlit re-executes the whole script on every widget interaction;
    without caching, each rerun (e.g. every image upload) would load the
    full model again. ``st.cache_resource`` keeps a single shared instance
    alive for the lifetime of the server process.
    """
    model = PaliGemmaForConditionalGeneration.from_pretrained(model_id)
    processor = PaliGemmaProcessor.from_pretrained(model_id)
    return model, processor


def load_model():
    """Return the captioning model and its processor.

    Shows a spinner while the (cached) load runs and a success message
    once it has finished.

    Returns:
        A ``(model, processor)`` tuple, where ``model`` is a
        ``PaliGemmaForConditionalGeneration`` and ``processor`` is a
        ``PaliGemmaProcessor``, both loaded from the same checkpoint.
    """
    model_id = "Fer14/paligemma_coffe_machine_caption"
    with st.spinner('Loading model and tokenizer...'):
        # Only the first call actually loads the weights; later reruns get
        # the cached objects back immediately.
        model, processor = _load_pretrained(model_id)
    st.success('Model loaded!')
    return model, processor
# Obtain the captioning model and its processor before building the UI.
model, processor = load_model()

# Sidebar: short usage guide for the user.
with st.sidebar:
    st.title("Instructions")
    st.write(
        """
        1. Upload an image using the file uploader.
        2. Wait for the app to process and generate the caption.
        3. The caption will be displayed in the text area.
        4. Enjoy your caption!
        """
    )

# Main page: image upload widget; the value is None until a file is chosen.
accepted_extensions = ["jpg", "jpeg", "png"]
uploaded_image = st.file_uploader("Choose an image...", type=accepted_extensions)
# Instruction text passed to the processor together with the uploaded image.
# It spells out the caption template and explains every placeholder.
# The wording is reproduced exactly as-is because it is model input.
_PROMPT_LINES = (
    "Generate a caption for the following coffee maker image. The caption has to be of the following structure:",
    '"A <color> <type>, <accessories>, <shape> shaped, with <screen> and <number> <b_color> butons"',
    "",
    "in which:",
    "- color: red, black, blue...",
    "- type: coffee machine, coffee maker, espresso coffee machine...",
    "- accessories: a list of accessories like the ones described above",
    "- shape: cubed, round...",
    "- screen: screen, no screen.",
    "- number: amount of buttons to add",
    "- b_color: color of the buttons",
)
prompt = "\n".join(_PROMPT_LINES)
# Run the captioning pipeline only once the user has uploaded a file.
if uploaded_image is not None:
    # Display the uploaded image
    image = Image.open(uploaded_image).convert("RGB")
    st.image(image, caption='Uploaded Image.', use_column_width=True)

    # Turn the prompt and image into PyTorch tensors for the model.
    inputs = processor(
        text=prompt,
        images=image,
        return_tensors="pt",
        padding="longest",
    )
    # Number of input tokens; generate() returns these followed by the
    # newly generated ones.
    prompt_token_count = inputs["input_ids"].shape[-1]

    with st.spinner('Generating caption...'):
        # NOTE(review): max_length counts the input tokens as well as the
        # generated ones; consider max_new_tokens if inputs can be long.
        output = model.generate(**inputs, max_length=1000)

    # Drop the echoed input at token level before decoding. Cutting the
    # decoded string at len(prompt) characters is fragile because the
    # decoded text is not guaranteed to begin with an exact copy of the
    # prompt (the processor may add or normalise text around it).
    generated_ids = output[0][prompt_token_count:]
    out = processor.decode(generated_ids, skip_special_tokens=True)

    # Display the extracted text
    st.text_area("Coffe machine caption", out, height=100)