File size: 1,552 Bytes
2d8fe48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9d76bb1
2d8fe48
 
 
 
 
9d76bb1
2d8fe48
 
 
 
8e5e4bb
2d8fe48
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50

import streamlit as st
from PIL import Image
import urllib.request
import torch
from torch import nn
import numpy as np
from VT5 import VT5
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    T5Tokenizer, 
    T5ForConditionalGeneration,
    CLIPVisionModelWithProjection,
    AutoProcessor
)

  
@st.cache_resource
def _load_models():
    """Build the CLIP + Flan-T5 captioning stack once and reuse it.

    Streamlit re-executes this script from top to bottom on every widget
    interaction (including each file upload). Without caching, both
    pretrained checkpoints and ``weights.bin`` would be reloaded on every
    rerun; ``st.cache_resource`` keeps one shared copy alive instead.

    Returns:
        The tuple ``(clip, processor, tokenizer, t5, vt5)``.
    """
    clip = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-base-patch32")
    processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

    tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-small")
    t5 = T5ForConditionalGeneration.from_pretrained("google/flan-t5-small")

    vt5 = VT5(t5, tokenizer, clip)
    # NOTE(review): torch.load unpickles the file; only ship a trusted
    # 'weights.bin' (consider weights_only=True where the installed torch
    # version supports it).
    vt5.load_state_dict(torch.load('weights.bin', map_location=torch.device('cpu')))
    # The app only runs inference, so switch off training-time behaviour
    # such as dropout. Assumes VT5 is an nn.Module (it exposes
    # load_state_dict) -- TODO confirm.
    vt5.eval()
    return clip, processor, tokenizer, t5, vt5


# Keep the original module-level names so the rest of the script is unchanged.
clip, processor, tokenizer, t5, vt5 = _load_models()

def generate_caption(image):
    """Return the caption the VT5 model generates for ``image``.

    Args:
        image: Model input forwarded unchanged to ``vt5.generate_caption``.
            The caller in this script passes the ``pixel_values`` produced
            by the CLIP processor.

    Returns:
        Whatever ``vt5.generate_caption`` returns for the given input.
    """
    # Captioning is inference-only, so skip autograd bookkeeping.
    with torch.no_grad():
        return vt5.generate_caption(image)

# Page flow: title, uploader, then (once a file is provided) preview + caption.
st.title("Image Captioning App")
uploaded_file = st.file_uploader(
    "Choose an image...",
    type=["jpg", "png"],
)

if uploaded_file is not None:
    # Show the upload as-is before it is turned into model input.
    image = Image.open(uploaded_file)
    st.image(
        image,
        caption="Uploaded Image.",
        clamp=True,
        use_column_width=True,
    )

    # Run the CLIP processor and keep only the PyTorch pixel tensor.
    image = processor(images=image, return_tensors="pt").pixel_values

    # Blank spacer line, then a status message ahead of the model call.
    st.write("")
    st.write("Generating caption...")
    caption = generate_caption(image)
    st.write("Caption: ", caption)