freddyaboulton and taesiri committed
Commit fc701a8 · 0 Parent(s)

Duplicate from taesiri/BLIP-2

Co-authored-by: taesiri <[email protected]>

.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ merlion.png filter=lfs diff=lfs merge=lfs -text
5kstbz-0001.png ADDED
Blue_Jay_0044_62759.jpg ADDED
ILSVRC2012_val_00000008.JPEG ADDED
README.md ADDED
@@ -0,0 +1,14 @@
+ ---
+ title: BLIP-2
+ emoji: 👁
+ colorFrom: purple
+ colorTo: gray
+ sdk: gradio
+ sdk_version: 3.17.0
+ app_file: app.py
+ pinned: false
+ license: other
+ duplicated_from: taesiri/BLIP-2
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,124 @@
+ import os
+
+ import gradio as gr
+ import numpy as np
+ import torch
+ from lavis.models import load_model_and_preprocess
+ from PIL import Image
+
+ device = torch.device("cuda") if torch.cuda.is_available() else "cpu"
+
+ model, vis_processors, _ = load_model_and_preprocess(
+     name="blip2_opt", model_type="pretrain_opt2.7b", is_eval=True, device=device
+ )
+
+
+ def generate_caption(image, caption_type):
+     image = vis_processors["eval"](image).unsqueeze(0).to(device)
+
+     if caption_type == "Beam Search":
+         caption = model.generate({"image": image})
+     else:
+         caption = model.generate(
+             {"image": image}, use_nucleus_sampling=True, num_captions=3
+         )
+
+     caption = "\n".join(caption)
+
+     if torch.cuda.is_available():
+         torch.cuda.empty_cache()
+
+     return caption
+
+
+ def chat(input_image, question, history):
+     history = history or []
+     question = question.lower()
+
+     image = vis_processors["eval"](input_image).unsqueeze(0).to(device)
+
+     clean = lambda x: x.replace("<p>", "").replace("</p>", "").replace("\n", "")
+     clean_h = lambda x: (clean(x[0]), clean(x[1]))
+     context = list(map(clean_h, history))
+     template = "Question: {} Answer: {}."
+     prompt = (
+         " ".join(
+             [template.format(context[i][0], context[i][1]) for i in range(len(context))]
+         )
+         + " Question: "
+         + question
+         + " Answer:"
+     )
+
+     response = model.generate({"image": image, "prompt": prompt})
+     history.append((question, response[0]))
+
+     return history, history
+
+
+ def clear_chat(history):
+     return [], []
+
+
+ with gr.Blocks() as demo:
+     gr.Markdown(
+         "### BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models"
+     )
+     gr.Markdown(
+         "This demo uses the `pretrain_opt2.7b` weights. For more information please visit [Github](https://github.com/salesforce/LAVIS/tree/main/projects/blip2) or [Paper](https://arxiv.org/abs/2301.12597)."
+     )
+
+     with gr.Row():
+         with gr.Column():
+             input_image = gr.Image(label="Image", type="pil")
+             caption_type = gr.Radio(
+                 ["Beam Search", "Nucleus Sampling"],
+                 label="Caption Decoding Strategy",
+                 value="Beam Search",
+             )
+             btn_caption = gr.Button("Generate Caption")
+             output_text = gr.Textbox(label="Answer", lines=5)
+
+         with gr.Column():
+             chatbot = gr.Chatbot().style(color_map=("green", "pink"))
+             chat_state = gr.State()
+
+             question_txt = gr.Textbox(label="Question", lines=1)
+             btn_answer = gr.Button("Generate Answer")
+             btn_clear = gr.Button("Clear Chat")
+
+     btn_caption.click(
+         generate_caption, inputs=[input_image, caption_type], outputs=[output_text]
+     )
+
+     btn_answer.click(
+         chat,
+         inputs=[input_image, question_txt, chat_state],
+         outputs=[chatbot, chat_state],
+     )
+
+     btn_clear.click(clear_chat, inputs=[chat_state], outputs=[chatbot, chat_state])
+
+     gr.Examples(
+         [
+             ["./merlion.png", "Beam Search", "which city is this?"],
+             [
+                 "./Blue_Jay_0044_62759.jpg",
+                 "Beam Search",
+                 "what is the name of this bird?",
+             ],
+             ["./5kstbz-0001.png", "Beam Search", "where is the man standing?"],
+             [
+                 "ILSVRC2012_val_00000008.JPEG",
+                 "Beam Search",
+                 "Name the colors of macarons you see in the image.",
+             ],
+         ],
+         inputs=[input_image, caption_type, question_txt],
+     )
+
+     gr.Markdown(
+         "Sample images are taken from [ImageNet](https://paperswithcode.com/sota/image-classification-on-imagenet), [CUB](https://paperswithcode.com/dataset/cub-200-2011) and [GamePhysics](https://asgaardlab.github.io/CLIPxGamePhysics/) datasets."
+     )
+
+ demo.launch()
merlion.png ADDED

Git LFS Details

  • SHA256: f1f3b6a507ec92e8f47ac6d7c64e11b03fcba8c550bcb6851f80e261e8951431
  • Pointer size: 132 Bytes
  • Size of remote file: 1.6 MB
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ torch
+ torchvision
+ salesforce-lavis
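
For reference, a minimal sketch of the same captioning and VQA path that app.py wires into Gradio, run as a plain script. It reuses only the LAVIS calls already shown in app.py; the image filename `example.jpg` is a hypothetical placeholder, and it assumes the dependencies from requirements.txt are installed.

```python
# Minimal sketch: caption and question a single image with the LAVIS API used in app.py.
# Assumes `pip install salesforce-lavis` and a local image named example.jpg (hypothetical).
import torch
from PIL import Image
from lavis.models import load_model_and_preprocess

device = torch.device("cuda") if torch.cuda.is_available() else "cpu"

# Same checkpoint as the Space: BLIP-2 with a frozen OPT-2.7B language model.
model, vis_processors, _ = load_model_and_preprocess(
    name="blip2_opt", model_type="pretrain_opt2.7b", is_eval=True, device=device
)

raw_image = Image.open("example.jpg").convert("RGB")
image = vis_processors["eval"](raw_image).unsqueeze(0).to(device)

# Beam-search captioning, mirroring the "Beam Search" branch of generate_caption().
print(model.generate({"image": image}))

# Prompted VQA, mirroring the "Question: ... Answer:" prompt format built in chat().
print(model.generate({"image": image, "prompt": "Question: which city is this? Answer:"}))
```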