finish adding opt for captioning.

Dongxu Li committed · Commit 81cf2fa · Parent(s): f7f5be8

app.py CHANGED
@@ -14,7 +14,7 @@ def encode_image(image):
     return buffered
 
 
-def query_api(
+def query_chat_api(
     image, prompt, decoding_method, temperature, len_penalty, repetition_penalty
 ):
 
@@ -41,6 +41,34 @@ def query_api(
         return "Error: " + response.text
 
 
+def query_caption_api(
+    image, decoding_method, temperature, len_penalty, repetition_penalty
+):
+
+    url = endpoint.url
+    # replace /generate with /caption
+    url = url.replace("/generate", "/caption")
+
+    headers = {"User-Agent": "BLIP-2 HuggingFace Space"}
+
+    data = {
+        "use_nucleus_sampling": decoding_method == "Nucleus sampling",
+        "temperature": temperature,
+        "length_penalty": len_penalty,
+        "repetition_penalty": repetition_penalty,
+    }
+
+    image = encode_image(image)
+    files = {"image": image}
+
+    response = requests.post(url, data=data, files=files, headers=headers)
+
+    if response.status_code == 200:
+        return response.json()
+    else:
+        return "Error: " + response.text
+
+
 def postprocess_output(output):
     # if last character is not a punctuation, add a full stop
     if not output[0][-1] in string.punctuation:
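For orientation, the new endpoint can also be exercised outside the Space. The following is a minimal standalone sketch of the same request, not code from this commit: CAPTION_URL is a placeholder for the real URL (which app.py derives from endpoint.url), and the JPEG re-encoding mirrors what encode_image() is assumed to produce.

# Minimal standalone client for the caption endpoint (sketch; CAPTION_URL
# is hypothetical -- the Space derives the real URL from endpoint.url).
import io

import requests
from PIL import Image

CAPTION_URL = "https://example.com/caption"  # placeholder

def caption_file(path):
    # Re-encode the image to JPEG bytes, as encode_image() is assumed to do.
    buffered = io.BytesIO()
    Image.open(path).convert("RGB").save(buffered, format="JPEG")
    buffered.seek(0)

    data = {
        "use_nucleus_sampling": False,  # "Beam search" in the UI
        "temperature": 1.0,
        "length_penalty": 1.0,
        "repetition_penalty": 1.5,
    }
    response = requests.post(
        CAPTION_URL,
        data=data,
        files={"image": buffered},
        headers={"User-Agent": "BLIP-2 HuggingFace Space"},
    )
    response.raise_for_status()
    # The code above treats the response body as a JSON list of captions.
    return response.json()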
@@ -49,7 +77,7 @@ def postprocess_output(output):
     return output
 
 
-def inference(
+def inference_chat(
     image,
     text_input,
     decoding_method,
@@ -64,7 +92,7 @@ def inference(
     prompt = " ".join(history)
     print(prompt)
 
-    output = query_api(
+    output = query_chat_api(
         image, prompt, decoding_method, temperature, length_penalty, repetition_penalty
     )
     output = postprocess_output(output)
@@ -77,6 +105,20 @@ def inference(
     return {chatbot: chat, state: history}
 
 
+def inference_caption(
+    image,
+    decoding_method,
+    temperature,
+    length_penalty,
+    repetition_penalty,
+):
+    output = query_caption_api(
+        image, decoding_method, temperature, length_penalty, repetition_penalty
+    )
+
+    return output[0]
+
+
 title = """<h1 align="center">BLIP-2</h1>"""
 description = """Gradio demo for BLIP-2, a multimodal chatbot from Salesforce Research. To use it, simply upload your image, or click one of the examples to load them. Please visit our <a href='https://github.com/salesforce/LAVIS/tree/main/projects/blip2' target='_blank'>project webpage</a>.</p> 
 <p> <strong>Disclaimer</strong>: This is a research prototype and is not intended for production use. No data including but not restricted to text and images is collected. </p>"""
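Since the caption API returns a list of strings and inference_caption() forwards output[0], callers receive a single plain string. A hypothetical call, assuming app.py's module context is loaded and the backend is reachable:

# Hypothetical usage inside app.py's module context; "example.jpg" is any
# local test image, and the backend must be reachable for this to work.
from PIL import Image

image = Image.open("example.jpg")
caption = inference_caption(
    image,
    decoding_method="Beam search",  # one of the gr.Radio choices
    temperature=1.0,
    length_penalty=1.0,
    repetition_penalty=1.5,
)
print(caption)  # a single caption string, i.e. output[0]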
@@ -101,16 +143,15 @@ with gr.Blocks() as iface:
     with gr.Row():
         with gr.Column():
             image_input = gr.Image(type="pil")
-            text_input = gr.Textbox(lines=2, label="Text input")
-
-            sampling = gr.Radio(
-                choices=["Beam search", "Nucleus sampling"],
-                value="Beam search",
-                label="Text Decoding Method",
-                interactive=True,
-            )
 
             with gr.Row():
+                sampling = gr.Radio(
+                    choices=["Beam search", "Nucleus sampling"],
+                    value="Beam search",
+                    label="Text Decoding Method",
+                    interactive=True,
+                )
+
                 temperature = gr.Slider(
                     minimum=0.5,
                     maximum=1.0,
@@ -134,13 +175,32 @@ with gr.Blocks() as iface:
                     value=10.0,
                     step=0.5,
                     interactive=True,
-                    label="…",
+                    label="Repeat Penalty",
+                )
+
+            with gr.Row():
+                caption_output = gr.Textbox(lines=2, label="Caption Output")
+                caption_button = gr.Button(
+                    value="Caption it!", interactive=True, variant="primary"
+                )
+                caption_button.click(
+                    inference_caption,
+                    [
+                        image_input,
+                        sampling,
+                        temperature,
+                        len_penalty,
+                        rep_penalty,
+                    ],
+                    [caption_output],
                 )
 
         with gr.Column():
+            chat_input = gr.Textbox(lines=2, label="Chat Input")
+
             with gr.Row():
                 chatbot = gr.Chatbot()
-                image_input.change(lambda: (None, []), [], [chatbot, state])
+                image_input.change(lambda: (None, "", "", []), [], [chatbot, chat_input, caption_output, state])
 
             with gr.Row():
 
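The click wiring above follows Gradio's standard pattern: a handler function, a list of input components whose current values become positional arguments, and a list of output components that receive the return value. The sketch below reproduces that pattern in isolation, with a stub in place of inference_caption() so the layout can be tried without the backend; all names and slider ranges here are illustrative.

# Self-contained sketch of the caption-button wiring; fake_caption is a stub
# standing in for inference_caption(), and every name here is illustrative.
import gradio as gr

def fake_caption(image, decoding_method, temperature, len_penalty, rep_penalty):
    # Stand-in for the real API call; echoes the decoding settings instead.
    return f"a caption (method={decoding_method}, T={temperature})"

with gr.Blocks() as demo:
    image_input = gr.Image(type="pil")
    sampling = gr.Radio(
        ["Beam search", "Nucleus sampling"],
        value="Beam search",
        label="Text Decoding Method",
    )
    temperature = gr.Slider(minimum=0.5, maximum=1.0, value=1.0, step=0.1, label="Temperature")
    len_penalty = gr.Slider(minimum=-1.0, maximum=2.0, value=1.0, step=0.2, label="Length Penalty")
    rep_penalty = gr.Slider(minimum=1.0, maximum=10.0, value=1.5, step=0.5, label="Repeat Penalty")
    caption_output = gr.Textbox(lines=2, label="Caption Output")
    caption_button = gr.Button("Caption it!")
    # Input values are passed positionally to fake_caption; its return value
    # fills caption_output.
    caption_button.click(
        fake_caption,
        [image_input, sampling, temperature, len_penalty, rep_penalty],
        [caption_output],
    )

demo.launch()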
@@ -148,17 +208,17 @@ with gr.Blocks() as iface:
                 clear_button.click(
                     lambda: ("", None, [], []),
                     [],
-                    [text_input, image_input, chatbot, state],
+                    [chat_input, image_input, chatbot, state],
                 )
 
                 submit_button = gr.Button(
                     value="Submit", interactive=True, variant="primary"
                 )
                 submit_button.click(
-                    inference,
+                    inference_chat,
                     [
                         image_input,
-                        text_input,
+                        chat_input,
                         sampling,
                         temperature,
                         len_penalty,
@@ -170,7 +230,7 @@ with gr.Blocks() as iface:
 
     examples = gr.Examples(
         examples=examples,
-        inputs=[image_input, text_input],
+        inputs=[image_input, chat_input],
     )
 
 iface.queue(concurrency_count=1, api_open=False, max_size=20)
