Commit 4b145c8 · committed by geonmo.gu
Parent(s): fba8607

add description

Files changed:
- .gitignore +2 -0
- app.py +17 -5
.gitignore ADDED

@@ -0,0 +1,2 @@
+*.swp
+*.pt
app.py CHANGED

@@ -3,8 +3,6 @@ import torch
 import gradio as gr
 import time
 import clip
-#from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
-#from flores200_codes import flores_codes
 import requests
 import csv
 import json
@@ -22,7 +20,6 @@ os.environ['CUDA_VISIBLE_DEVICES'] = ''
 
 API_URL = "https://api-inference.huggingface.co/models/bigscience/bloom"
 HF_TOKEN = os.environ["HF_TOKEN"]
-headers = {"Authorization": f"Bearer {HF_TOKEN}"}
 
 def load_openimage_classnames(csv_path):
     csv_data = open(csv_path)
@@ -261,8 +258,23 @@ if __name__ == '__main__':
 
     title = "Socratic models for image captioning with BLOOM"
 
-
-
+    description = """
+## Details
+**Without any fine-tuning**, we can do image captioning using visual-language models (e.g., CLIP, SLIP, ...) and large language models (e.g., GPT, BLOOM, ...).
+In this demo, I chose BLOOM as the language model and CLIP ViT-L/14 as the visual-language model.
+An image caption is generated as follows:
+1. Classify whether there are people, where the location is, and what objects are in the input image using the visual-language model.
+2. Build a prompt from the classified results.
+3. Request the BLOOM API with the prompt.
+
+This demo differs slightly from the original method proposed in the Socratic Models paper.
+I use not only the Tencent ML-Images class names but also the OpenImages class names, and I adopt BLOOM as the large language model.
+
+If you want the demo that uses GPT-3 from OpenAI, see https://github.com/geonm/socratic-models-demo.
+
+The demo runs on CPU.
+"""
+
     article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2204.00598'>Socratic Models: Composing Zero-Shot Multimodal Reasoning with Language</a></p>"
     examples = ['k21-1.jpg']
 
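For reference, the three steps in the new `description` map onto roughly the following Python. This is a minimal sketch, not the Space's actual app.py: the candidate class lists, the prompt wording, and the generation parameters are illustrative assumptions; only `API_URL`, `HF_TOKEN`, the CLIP ViT-L/14 backbone, and the hosted BLOOM endpoint come from the commit itself.

```python
import os

import clip
import requests
import torch
from PIL import Image

# Values taken from the diff above; everything else in this sketch is assumed.
API_URL = "https://api-inference.huggingface.co/models/bigscience/bloom"
HF_TOKEN = os.environ["HF_TOKEN"]

device = "cpu"  # the Space runs on CPU
model, preprocess = clip.load("ViT-L/14", device=device)


def classify(image, candidates, template="This is a photo of {}."):
    """Step 1: zero-shot classify the image against candidate texts with CLIP."""
    tokens = clip.tokenize([template.format(c) for c in candidates]).to(device)
    with torch.no_grad():
        image_feat = model.encode_image(image)
        text_feat = model.encode_text(tokens)
        image_feat = image_feat / image_feat.norm(dim=-1, keepdim=True)
        text_feat = text_feat / text_feat.norm(dim=-1, keepdim=True)
        scores = (image_feat @ text_feat.T).squeeze(0)  # cosine similarities
    return candidates[scores.argmax().item()]


# Illustrative candidate lists; the app instead builds them from the
# Tencent ML-Images and OpenImages class names it loads from CSV.
image = preprocess(Image.open("k21-1.jpg")).unsqueeze(0).to(device)
people = classify(image, ["people", "no people"])
place = classify(image, ["indoors", "outdoors", "a city street", "a beach"])
thing = classify(image, ["a dog", "a car", "food", "a building"])

# Step 2: build a prompt from the classified results (wording is illustrative).
prompt = (
    "I am an intelligent image captioning bot. "
    f"This image is taken {place}. There are {people} in it. "
    f"It may contain {thing}. "
    "A creative short caption for this image is:"
)

# Step 3: request the BLOOM API with the prompt.
headers = {"Authorization": f"Bearer {HF_TOKEN}"}
payload = {"inputs": prompt, "parameters": {"max_new_tokens": 32, "return_full_text": False}}
response = requests.post(API_URL, headers=headers, json=payload)
# Text-generation endpoints normally return [{"generated_text": ...}].
print(response.json()[0]["generated_text"])
```

Calling the hosted inference endpoint keeps the Space lightweight: only CLIP runs locally on CPU, while BLOOM generation happens on the Hugging Face API side.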
|