|
--- |
|
license: cc-by-nc-4.0 |
|
language: |
|
- en |
|
tags: |
|
- vila |
|
- nvila |
|
- conversational |
|
- multimodal |
|
--- |
|
|
|
Dependency setup: |
|
|
|
```bash |
|
# transformers is pinned to the release this was tested with; other versions
# may also work, but they have not been tested.
# termcolor is installed because the usage example imports it.
pip install transformers==4.46 accelerate opencv-python torchvision einops pillow termcolor
pip install git+https://github.com/bfshi/scaling_on_scales.git
|
``` |
|
|
|
## Usage |
|
|
|
```python |
|
import PIL.Image
from transformers import AutoConfig, AutoModel
from termcolor import colored

model_path = "Efficient-Large-Model/NVILA-Lite-2B-Verifier"

# Option 1: build the model from its config.
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
model = AutoModel.from_config(config, trust_remote_code=True)
# Option 2: load directly with from_pretrained. This rebinds `model`, so when
# adapting this example keep only one of the two options.
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, device_map="auto")

# First token id of each answer word; used below to index into the scores.
yes_id = model.tokenizer.encode("yes", add_special_tokens=False)[0]
no_id = model.tokenizer.encode("no", add_special_tokens=False)[0]

# The two candidate images to compare. There must be no comma after the
# closing bracket: it would wrap the list in a 1-tuple, so files[0] would be
# the whole list and files[1] would raise IndexError.
files = [
    "output/sana_test_prompt/0.png",
    "output/sana_test_prompt/1.png",
]

# The text prompt the candidate images were generated from.
prompt = "YOUR_GENERATED_PROMPT"

# Wrap the generation prompt in the verifier instruction; `prompt` is rebound
# to the full instruction text that is sent to the model.
prompt = f"""You are an AI assistant specializing in image analysis and ranking. Your task is to analyze and compare image based on how well they match the given prompt.
The given prompt is:{prompt}. Please consider the prompt and the image to make a decision and response directly with 'yes' or 'no'.
"""

# Ask the verifier about each image separately. Each call returns the answer
# and its scores.
# NOTE(review): the comparisons below assume the answer is exactly "yes" or
# "no" and that scores[0][0, token_id] is the score of that token for the
# first generated position -- confirm against the model's generate_content.
r1, scores1 = model.generate_content([
    PIL.Image.open(files[0]),
    prompt,
])

r2, scores2 = model.generate_content([
    PIL.Image.open(files[1]),
    prompt,
])

if r1 == r2:
    # Same verdict for both images: break the tie using the scores.
    if r1 == "yes":
        # pick the one with higher score for yes
        if scores1[0][0, yes_id] > scores2[0][0, yes_id]:
            selected_file = files[0]
        else:
            selected_file = files[1]
    else:
        # pick the one with less score for no
        if scores1[0][0, no_id] < scores2[0][0, no_id]:
            selected_file = files[0]
        else:
            selected_file = files[1]
else:
    # Verdicts differ: keep the first image if it got "yes", otherwise the
    # second one.
    if r1 == "yes":
        selected_file = files[0]
    else:
        selected_file = files[1]
|
|
|
``` |
|
|