Chengyue Wu committed
Commit · 8e1b521
Parent(s): a0635c6
update readme
README.md CHANGED
@@ -31,127 +31,44 @@ model = AutoModel.from_config(config, trust_remote_code=True)
 # or directly from_pretrained
 model = AutoModel.from_pretrained(model_path, trust_remote_code=True, device_map="auto")

-
-
-
+files = [
+    f"output/sana_test_prompt/0.png",
+    f"output/sana_test_prompt/1.png"
+],
+
+prompt = "YOUR_GENERATED_PROMPT"
+
+prompt = f"""You are an AI assistant specializing in image analysis and ranking. Your task is to analyze and compare image based on how well they match the given prompt.
+The given prompt is:{prompt}. Please consider the prompt and the image to make a decision and response directly with 'yes' or 'no'.
+"""
+
+r1, scores1 = model.generate_content([
+    PIL.Image.open(files[0]),
+    prompt
 ])
-print(colored(res, "cyan", attrs=["bold"]))

-
-
-
-import PIL.Image
-response = model.generate_content([
-    PIL.Image.open("inference_test/test_data/caption_meat.jpeg"),
-    "describe the image?"
+r2, scores2 = model.generate_content([
+    PIL.Image.open(files[1]),
+    prompt
 ])
-print(colored(response, "cyan", attrs=["bold"]))
-```
-
-## AutoProcessor
-
-we also support `AutoProcessor` class to ease data preparation for training and finetuning.

+if r1 == r2:
+    if r1 == "yes":
+        # pick the one with higher score for yes
+        if scores1[0][0, yes_id] > scores2[0][0, yes_id]:
+            selected_file = files[0]
+        else:
+            selected_file = files[1]
+    else:
+        # pick the one with less score for no
+        if scores1[0][0, no_id] < scores2[0][0, no_id]:
+            selected_file = files[0]
+        else:
+            selected_file = files[1]
+else:
+    if r1 == "yes":
+        selected_file = files[0]
+    else:
+        selected_file = files[1]

-### single call
-
-```python
-from transformers import AutoProcessor, AutoModel
-
-model_path = "Efficient-Large-Model/NVILA-Lite-2B-hf-preview"
-processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
-model = AutoModel.from_pretrained(model_path, trust_remote_code=True, device_map="auto")
-# important: set model to eval mode, otherwise the model will be in training mode and will pad to right.
-model.eval()
-
-gpt_conv = [{
-    "role": "user",
-    "content": [
-        {"type": "image", "path": "demo_images/demo_img_1.png"},
-        {"type": "text", "text": "Describe this image."}
-    ]
-}]
-text = processor.apply_chat_template(gpt_conv, tokenize=False, add_generation_prompt=True)
-inputs = processor([text])
-
-output_ids = model.generate(
-    input_ids=inputs.input_ids,
-    media=inputs.media,
-    media_config=inputs.media_config,
-    generation_config=model.generation_config,
-    max_new_tokens=256,
-)
-print(processor.tokenizer.batch_decode(output_ids, skip_special_tokens=True))
-
-##### the above code is equivalent to
-# response = model.generate_content([
-#     PIL.Image.open("demo_images/demo_img_1.png"),
-#     "describe the image?"
-# ])
-# print(colored(response, "cyan", attrs=["bold"]))
 ```
-
-### batch call
-
-```python
-from transformers import AutoProcessor, AutoModel
-
-model_path = "Efficient-Large-Model/NVILA-Lite-2B-hf-preview"
-model_path = "./NVILA-Lite-2B-hf-preview"
-processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
-model = AutoModel.from_pretrained(model_path, trust_remote_code=True, device_map="auto")
-# important: set model to eval mode, otherwise the model will be in training mode and will pad to right.
-model.eval()
-
-gpt_conv1 = [{
-    "role": "user",
-    "content": [
-        {"type": "image", "path": "demo_images/demo_img_1.png"},
-        {"type": "text", "text": "Describe this image."}
-    ]
-}]
-gpt_conv2 = [{
-    "role": "user",
-    "content": [
-        {"type": "image", "path": "demo_images/demo_img_2.png"},
-        {"type": "text", "text": "Describe this image for me. Provide a detailed description of the image."}
-    ]
-}]
-
-messages = [gpt_conv1, gpt_conv2]
-texts = [
-    processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True)
-    for msg in messages
-]
-inputs = processor(texts)
-
-output_ids = model.generate(
-    input_ids=inputs.input_ids,
-    media=inputs.media,
-    media_config=inputs.media_config,
-    generation_config=model.generation_config,
-    max_new_tokens=256,
-)
-output_texts = processor.tokenizer.batch_decode(output_ids, skip_special_tokens=True)
-print(output_texts[0])
-print("---" * 40)
-print(output_texts[1])
-```
-
-
-## Model Convert
-
-The follwing code converts a convetional NVILA model to a HF compatible model.
-
-```python
-import os, os.path as osp
-from transformers import AutoConfig, AutoModel, AutoProcessor, AutoTokenizer, AutoImageProcessor
-
-model_path = "Efficient-Large-Model/NVILA-Lite-2B"
-output_dir = "NVILA-Lite-2B-hf-preview"
-
-if osp.isdir(output_dir):
-    shutil.rmtree(output_dir)
-from llava.remote_code.modeling_vila import VILAForCasualLM
-VILAForCasualLM.convert_vila_dev_ckpt_to_remote(model_path, output_dir, copy=False)
-```
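For reference, the added ranking snippet indexes `scores1` and `scores2` with `yes_id` and `no_id`, which the diff leaves undefined. Below is a minimal sketch of one way they might be obtained, assuming they are the tokenizer ids of the single tokens "yes" and "no", and that each `scores[k]` holds the logits of the k-th generated token with shape `(1, vocab_size)`; neither assumption is confirmed by this commit.

```python
from transformers import AutoProcessor

# Hypothetical setup for the undefined `yes_id` / `no_id` in the snippet above.
model_path = "Efficient-Large-Model/NVILA-Lite-2B-hf-preview"
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)

# Assumption: "yes" and "no" each map to a single token for this tokenizer,
# so the first id returned by encode() is the one the score tensors use.
yes_id = processor.tokenizer.encode("yes", add_special_tokens=False)[0]
no_id = processor.tokenizer.encode("no", add_special_tokens=False)[0]

# Intended reading of the scores in the snippet above (assumption):
# scores1[0] is the first-step logits with shape (1, vocab_size), so
# scores1[0][0, yes_id] is the model's "yes" logit for files[0].
```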