Chengyue Wu committed on
Commit 8e1b521 · 1 Parent(s): a0635c6

update readme

Files changed (1)
  1. README.md +35 -118
README.md CHANGED
@@ -31,127 +31,44 @@ model = AutoModel.from_config(config, trust_remote_code=True)
  # or directly from_pretrained
  model = AutoModel.from_pretrained(model_path, trust_remote_code=True, device_map="auto")

- # examples generate with raw text
- res = model.generate_content([
-     "how are you today?"
+ files = [
+     f"output/sana_test_prompt/0.png",
+     f"output/sana_test_prompt/1.png"
+ ],
+
+ prompt = "YOUR_GENERATED_PROMPT"
+
+ prompt = f"""You are an AI assistant specializing in image analysis and ranking. Your task is to analyze and compare image based on how well they match the given prompt.
+ The given prompt is:{prompt}. Please consider the prompt and the image to make a decision and response directly with 'yes' or 'no'.
+ """
+
+ r1, scores1 = model.generate_content([
+     PIL.Image.open(files[0]),
+     prompt
  ])
- print(colored(res, "cyan", attrs=["bold"]))

- print("---" * 40)
-
- # examples generate with text + image
- import PIL.Image
- response = model.generate_content([
-     PIL.Image.open("inference_test/test_data/caption_meat.jpeg"),
-     "describe the image?"
+ r2, scores2 = model.generate_content([
+     PIL.Image.open(files[1]),
+     prompt
  ])
- print(colored(response, "cyan", attrs=["bold"]))
- ```
-
- ## AutoProcessor
-
- we also support `AutoProcessor` class to ease data preparation for training and finetuning.

-
- ### single call
-
- ```python
- from transformers import AutoProcessor, AutoModel
-
- model_path = "Efficient-Large-Model/NVILA-Lite-2B-hf-preview"
- processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
- model = AutoModel.from_pretrained(model_path, trust_remote_code=True, device_map="auto")
- # important: set model to eval mode, otherwise the model will be in training mode and will pad to right.
- model.eval()
-
- gpt_conv = [{
-     "role": "user",
-     "content": [
-         {"type": "image", "path": "demo_images/demo_img_1.png"},
-         {"type": "text", "text": "Describe this image."}
-     ]
- }]
- text = processor.apply_chat_template(gpt_conv, tokenize=False, add_generation_prompt=True)
- inputs = processor([text])
-
- output_ids = model.generate(
-     input_ids=inputs.input_ids,
-     media=inputs.media,
-     media_config=inputs.media_config,
-     generation_config=model.generation_config,
-     max_new_tokens=256,
- )
- print(processor.tokenizer.batch_decode(output_ids, skip_special_tokens=True))
-
- ##### the above code is equivalent to
- # response = model.generate_content([
- #     PIL.Image.open("demo_images/demo_img_1.png"),
- #     "describe the image?"
- # ])
- # print(colored(response, "cyan", attrs=["bold"]))
+ if r1 == r2:
+     if r1 == "yes":
+         # pick the one with higher score for yes
+         if scores1[0][0, yes_id] > scores2[0][0, yes_id]:
+             selected_file = files[0]
+         else:
+             selected_file = files[1]
+     else:
+         # pick the one with less score for no
+         if scores1[0][0, no_id] < scores2[0][0, no_id]:
+             selected_file = files[0]
+         else:
+             selected_file = files[1]
+ else:
+     if r1 == "yes":
+         selected_file = files[0]
+     else:
+         selected_file = files[1]
+
  ```
-
- ### batch call
-
- ```python
- from transformers import AutoProcessor, AutoModel
-
- model_path = "Efficient-Large-Model/NVILA-Lite-2B-hf-preview"
- model_path = "./NVILA-Lite-2B-hf-preview"
- processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
- model = AutoModel.from_pretrained(model_path, trust_remote_code=True, device_map="auto")
- # important: set model to eval mode, otherwise the model will be in training mode and will pad to right.
- model.eval()
-
- gpt_conv1 = [{
-     "role": "user",
-     "content": [
-         {"type": "image", "path": "demo_images/demo_img_1.png"},
-         {"type": "text", "text": "Describe this image."}
-     ]
- }]
- gpt_conv2 = [{
-     "role": "user",
-     "content": [
-         {"type": "image", "path": "demo_images/demo_img_2.png"},
-         {"type": "text", "text": "Describe this image for me. Provide a detailed description of the image."}
-     ]
- }]
-
- messages = [gpt_conv1, gpt_conv2]
- texts = [
-     processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True)
-     for msg in messages
- ]
- inputs = processor(texts)
-
- output_ids = model.generate(
-     input_ids=inputs.input_ids,
-     media=inputs.media,
-     media_config=inputs.media_config,
-     generation_config=model.generation_config,
-     max_new_tokens=256,
- )
- output_texts = processor.tokenizer.batch_decode(output_ids, skip_special_tokens=True)
- print(output_texts[0])
- print("---" * 40)
- print(output_texts[1])
- ```
-
-
- ## Model Convert
-
- The follwing code converts a convetional NVILA model to a HF compatible model.
-
- ```python
- import os, os.path as osp
- from transformers import AutoConfig, AutoModel, AutoProcessor, AutoTokenizer, AutoImageProcessor
-
- model_path = "Efficient-Large-Model/NVILA-Lite-2B"
- output_dir = "NVILA-Lite-2B-hf-preview"
-
- if osp.isdir(output_dir):
-     shutil.rmtree(output_dir)
- from llava.remote_code.modeling_vila import VILAForCasualLM
- VILAForCasualLM.convert_vila_dev_ckpt_to_remote(model_path, output_dir, copy=False)
- ```
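
Note on the added (`+`) snippet: it leaves a few names undefined in the hunk itself (`yes_id`, `no_id`, the `PIL` import), and the trailing comma after `files = [...]` would make `files` a one-element tuple rather than a list. Below is a minimal self-contained sketch of the same ranking flow. It assumes `generate_content` returns a `(text, scores)` pair as the snippet implies, and that the "yes"/"no" answers can be mapped to token ids with `AutoTokenizer`; both are assumptions for illustration, not documented API.

```python
# Hypothetical, self-contained version of the ranking snippet added above.
# Assumptions: generate_content returns (text, scores) where scores is a
# sequence of per-step logit tensors, and "yes"/"no" each map to a single
# tokenizer id. Adjust names and paths to your setup.
import PIL.Image
from transformers import AutoModel, AutoTokenizer

model_path = "Efficient-Large-Model/NVILA-Lite-2B-hf-preview"
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, device_map="auto")
model.eval()

# Look up the token ids used for the yes/no comparison (assumed single tokens).
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
yes_id = tokenizer.encode("yes", add_special_tokens=False)[0]
no_id = tokenizer.encode("no", add_special_tokens=False)[0]

# Candidate images (plain list; the trailing comma in the README snippet
# would turn this into a tuple).
files = [
    "output/sana_test_prompt/0.png",
    "output/sana_test_prompt/1.png",
]

user_prompt = "YOUR_GENERATED_PROMPT"
prompt = (
    "You are an AI assistant specializing in image analysis and ranking. "
    "Your task is to analyze and compare images based on how well they match the given prompt.\n"
    f"The given prompt is: {user_prompt}. Please consider the prompt and the image "
    "to make a decision and respond directly with 'yes' or 'no'."
)

# Ask the model to judge each image against the prompt.
r1, scores1 = model.generate_content([PIL.Image.open(files[0]), prompt])
r2, scores2 = model.generate_content([PIL.Image.open(files[1]), prompt])

if r1 == r2:
    if r1 == "yes":
        # Both answered "yes": keep the image with the higher "yes" logit.
        selected_file = files[0] if scores1[0][0, yes_id] > scores2[0][0, yes_id] else files[1]
    else:
        # Both answered "no": keep the image with the lower "no" logit.
        selected_file = files[0] if scores1[0][0, no_id] < scores2[0][0, no_id] else files[1]
else:
    # Answers differ: keep whichever image was answered "yes".
    selected_file = files[0] if r1 == "yes" else files[1]

print(selected_file)
```

The tie-breaking mirrors the committed snippet: when both images receive the same answer, the "yes" case keeps the higher "yes" logit and the "no" case keeps the lower "no" logit; otherwise the image that was answered "yes" is selected.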