Chengyue Wu committed
Commit · 8e1b521
Parent(s): a0635c6
update readme
README.md CHANGED
@@ -31,127 +31,44 @@ model = AutoModel.from_config(config, trust_remote_code=True)
 # or directly from_pretrained
 model = AutoModel.from_pretrained(model_path, trust_remote_code=True, device_map="auto")

-
-
-
+files = [
+    f"output/sana_test_prompt/0.png",
+    f"output/sana_test_prompt/1.png"
+],
+
+prompt = "YOUR_GENERATED_PROMPT"
+
+prompt = f"""You are an AI assistant specializing in image analysis and ranking. Your task is to analyze and compare image based on how well they match the given prompt.
+The given prompt is:{prompt}. Please consider the prompt and the image to make a decision and response directly with 'yes' or 'no'.
+"""
+
+r1, scores1 = model.generate_content([
+    PIL.Image.open(files[0]),
+    prompt
 ])
-print(colored(res, "cyan", attrs=["bold"]))

-
-
-
-import PIL.Image
-response = model.generate_content([
-    PIL.Image.open("inference_test/test_data/caption_meat.jpeg"),
-    "describe the image?"
+r2, scores2 = model.generate_content([
+    PIL.Image.open(files[1]),
+    prompt
 ])
-print(colored(response, "cyan", attrs=["bold"]))
-```
-
-## AutoProcessor
-
-we also support `AutoProcessor` class to ease data preparation for training and finetuning.

+if r1 == r2:
+    if r1 == "yes":
+        # pick the one with higher score for yes
+        if scores1[0][0, yes_id] > scores2[0][0, yes_id]:
+            selected_file = files[0]
+        else:
+            selected_file = files[1]
+    else:
+        # pick the one with less score for no
+        if scores1[0][0, no_id] < scores2[0][0, no_id]:
+            selected_file = files[0]
+        else:
+            selected_file = files[1]
+else:
+    if r1 == "yes":
+        selected_file = files[0]
+    else:
+        selected_file = files[1]

-### single call
-
-```python
-from transformers import AutoProcessor, AutoModel
-
-model_path = "Efficient-Large-Model/NVILA-Lite-2B-hf-preview"
-processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
-model = AutoModel.from_pretrained(model_path, trust_remote_code=True, device_map="auto")
-# important: set model to eval mode, otherwise the model will be in training mode and will pad to right.
-model.eval()
-
-gpt_conv = [{
-    "role": "user",
-    "content": [
-        {"type": "image", "path": "demo_images/demo_img_1.png"},
-        {"type": "text", "text": "Describe this image."}
-    ]
-}]
-text = processor.apply_chat_template(gpt_conv, tokenize=False, add_generation_prompt=True)
-inputs = processor([text])
-
-output_ids = model.generate(
-    input_ids=inputs.input_ids,
-    media=inputs.media,
-    media_config=inputs.media_config,
-    generation_config=model.generation_config,
-    max_new_tokens=256,
-)
-print(processor.tokenizer.batch_decode(output_ids, skip_special_tokens=True))
-
-##### the above code is equivalent to
-# response = model.generate_content([
-#     PIL.Image.open("demo_images/demo_img_1.png"),
-#     "describe the image?"
-# ])
-# print(colored(response, "cyan", attrs=["bold"]))
 ```
-
-### batch call
-
-```python
-from transformers import AutoProcessor, AutoModel
-
-model_path = "Efficient-Large-Model/NVILA-Lite-2B-hf-preview"
-model_path = "./NVILA-Lite-2B-hf-preview"
-processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
-model = AutoModel.from_pretrained(model_path, trust_remote_code=True, device_map="auto")
-# important: set model to eval mode, otherwise the model will be in training mode and will pad to right.
-model.eval()
-
-gpt_conv1 = [{
-    "role": "user",
-    "content": [
-        {"type": "image", "path": "demo_images/demo_img_1.png"},
-        {"type": "text", "text": "Describe this image."}
-    ]
-}]
-gpt_conv2 = [{
-    "role": "user",
-    "content": [
-        {"type": "image", "path": "demo_images/demo_img_2.png"},
-        {"type": "text", "text": "Describe this image for me. Provide a detailed description of the image."}
-    ]
-}]
-
-messages = [gpt_conv1, gpt_conv2]
-texts = [
-    processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True)
-    for msg in messages
-]
-inputs = processor(texts)
-
-output_ids = model.generate(
-    input_ids=inputs.input_ids,
-    media=inputs.media,
-    media_config=inputs.media_config,
-    generation_config=model.generation_config,
-    max_new_tokens=256,
-)
-output_texts = processor.tokenizer.batch_decode(output_ids, skip_special_tokens=True)
-print(output_texts[0])
-print("---" * 40)
-print(output_texts[1])
-```
-
-
-## Model Convert
-
-The follwing code converts a convetional NVILA model to a HF compatible model.
-
-```python
-import os, os.path as osp
-from transformers import AutoConfig, AutoModel, AutoProcessor, AutoTokenizer, AutoImageProcessor
-
-model_path = "Efficient-Large-Model/NVILA-Lite-2B"
-output_dir = "NVILA-Lite-2B-hf-preview"
-
-if osp.isdir(output_dir):
-    shutil.rmtree(output_dir)
-from llava.remote_code.modeling_vila import VILAForCasualLM
-VILAForCasualLM.convert_vila_dev_ckpt_to_remote(model_path, output_dir, copy=False)
-```
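For reference, the added ranking snippet indexes `scores1` and `scores2` with `yes_id` and `no_id`, which the diff leaves undefined. Below is a minimal sketch of one way they might be obtained, assuming they are the tokenizer ids of the single tokens "yes" and "no", and that each `scores[k]` holds the logits of the k-th generated token with shape `(1, vocab_size)`; neither assumption is confirmed by this commit.

```python
from transformers import AutoProcessor

# Hypothetical setup for the undefined `yes_id` / `no_id` in the snippet above.
model_path = "Efficient-Large-Model/NVILA-Lite-2B-hf-preview"
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)

# Assumption: "yes" and "no" each map to a single token for this tokenizer,
# so the first id returned by encode() is the one the score tensors use.
yes_id = processor.tokenizer.encode("yes", add_special_tokens=False)[0]
no_id = processor.tokenizer.encode("no", add_special_tokens=False)[0]

# Intended reading of the scores in the snippet above (assumption):
# scores1[0] is the first-step logits with shape (1, vocab_size), so
# scores1[0][0, yes_id] is the model's "yes" logit for files[0].
```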