OPEA/llama-joycaption-alpha-two-hf-llava-int4-sym-inc

@@ -25,8 +25,8 @@ quantized_model_path="OPEA/llama-joycaption-alpha-two-hf-llava-int4-sym-inc"
 # Load JoyCaption INT4 Model
 processor = AutoProcessor.from_pretrained(quantized_model_path)
-llava_model = LlavaForConditionalGeneration.from_pretrained(quantized_model_path, device_map=0)
-llava_model.eval()
 image_url = "http://images.cocodataset.org/train2017/000000116003.jpg"
 content = "Write a descriptive caption for this image in a formal tone."
@@ -48,9 +48,9 @@ with torch.no_grad():
 	assert isinstance(prompt, str)
 	inputs = processor(text=[prompt], images=[image], return_tensors="pt").to(model.device)
 	inputs['pixel_values'] = inputs['pixel_values'].to(model.dtype)
 	# Generate the captions
-	generate_ids = llava_model.generate(
 		**inputs,
 		max_new_tokens=50,
 		do_sample=False,
@@ -60,34 +60,41 @@ with torch.no_grad():
 		top_k=None,
 		top_p=0.9,
 	)[0]
 	# Trim off the prompt
 	generate_ids = generate_ids[inputs['input_ids'].shape[1]:]
 	# Decode the caption
 	caption = processor.tokenizer.decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
 	caption = caption.strip()
 	print(caption)
-##INT4:
-##BF16:
 image_url = "http://images.cocodataset.org/train2017/000000411975.jpg"
-content = "How many people are on the baseball field in the picture?"
-##INT4:
-##BF16:
 image_url = "http://images.cocodataset.org/train2017/000000093025.jpg"
-content = "How many people and animals are there in the image?"
-##INT4:
-##BF16:
 ```

 # Load JoyCaption INT4 Model
 processor = AutoProcessor.from_pretrained(quantized_model_path)
+model = LlavaForConditionalGeneration.from_pretrained(quantized_model_path, device_map=0)
+model.eval()
 image_url = "http://images.cocodataset.org/train2017/000000116003.jpg"
 content = "Write a descriptive caption for this image in a formal tone."
 	assert isinstance(prompt, str)
 	inputs = processor(text=[prompt], images=[image], return_tensors="pt").to(model.device)
 	inputs['pixel_values'] = inputs['pixel_values'].to(model.dtype)
 	# Generate the captions
+	generate_ids = model.generate(
 		**inputs,
 		max_new_tokens=50,
 		do_sample=False,
 		top_k=None,
 		top_p=0.9,
 	)[0]
 	# Trim off the prompt
 	generate_ids = generate_ids[inputs['input_ids'].shape[1]:]
 	# Decode the caption
 	caption = processor.tokenizer.decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
 	caption = caption.strip()
 	print(caption)
+##INT4: This black-and-white photograph captures a moment of triumph on a tennis court. The central figure is a male tennis player, mid-celebration,
+## with his arms raised high in victory. He is wearing a white athletic shirt and shorts, with a
+##BF16: This black-and-white photograph captures a moment of triumph on a tennis court. The central figure is a male tennis player, mid-celebration,
+## with his arms raised high in victory. He is wearing a white tennis shirt and shorts, with a
 image_url = "http://images.cocodataset.org/train2017/000000411975.jpg"
+content = "Write a descriptive caption for this image in a formal tone."
+##INT4: This is a photograph capturing a moment during a baseball game. The image is taken from a high vantage point, likely from the stands,
+## looking down onto the field. The main focus is on a young girl and a man standing on the grassy
+##BF16: This is a photograph capturing a moment during a baseball game. The image is taken from a high angle, looking down onto the field.
+## In the foreground, there is a section of the baseball field with a reddish-brown dirt infield and a well
 image_url = "http://images.cocodataset.org/train2017/000000093025.jpg"
+content = "Write a descriptive caption for this image in a formal tone."
+##INT4: This is a photograph capturing a serene outdoor scene on a rocky mountainous terrain under a clear blue sky with scattered white clouds.
+## The central focus is on a man and a sheep. The man, positioned slightly to the right of the center, is sitting
+##BF16: This photograph captures a serene mountainous landscape under a bright blue sky dotted with fluffy white clouds. In the foreground,
+## a man and a woman are seated on a rocky outcrop. The man, positioned on the left, is wearing a blue jacket and
 ```