---
license: apache-2.0
base_model:
- qihoo360/fg-clip-base
- ds4sd/SmolDocling-256M-preview
---

# VQ-VAE CLIP Vision

Computes a CLIP score on the denoised VQ-VAE latent.

## Inference

```python
import torch
from diffusers import VQModel
from transformers import AutoModelForCausalLM, AutoTokenizer, Idefics3ForConditionalGeneration

# CLIP-style logit scale; replace with the scoring model's learned logit_scale if available.
scale = torch.tensor(100.0).log()


def clip_score(text_feature, image_feature):
    # Cosine similarity between L2-normalized features, scaled and softmaxed over texts.
    image_feature = image_feature / image_feature.norm(p=2, dim=-1, keepdim=True)
    text_feature = text_feature / text_feature.norm(p=2, dim=-1, keepdim=True)
    logits_per_image = image_feature @ text_feature.T
    logits_per_image = scale.exp() * logits_per_image
    probs = logits_per_image.softmax(dim=1)[0]
    return probs.tolist()


def doc(pixel_values):
    # Patch-level hidden states from the SmolDocling vision tower.
    model = Idefics3ForConditionalGeneration.from_pretrained('ds4sd/SmolDocling-256M-preview').model.vision_model
    return model(pixel_values, return_dict=True).last_hidden_state


def encode(pixel_values):
    # Continuous latent features from the Meissonic VQ-VAE encoder (no quantization).
    model = VQModel.from_pretrained('MeissonFlow/Meissonic', subfolder='vqvae')
    y = model.encoder.conv_in(pixel_values)
    for down_block in model.encoder.down_blocks:
        y = down_block(y)
    y = model.encoder.mid_block(y)
    y = model.encoder.conv_norm_out(y)
    return model.encoder.conv_act(y)


image_size = 512

doc_output = doc(pixel_values)    # (1, num_patches, dim)
vq_output = encode(pixel_values)  # (1, dim, h, w)

# Flatten the latent grid into a token sequence: (1, dim, h, w) -> (1, h*w, dim).
b, dim, _, _ = vq_output.shape
vq_output = vq_output.permute(0, 2, 3, 1).contiguous()
vq_output = vq_output.view(b, -1, dim)

# `model` is the fusion model from this repository, applied to the
# concatenated VQ latent tokens and document vision tokens.
output = model(torch.cat([vq_output, doc_output], dim=1))

# Pool the first 1024 tokens, i.e. the VQ latent tokens
# (a 512px input at 16x downsampling gives a 32x32 = 1024 token grid).
pooled = output[:, :1024, :].mean(dim=1)

clip_model = AutoModelForCausalLM.from_pretrained('qihoo360/fg-clip-base', trust_remote_code=True)
clip_tokenizer = AutoTokenizer.from_pretrained('qihoo360/fg-clip-base')
# ...
clip_score(text_feature, pooled)
```
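
The snippet above assumes `pixel_values` is already prepared and leaves the text feature elided (`# ...`). Below is a minimal sketch of one way to build `pixel_values`, under the assumption that both encoders accept the same 512×512 RGB image scaled to [-1, 1]; the file name `example.png` is a placeholder, and the preprocessing should be checked against each base model's own requirements.

```python
# Hypothetical preprocessing sketch; not part of this repository's pipeline.
import torch
from PIL import Image
from torchvision import transforms

preprocess = transforms.Compose([
    transforms.Resize((512, 512)),
    transforms.ToTensor(),                      # [0, 1], shape (3, 512, 512)
    transforms.Normalize(mean=[0.5, 0.5, 0.5],  # scale to [-1, 1]
                         std=[0.5, 0.5, 0.5]),
])

image = Image.open('example.png').convert('RGB')  # placeholder path
pixel_values = preprocess(image).unsqueeze(0)     # (1, 3, 512, 512)
```

`text_feature` should be produced by the fg-clip-base text encoder (see that model's usage example with `clip_tokenizer` and `clip_model`) so that it lies in the same embedding space as `pooled`.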