VQ-VAE CLIP Vision

Calculate the CLIP score on the denoised latent.

Inference

import torch
from diffusers import VQModel
from transformers import AutoModelForCausalLM, AutoTokenizer, Idefics3ForConditionalGeneration

def clip_score(text_feature, image_feature, scale):
    # Cosine similarity between the L2-normalised text and image features,
    # scaled by the CLIP logit scale (temperature) and softmaxed over the captions.
    image_feature = image_feature / image_feature.norm(p=2, dim=-1, keepdim=True)
    text_feature = text_feature / text_feature.norm(p=2, dim=-1, keepdim=True)
    logits_per_image = image_feature @ text_feature.T
    logits_per_image = scale.exp() * logits_per_image
    probs = logits_per_image.softmax(dim=1)[0]

    return probs.tolist()
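
# Quick sanity check with random features (hypothetical tensors, not part of the original card).
# The scale below is the value CLIP's temperature is usually initialised with, log(1 / 0.07);
# in real use it should come from the loaded CLIP model.
example_text = torch.randn(2, 512)
example_image = torch.randn(1, 512)
example_scale = torch.tensor(1 / 0.07).log()
print(clip_score(example_text, example_image, example_scale))  # two probabilities summing to 1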

def doc(pixel_values):
    # SmolDocling's Idefics3 vision tower: returns the patch embeddings
    # (last hidden state) for the input image.
    model = Idefics3ForConditionalGeneration.from_pretrained('ds4sd/SmolDocling-256M-preview').model.vision_model

    return model(pixel_values, return_dict=True).last_hidden_state

def encode(pixel_values):
    # Run the Meissonic VQ-VAE encoder up to its final activation,
    # skipping conv_out and quantization so the latent stays continuous.
    model = VQModel.from_pretrained('MeissonFlow/Meissonic', subfolder='vqvae')
    y = model.encoder.conv_in(pixel_values)
    for down_block in model.encoder.down_blocks:
        y = down_block(y)
    y = model.encoder.mid_block(y)
    y = model.encoder.conv_norm_out(y)

    return model.encoder.conv_act(y)
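
# Input preparation (a sketch, not from the original card): the file name and the
# transforms below are assumptions. They presume the VQ-VAE expects square images
# normalised to [-1, 1]; check the Meissonic and SmolDocling preprocessing configs.
from PIL import Image
from torchvision import transforms

preprocess = transforms.Compose([
    transforms.Resize((512, 512)),
    transforms.ToTensor(),
    transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]),  # map [0, 1] to [-1, 1]
])
pixel_values = preprocess(Image.open('sample.png').convert('RGB')).unsqueeze(0)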

image_size = 512
doc_output = doc(pixel_values)    # (b, num_patches, hidden) document-vision tokens
vq_output = encode(pixel_values)  # (b, dim, h, w) continuous VQ latent
b, dim, _, _ = vq_output.shape
# Flatten the latent grid into a token sequence: (b, dim, h, w) -> (b, h * w, dim).
vq_output = vq_output.permute(0, 2, 3, 1).contiguous()
vq_output = vq_output.view(b, -1, dim)
# `model` is presumably the projection transformer published in this repository
# (its loading is not shown here); it consumes the concatenated VQ and document tokens.
output = model(torch.cat([vq_output, doc_output], dim=1))
# Mean-pool the first 1024 tokens, i.e. the VQ branch (a 32x32 latent grid at 512 px),
# into a single image feature.
pooled = output[:, :1024, :].mean(dim=1)

clip_model = AutoModelForCausalLM.from_pretrained('qihoo360/fg-clip-base', trust_remote_code=True)
clip_tokenizer = AutoTokenizer.from_pretrained('qihoo360/fg-clip-base')
# ...
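# One way to fill in the elided step (an assumption, not from the original card): encode
# example captions with fg-clip's text tower. The get_text_features call and the
# walk_short_pos flag follow the fg-clip model card; verify against the remote code.
captions = ['a scanned invoice', 'a handwritten letter']
text_ids = torch.tensor(
    clip_tokenizer(captions, max_length=77, padding='max_length', truncation=True).input_ids,
    dtype=torch.long,
)
text_feature = clip_model.get_text_features(text_ids, walk_short_pos=True)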

# logit_scale is assumed to be exposed by the fg-clip checkpoint, as in standard CLIP models.
probs = clip_score(text_feature, pooled, clip_model.logit_scale)
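The returned list holds one probability per caption; the highest entry marks the caption that best matches the denoised latent.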