import data
import torch
import gradio as gr

from models import imagebind_model
from models.imagebind_model import ModalityType

device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Load the pretrained ImageBind (huge) model and put it in inference mode.
model = imagebind_model.imagebind_huge(pretrained=True)
model.eval()
model.to(device)


def image_text_zeroshot(image, text_list):
    image_paths = [image]
    labels = [label.strip(" ") for label in text_list.strip(" ").split("|")]
    inputs = {
        ModalityType.TEXT: data.load_and_transform_text(labels, device),
        ModalityType.VISION: data.load_and_transform_vision_data(image_paths, device),
    }

    with torch.no_grad():
        embeddings = model(inputs)

    # Softmax over image-text similarity gives one probability per candidate label.
    scores = (
        torch.softmax(
            embeddings[ModalityType.VISION] @ embeddings[ModalityType.TEXT].T, dim=-1
        )
        .squeeze(0)
        .tolist()
    )

    score_dict = {label: score for label, score in zip(labels, scores)}

    return score_dict


def audio_text_zeroshot(audio, text_list):
    audio_paths = [audio]
    labels = [label.strip(" ") for label in text_list.strip(" ").split("|")]
    inputs = {
        ModalityType.TEXT: data.load_and_transform_text(labels, device),
        ModalityType.AUDIO: data.load_and_transform_audio_data(audio_paths, device),
    }

    with torch.no_grad():
        embeddings = model(inputs)

    # Softmax over audio-text similarity gives one probability per candidate label.
    scores = (
        torch.softmax(
            embeddings[ModalityType.AUDIO] @ embeddings[ModalityType.TEXT].T, dim=-1
        )
        .squeeze(0)
        .tolist()
    )

    score_dict = {label: score for label, score in zip(labels, scores)}

    return score_dict


def inference(
    task,
    image=None,
    audio=None,
    text_list=None,
):
    # Dispatch to the zero-shot function for the selected task.
    if task == "image-text":
        result = image_text_zeroshot(image, text_list)
    elif task == "audio-text":
        result = audio_text_zeroshot(audio, text_list)
    else:
        raise NotImplementedError
    return result


def main():
    inputs = [
        gr.inputs.Radio(
            choices=[
                "image-text",
                "audio-text",
            ],
            type="value",
            default="image-text",
            label="Task",
        ),
        gr.inputs.Image(type="filepath", label="Input image"),
        gr.inputs.Audio(type="filepath", label="Input audio"),
        gr.inputs.Textbox(lines=1, label="Candidate texts"),
    ]

    iface = gr.Interface(
        inference,
        inputs,
        "label",
        examples=[
            ["image-text", "assets/dog_image.jpg", None, "A dog|A car|A bird"],
            ["image-text", "assets/car_image.jpg", None, "A dog|A car|A bird"],
            ["audio-text", None, "assets/bird_audio.wav", "A dog|A car|A bird"],
            ["audio-text", None, "assets/dog_audio.wav", "A dog|A car|A bird"],
        ],
        description="""
This is a simple demo of ImageBind for zero-shot cross-modal understanding (now including image classification and audio classification). Please refer to the original paper and repo for more details.
To test your own cases, upload an image or an audio clip and provide the candidate texts separated by "|".
You can duplicate this space and run it privately: