# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import argparse
import os.path as osp
import time
from collections import OrderedDict

import numpy as np
import pandas as pd
import torch
import torchvision.transforms as transforms
import torchvision.transforms._transforms_video as transforms_video
from sklearn.metrics import confusion_matrix

from lavila.data import datasets
from lavila.data.video_transforms import Permute, SpatialCrop, TemporalCrop
from lavila.models import models
from lavila.models.utils import inflate_positional_embeds
from lavila.utils import distributed as dist_utils
from lavila.utils.evaluation import accuracy, get_mean_accuracy
from lavila.utils.evaluation_charades import charades_map
from lavila.utils.evaluation_egomcq import egomcq_accuracy_metrics
from lavila.utils.evaluation_ek100mir import (calculate_k_counts, calculate_IDCG, calculate_mAP, calculate_nDCG)
from lavila.utils.preprocess import generate_label_map, generate_tokenizer


def get_args_parser():
parser = argparse.ArgumentParser(description='LAVILA 0-shot evaluations', add_help=False)
parser.add_argument('--dataset', default='ek100_mir', type=str,
choices=['ek100_cls', 'ek100_mir', 'charades_ego', 'egtea', 'ego4d_mcq'])
parser.add_argument('--root',
default='datasets/EK100/video_ht256px/',
type=str, help='path to dataset root')
parser.add_argument('--metadata-val',
default='datasets/EK100/epic-kitchens-100-annotations/retrieval_annotations/EPIC_100_retrieval_test.csv',
type=str, help='path to metadata file (val set)')
parser.add_argument('--relevancy-path',
default='datasets/EK100/epic-kitchens-100-annotations/retrieval_annotations/relevancy/caption_relevancy_EPIC_100_retrieval_test.pkl',
type=str, help='path to relevancy matrix (val set)')
parser.add_argument('--output-dir', default='./', type=str, help='output dir')
parser.add_argument('--num-crops', default=1, type=int, help='number of crops in transforms')
    parser.add_argument('--num-clips', default=1, type=int, help='number of clips (for untrimmed videos, e.g., Charades)')
parser.add_argument('--clip-length', default=4, type=int, help='clip length')
parser.add_argument('--clip-stride', default=16, type=int, help='clip stride')
parser.add_argument('--sparse-sample', action='store_true', help='switch to sparse sampling')
parser.add_argument('--batch-size', default=16, type=int, help='batch_size')
    parser.add_argument('--cls-use-template', action='store_true', help='use prompt templates in zero-shot classification')
parser.add_argument('--print-freq', default=100, type=int)
parser.add_argument('-j', '--workers', default=10, type=int, metavar='N',
help='number of data loading workers per process')
parser.add_argument('--resume', default='', type=str, help='path to latest checkpoint')
parser.add_argument('--use-half', action='store_true')
return parser


def main(args):
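    """Run zero-shot evaluation.

    Loads a pretrained checkpoint, rebuilds the model from the training-time
    arguments stored in the checkpoint, then dispatches to the evaluation
    protocol matching ``args.dataset`` (classification, retrieval, or MCQ).
    """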
if args.resume:
ckpt_path = args.resume
elif osp.isfile(osp.join(args.output_dir, 'checkpoint_best.pt')):
ckpt_path = osp.join(args.output_dir, 'checkpoint_best.pt')
    else:
        raise FileNotFoundError('no checkpoint found in {}'.format(args.output_dir))
ckpt = torch.load(ckpt_path, map_location='cpu')
# create model
state_dict = OrderedDict()
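    # strip the 'module.' prefix that DistributedDataParallel adds to parameter names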
for k, v in ckpt['state_dict'].items():
state_dict[k.replace('module.', '')] = v
old_args = ckpt['args']
print('=> creating model: {}'.format(old_args.model))
model = getattr(models, old_args.model)(
text_use_cls_token=old_args.use_cls_token,
project_embed_dim=old_args.project_embed_dim,
        gated_xattn=getattr(old_args, 'gated_xattn', False),
        timesformer_gated_xattn=getattr(old_args, 'timesformer_gated_xattn', False),
        timesformer_freeze_space=getattr(old_args, 'timesformer_freeze_space', False),
        freeze_lm_vclm=getattr(old_args, 'freeze_lm_vclm', False),
        freeze_visual_vclm=getattr(old_args, 'freeze_visual_vclm', False),
num_frames=args.clip_length,
drop_path_rate=0,
)
model.cuda()
if 'TIMESFORMER' in old_args.model or 'EGOVLP' in old_args.model:
# inflate weight
print('=> inflating PE in models due to different frame numbers')
state_dict = inflate_positional_embeds(
model.state_dict(), state_dict,
num_frames=args.clip_length,
load_temporal_fix='bilinear',
)
model.load_state_dict(state_dict, strict=True)
print("=> loaded resume checkpoint '{}' (epoch {}, best_metric = {})".format(args.resume, ckpt['epoch'], ckpt['best_acc1']))
torch.backends.cudnn.benchmark = True
if args.dataset in ['ek100_cls', 'charades_ego', 'egtea']:
labels, mapping_vn2act = generate_label_map(args.dataset)
else:
mapping_vn2act = None
tokenizer = generate_tokenizer(old_args.model)
crop_size = 224 if '336PX' not in old_args.model else 336
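    # single-clip/single-crop evaluation uses a plain center crop; otherwise the
    # transform tiles the video into multiple temporal clips and spatial crops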
if args.num_crops == 1 and args.num_clips == 1:
val_transform = transforms.Compose([
Permute([3, 0, 1, 2]), # T H W C -> C T H W
transforms.Resize(crop_size),
transforms.CenterCrop(crop_size),
(transforms_video.NormalizeVideo(mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375]) if ('OPENAI' not in old_args.model) else
transforms_video.NormalizeVideo(mean=[108.3272985, 116.7460125, 104.09373615000001], std=[68.5005327, 66.6321579, 70.32316305])),
])
else:
val_transform = transforms.Compose([
Permute([3, 0, 1, 2]), # T H W C -> C T H W
transforms.Resize(crop_size),
(transforms_video.NormalizeVideo(mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375]) if ('OPENAI' not in old_args.model) else
transforms_video.NormalizeVideo(mean=[108.3272985, 116.7460125, 104.09373615000001], std=[68.5005327, 66.6321579, 70.32316305])),
TemporalCrop(frames_per_clip=args.clip_length, stride=args.clip_length),
SpatialCrop(crop_size=crop_size, num_crops=args.num_crops),
])
val_dataset = datasets.get_downstream_dataset(
val_transform, tokenizer, args, subset='val', label_mapping=mapping_vn2act,
)
val_loader = torch.utils.data.DataLoader(
val_dataset, batch_size=args.batch_size, shuffle=False,
num_workers=args.workers, pin_memory=True, drop_last=False)
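    # '#C' marks the camera wearer in Ego4D-style narrations, so these templates
    # mimic the caption format seen during pretraining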
if args.cls_use_template:
templates = ['#C C {}', '#C {}']
else:
templates = ['{}']
if args.dataset in ['ek100_cls', 'charades_ego', 'egtea']:
preds, targets = validate_zeroshot(val_loader, templates, labels, model, tokenizer)
if args.dataset == 'ek100_cls':
if args.use_half:
preds = preds.float()
top1, top5 = accuracy(preds, targets, topk=(1, 5))
print('top1 = {:.3f}'.format(top1.item()))
print('top5 = {:.3f}'.format(top5.item()))
elif args.dataset == 'charades_ego':
preds, targets = preds.numpy(), targets.numpy()
m_ap, _, _ = charades_map(preds, targets)
print('mAP = {:.3f}'.format(m_ap))
elif args.dataset == 'egtea':
preds, targets = preds.numpy(), targets.numpy()
print(preds.shape, targets.shape)
cm = confusion_matrix(targets, preds.argmax(axis=1))
mean_class_acc, acc = get_mean_accuracy(cm)
print('Mean Acc. = {:.3f}, Top-1 Acc. = {:.3f}'.format(mean_class_acc, acc))
if args.dataset == 'ek100_mir':
val_dataset = datasets.VideoCaptionDatasetCLIP(
'ek100_mir',
args.root,
args.metadata_val,
transform=val_transform, is_training=False,
tokenizer=tokenizer,
clip_length=args.clip_length,
clip_stride=args.clip_stride,
sparse_sample=False
)
val_loader = torch.utils.data.DataLoader(
val_dataset, batch_size=args.batch_size, shuffle=False,
num_workers=args.workers, pin_memory=True, drop_last=False
)
similarity_matrix = get_similarity_matrix(val_loader, model, print_freq=args.print_freq, use_half=args.use_half)
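        # map cosine similarity from [-1, 1] to [0, 1] so it can serve as a
        # non-negative relevance score for the mAP/nDCG metrics below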
similarity_matrix = (similarity_matrix + 1) / 2
video_id = pd.read_csv(args.metadata_val).values[:, 0]
text_id = pd.read_csv(args.metadata_val.replace("test.csv", "test_sentence.csv")).values[:, 0]
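        # columns of the similarity matrix follow the video csv order; remap them
        # to the sentence csv order expected by the relevancy matrix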
indexes = [video_id.tolist().index(elem) for elem in text_id]
similarity_matrix = similarity_matrix[:, indexes]
print(similarity_matrix.shape)
rel_matrix = pd.read_pickle(args.relevancy_path)
vis_map = calculate_mAP(similarity_matrix, rel_matrix)
txt_map = calculate_mAP(similarity_matrix.T, rel_matrix.T)
print('mAP: V->T: {:.3f} T->V: {:.3f} AVG: {:.3f}'.format(vis_map, txt_map, (vis_map + txt_map) / 2))
vis_k_counts = calculate_k_counts(rel_matrix)
txt_k_counts = calculate_k_counts(rel_matrix.T)
vis_IDCG = calculate_IDCG(rel_matrix, vis_k_counts)
txt_IDCG = calculate_IDCG(rel_matrix.T, txt_k_counts)
vis_nDCG = calculate_nDCG(similarity_matrix, rel_matrix, k_counts=vis_k_counts, IDCG=vis_IDCG)
txt_nDCG = calculate_nDCG(similarity_matrix.T, rel_matrix.T, k_counts=txt_k_counts, IDCG=txt_IDCG)
print('nDCG: V->T: {:.3f} T->V: {:.3f} AVG: {:.3f}'.format(vis_nDCG, txt_nDCG, (vis_nDCG + txt_nDCG) / 2))
if args.dataset == 'ego4d_mcq':
val_dataset = datasets.VideoCaptionDatasetMCQ(
args.dataset,
args.root,
args.metadata_val,
transform=val_transform, is_training=False,
tokenizer=tokenizer,
clip_length=args.clip_length,
clip_stride=args.clip_stride,
sparse_sample=False,
)
val_loader = torch.utils.data.DataLoader(
val_dataset, batch_size=args.batch_size, shuffle=False,
num_workers=args.workers, pin_memory=True, drop_last=False
)
validate_mcq(val_loader, model, use_half=args.use_half)


def validate_zeroshot(val_loader, templates, labels, model, tokenizer):
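    """Zero-shot classification: rank prompt-ensembled label embeddings by
    cosine similarity to each video embedding.

    Note: reads ``args.use_half`` and ``args.print_freq`` from the module-level
    ``args`` set in ``__main__``.
    """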
model.eval()
if args.use_half:
model = model.half()
all_outputs = []
all_targets = []
all_vis_features = []
print('=> encoding captions')
with torch.no_grad():
text_features = []
for label in labels:
if isinstance(label, list):
texts = [tmpl.format(lbl) for tmpl in templates for lbl in label]
else:
texts = [tmpl.format(label) for tmpl in templates]
texts = tokenizer(texts)
if isinstance(texts, tuple):
# Bert-style tokenizer will output both ids and mask
texts, masks = texts
texts = texts.cuda(non_blocking=True)
masks = masks.cuda(non_blocking=True)
else:
texts = texts.cuda(non_blocking=True)
masks = None
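            # 77 is the CLIP-style tokenizer's fixed context length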
texts = texts.view(-1, 77).contiguous()
masks = masks.view(-1, 77).contiguous() if masks is not None else None
if masks is not None:
class_embeddings = dist_utils.get_model(model).encode_text(texts, attention_mask=masks)
else:
class_embeddings = dist_utils.get_model(model).encode_text(texts)
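            # prompt ensembling: L2-normalize each template embedding, average
            # across templates, then renormalize the mean embedding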
class_embeddings = class_embeddings / class_embeddings.norm(dim=-1, keepdim=True)
class_embeddings = class_embeddings.mean(dim=0)
class_embeddings = class_embeddings / class_embeddings.norm(dim=-1, keepdim=True)
text_features.append(class_embeddings)
text_features = torch.stack(text_features, dim=0)
print('=> start forwarding')
end_time = time.time()
for i, (images, target) in enumerate(val_loader):
if i % args.print_freq == 0:
print('finish batch {}/{} in {} sec'.format(i, len(val_loader), time.time() - end_time))
end_time = time.time()
if isinstance(images, torch.Tensor):
images = images.cuda(non_blocking=True)
if args.use_half:
images = images.half()
target = target.cuda(non_blocking=True)
# encode images
image_features = dist_utils.get_model(model).encode_image(images)
image_features = image_features / image_features.norm(dim=-1, keepdim=True)
all_vis_features.append(image_features)
# cosine similarity as logits
logits_per_image = image_features @ text_features.t()
# logits_per_image = torch.softmax(logits_per_image, dim=1)
else:
target = target.cuda(non_blocking=True)
images_list = images
logits_all_clips = []
for images in images_list:
images = images.cuda(non_blocking=True)
if args.use_half:
images = images.half()
image_features = dist_utils.get_model(model).encode_image(images)
image_features = image_features / image_features.norm(dim=-1, keepdim=True)
logits_per_image = image_features @ text_features.t()
logits_all_clips.append(logits_per_image)
logits_all_clips = torch.stack(logits_all_clips, dim=0)
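                # aggregate over clips/crops by taking the max logit per class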
logits_per_image = logits_all_clips.max(0).values
# logits_per_image = logits_all_clips.mean(0)
logits_per_image = torch.softmax(logits_per_image, dim=1)
all_outputs.append(logits_per_image.cpu())
all_targets.append(target.cpu())
return torch.cat(all_outputs), torch.cat(all_targets)


def get_similarity_matrix(val_loader, model, print_freq=100, use_half=False):
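    """Encode all videos and captions and return the video-by-text cosine
    similarity matrix (as a NumPy array) for retrieval evaluation."""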
model.eval()
if use_half:
model = model.half()
all_text_embed = []
all_video_embed = []
with torch.no_grad():
print('=> encoding visual and textual')
for i, inputs in enumerate(val_loader):
if i % print_freq == 0:
print('finish batch {}/{}'.format(i, len(val_loader)))
frames = inputs[0].cuda(non_blocking=True)
if use_half:
frames = frames.half()
texts = inputs[1].cuda(non_blocking=True)
if len(inputs) == 4:
masks = inputs[2].cuda(non_blocking=True)
else:
masks = None
# encode images
image_features = dist_utils.get_model(model).encode_image(frames)
image_features = image_features / image_features.norm(dim=-1, keepdim=True)
all_video_embed.append(image_features.cpu().numpy())
if texts.ndim == 3:
is_multiple_narrations = True
texts = texts.view(-1, texts.shape[-1])
else:
is_multiple_narrations = False
if masks is not None:
text_features = dist_utils.get_model(model).encode_text(texts, attention_mask=masks)
else:
text_features = dist_utils.get_model(model).encode_text(texts)
text_features = text_features / text_features.norm(dim=-1, keepdim=True)
all_text_embed.append(text_features.cpu().numpy())
all_text_embed = np.vstack(all_text_embed)
all_video_embed = np.vstack(all_video_embed)
similarity_matrix = np.matmul(all_video_embed, all_text_embed.T)
if is_multiple_narrations:
similarity_matrix = similarity_matrix.reshape(all_video_embed.shape[0], all_video_embed.shape[0], -1)
return similarity_matrix


def validate_mcq(val_loader, model, use_half=False):
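    """EgoMCQ evaluation: score each candidate clip against its text query and
    report accuracy (per question type) via ``egomcq_accuracy_metrics``.

    Note: reads ``args.print_freq`` from the module-level ``args``.
    """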
model.eval()
if use_half:
model.half()
with torch.no_grad():
print('=> start forwarding')
all_preds = []
all_gts = []
all_types = []
end_time = time.time()
for i, inputs in enumerate(val_loader):
if i % args.print_freq == 0:
print('finish batch {}/{} in {} sec'.format(i, len(val_loader), time.time() - end_time))
end_time = time.time()
texts_query = inputs[0].cuda(non_blocking=True)
frames_options = inputs[1].cuda(non_blocking=True)
if use_half:
frames_options = frames_options.half()
answer = inputs[3]
q_type = inputs[4]
if len(inputs) == 7:
masks_query = inputs[5].cuda(non_blocking=True)
else:
masks_query = None
batch_size = frames_options.shape[0]
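            # fold the candidate clips of each question into the batch dimension for encoding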
frames_options = frames_options.view(-1, *frames_options.shape[2:])
image_features = dist_utils.get_model(model).encode_image(frames_options)
image_features = image_features.view(batch_size, -1, *image_features.shape[1:])
if masks_query is not None:
query_features = dist_utils.get_model(model).encode_text(texts_query, attention_mask=masks_query)
else:
query_features = dist_utils.get_model(model).encode_text(texts_query)
all_gts.append(answer)
all_types.append(q_type)
for j in range(batch_size):
similarity_matrix = torch.matmul(query_features[j], image_features[j].T)
                similarity_matrix = similarity_matrix.detach().cpu()
all_preds.append(similarity_matrix)
all_preds = torch.stack(all_preds)
all_gts = torch.cat(all_gts)
all_types = torch.cat(all_types)
metrics = egomcq_accuracy_metrics(all_preds, all_gts, all_types)
print(metrics)


if __name__ == '__main__':
parser = argparse.ArgumentParser('lavila 0-shot evaluations', parents=[get_args_parser()])
args = parser.parse_args()
main(args)