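"""Batch-evaluate a directory of videos with COVER and write the predicted
semantic/technical/aesthetic/overall scores to a CSV.

Typical invocation (script name and paths are illustrative; the defaults
match parse_args below):

    python evaluate_videos.py -o ./cover.yml -d cuda -i ./demo --output ./demo.csv
"""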
import torch
import argparse
import os
import pickle as pkl
import decord
import numpy as np
import yaml
from tqdm import tqdm
from cover.datasets import (
    UnifiedFrameSampler,
    ViewDecompositionDataset,
    spatial_temporal_view_decomposition,
)
from cover.models import COVER
# Per-channel normalization statistics on the 0-255 scale: the first pair is
# the standard ImageNet mean/std, the second pair is CLIP's preprocessing
# constants (0.4815..., 0.2686..., etc. scaled by 255).
mean, std = (
    torch.FloatTensor([123.675, 116.28, 103.53]),
    torch.FloatTensor([58.395, 57.12, 57.375]),
)
mean_clip, std_clip = (
    torch.FloatTensor([122.77, 116.75, 104.09]),
    torch.FloatTensor([68.50, 66.63, 70.32]),
)
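# A minimal sketch of how such stats are typically applied (hypothetical; in
# this script the dataset pipeline is expected to handle normalization, and
# these constants are not referenced again below):
#   frames = (frames - mean.view(1, -1, 1, 1)) / std.view(1, -1, 1, 1)
# where frames is an (N, C, H, W) float tensor with values in [0, 255].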
def fuse_results(results: list):
    """Map the three per-branch scores to a dict; 'overall' is their plain sum."""
    x = results[0] + results[1] + results[2]
    return {
        "semantic": results[0],
        "technical": results[1],
        "aesthetic": results[2],
        "overall": x,
    }
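# Example with hypothetical branch scores:
#   fuse_results([0.5, 0.25, 0.125])
#   -> {"semantic": 0.5, "technical": 0.25, "aesthetic": 0.125, "overall": 0.875}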
def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("-o", "--opt", type=str, default="./cover.yml",
                        help="path to the option (config) file")
    parser.add_argument("-d", "--device", type=str, default="cuda",
                        help="device to run on, e.g. cuda or cuda:0")
    parser.add_argument("-i", "--input_video_dir", type=str, default="./demo",
                        help="directory containing the input videos")
    parser.add_argument("--output", type=str, default="./demo.csv",
                        help="output CSV file for the predicted MOS values")
    return parser.parse_args()
if __name__ == "__main__":
    args = parse_args()

    with open(args.opt, "r") as f:
        opt = yaml.safe_load(f)
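    # Keys this script reads from the YAML (structure inferred from the usage
    # below; values are illustrative, not the shipped defaults):
    #   model:
    #     args: {...}                        # forwarded to COVER(**...)
    #   test_load_path: ./weights/cover.pth  # checkpoint to load
    #   num_workers: 6                       # DataLoader workers
    #   data:
    #     val-l1080p:
    #       args: {...}                      # forwarded to ViewDecompositionDataset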
    ### Load COVER
    evaluator = COVER(**opt["model"]["args"]).to(args.device)
    state_dict = torch.load(opt["test_load_path"], map_location=args.device)

    # set strict=False here to avoid errors about the missing prompt_learner
    # weights in clip-iqa+ and cross-gate
    evaluator.load_state_dict(state_dict["state_dict"], strict=False)

    all_results = {}

    # Write the CSV header once; per-video rows are appended inside the loop.
    with open(args.output, "w") as w:
        w.write("path,semantic score,technical score,aesthetic score,overall/final score\n")

    # Reuse the val-l1080p pipeline, but point it at the input directory
    # instead of an annotation file.
    dopt = opt["data"]["val-l1080p"]["args"]
    dopt["anno_file"] = None
    dopt["data_prefix"] = args.input_video_dir

    dataset = ViewDecompositionDataset(dopt)
    dataloader = torch.utils.data.DataLoader(
        dataset, batch_size=1, num_workers=opt["num_workers"], pin_memory=True,
    )
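    # Each batch is a dict; on success it carries one 5-D tensor per view plus
    # bookkeeping entries (structure inferred from the loop below):
    #   data["semantic"/"technical"/"aesthetic"]: (1, 3, T, H, W) frames
    #   data["num_clips"][key]: number of clips sampled for that view
    #   data["name"][0]: path of the source video
    # A failed decode yields a dict with a single key and is skipped.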
    sample_types = ["semantic", "technical", "aesthetic"]

    for i, data in enumerate(tqdm(dataloader, desc="Testing")):
        if len(data.keys()) == 1:
            ## failed data
            continue

        video = {}
        for key in sample_types:
            if key in data:
                video[key] = data[key].to(args.device)
                b, c, t, h, w = video[key].shape
                # Split the t sampled frames into num_clips clips and fold the
                # clip axis into the batch axis, so each clip is scored
                # independently.
                video[key] = (
                    video[key]
                    .reshape(
                        b, c, data["num_clips"][key], t // data["num_clips"][key], h, w
                    )
                    .permute(0, 2, 1, 3, 4, 5)
                    .reshape(
                        b * data["num_clips"][key], c, t // data["num_clips"][key], h, w
                    )
                )
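        # Worked example with hypothetical sizes: b=1, c=3, t=32, num_clips=4
        # turns a (1, 3, 32, H, W) tensor into (1, 3, 4, 8, H, W), permutes to
        # (1, 4, 3, 8, H, W), and flattens to (4, 3, 8, H, W): four 8-frame
        # clips scored as separate batch elements.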
        with torch.no_grad():
            # reduce_scores=False keeps the three branch outputs separate;
            # average each branch over its clips.
            results = evaluator(video, reduce_scores=False)
            results = [np.mean(l.cpu().numpy()) for l in results]

        rescaled_results = fuse_results(results)
        # all_results[data["name"][0]] = rescaled_results
        # with open(
        #     f"cover_predictions/val-custom_{args.input_video_dir.split('/')[-1]}.pkl", "wb"
        # ) as wf:
        #     pkl.dump(all_results, wf)
        with open(args.output, "a") as w:
            w.write(
                f'{data["name"][0].split("/")[-1]},'
                f'{rescaled_results["semantic"]:.4f},'
                f'{rescaled_results["technical"]:.4f},'
                f'{rescaled_results["aesthetic"]:.4f},'
                f'{rescaled_results["overall"]:.4f}\n'
            )
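# The resulting CSV looks like this (scores illustrative):
#   path,semantic score,technical score,aesthetic score,overall/final score
#   video_1.mp4,0.1297,0.0235,0.0644,0.2176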