Spaces: Running on Zero

added basics
- nets/alltracker.py +588 -0
- nets/blocks.py +1304 -0
- utils/basic.py +144 -0
- utils/data.py +96 -0
- utils/improc.py +1103 -0
- utils/loss.py +220 -0
- utils/misc.py +100 -0
- utils/py.py +755 -0
- utils/samp.py +213 -0
- utils/saveload.py +65 -0
nets/alltracker.py
ADDED
@@ -0,0 +1,588 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import utils.misc
import numpy as np

from nets.blocks import CNBlockConfig, ConvNeXt, conv1x1, RelUpdateBlock, InputPadder, CorrBlock, BasicEncoder

class Net(nn.Module):
    def __init__(
            self,
            seqlen,
            use_attn=True,
            use_mixer=False,
            use_conv=False,
            use_convb=False,
            use_basicencoder=False,
            use_sinmotion=False,
            use_relmotion=False,
            use_sinrelmotion=False,
            use_feats8=False,
            no_time=False,
            no_space=False,
            no_split=False,
            no_ctx=False,
            full_split=False,
            corr_levels=5,
            corr_radius=4,
            num_blocks=3,
            dim=128,
            hdim=128,
            init_weights=True,
    ):
        super(Net, self).__init__()

        self.dim = dim
        self.hdim = hdim

        self.no_time = no_time
        self.no_space = no_space
        self.seqlen = seqlen
        self.corr_levels = corr_levels
        self.corr_radius = corr_radius
        self.corr_channel = self.corr_levels * (self.corr_radius * 2 + 1) ** 2
        self.num_blocks = num_blocks

        self.use_feats8 = use_feats8
        self.use_basicencoder = use_basicencoder
        self.use_sinmotion = use_sinmotion
        self.use_relmotion = use_relmotion
        self.use_sinrelmotion = use_sinrelmotion
        self.no_split = no_split
        self.no_ctx = no_ctx
        self.full_split = full_split

        if use_basicencoder:
            if self.full_split:
                self.fnet = BasicEncoder(input_dim=3, output_dim=self.dim, stride=8)
                self.cnet = BasicEncoder(input_dim=3, output_dim=self.dim, stride=8)
            else:
                if self.no_split:
                    self.fnet = BasicEncoder(input_dim=3, output_dim=self.dim, stride=8)
                else:
                    self.fnet = BasicEncoder(input_dim=3, output_dim=self.dim*2, stride=8)
        else:
            block_setting = [
                CNBlockConfig(96, 192, 3, True), # 4x
                CNBlockConfig(192, 384, 3, False), # 8x
                CNBlockConfig(384, None, 9, False), # 8x
            ]
            self.cnn = ConvNeXt(block_setting, stochastic_depth_prob=0.0, init_weights=init_weights)
            if self.no_split:
                self.dot_conv = conv1x1(384, dim)
            else:
                self.dot_conv = conv1x1(384, dim*2)

        self.upsample_weight = nn.Sequential(
            # convex combination of 3x3 patches
            nn.Conv2d(dim, dim * 2, 3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(dim * 2, 64 * 9, 1, padding=0)
        )
        self.flow_head = nn.Sequential(
            nn.Conv2d(dim, 2*dim, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(2*dim, 2, kernel_size=3, padding=1)
        )
        self.visconf_head = nn.Sequential(
            nn.Conv2d(dim, 2*dim, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(2*dim, 2, kernel_size=3, padding=1)
        )

        if self.use_sinrelmotion:
            self.pdim = 84 # 42*2
        elif self.use_relmotion:
            self.pdim = 4
        elif self.use_sinmotion:
            self.pdim = 42
        else:
            self.pdim = 2

        self.update_block = RelUpdateBlock(self.corr_channel, self.num_blocks, cdim=dim, hdim=hdim, pdim=self.pdim,
                                           use_attn=use_attn, use_mixer=use_mixer, use_conv=use_conv, use_convb=use_convb,
                                           use_layer_scale=True, no_time=no_time, no_space=no_space,
                                           no_ctx=no_ctx)

        time_line = torch.linspace(0, seqlen-1, seqlen).reshape(1, seqlen, 1)
        self.register_buffer("time_emb", utils.misc.get_1d_sincos_pos_embed_from_grid(self.dim, time_line[0])) # 1,S,C
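    # Note (added for clarity; not part of the original commit): corr_channel above is
    # corr_levels * (2*corr_radius+1)**2, so the defaults (5 levels, radius 4) give
    # 5 * 9**2 = 405 correlation features per pixel. pdim is the per-pixel motion-input
    # width: 2 for raw flow, 4 for forward+backward relative coords, and 42/84 after
    # sinusoidal encoding, consistent with posenc(min_deg=0, max_deg=10) producing
    # 21 values per scalar (identity plus sin/cos at 10 frequencies).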
    def fetch_time_embed(self, t, dtype, is_training=False):
        S = self.time_emb.shape[1]
        if t == S:
            return self.time_emb.to(dtype)
        elif t == 1:
            if is_training:
                ind = np.random.choice(S)
                return self.time_emb[:, ind:ind+1].to(dtype)
            else:
                return self.time_emb[:, 1:2].to(dtype)
        else:
            time_emb = self.time_emb.float()
            time_emb = F.interpolate(time_emb.permute(0, 2, 1), size=t, mode="linear").permute(0, 2, 1)
            return time_emb.to(dtype)

    def coords_grid(self, batch, ht, wd, device, dtype):
        coords = torch.meshgrid(torch.arange(ht, device=device, dtype=dtype), torch.arange(wd, device=device, dtype=dtype), indexing='ij')
        coords = torch.stack(coords[::-1], dim=0)
        return coords[None].repeat(batch, 1, 1, 1)

    def initialize_flow(self, img):
        """ Flow is represented as difference between two coordinate grids flow = coords2 - coords1 """
        N, C, H, W = img.shape
        coords1 = self.coords_grid(N, H//8, W//8, device=img.device, dtype=img.dtype)
        coords2 = self.coords_grid(N, H//8, W//8, device=img.device, dtype=img.dtype)
        return coords1, coords2

    def upsample_data(self, flow, mask):
        """ Upsample [H/8, W/8, C] -> [H, W, C] using convex combination """
        N, C, H, W = flow.shape
        mask = mask.view(N, 1, 9, 8, 8, H, W)
        mask = torch.softmax(mask, dim=2)

        up_flow = F.unfold(8 * flow, [3, 3], padding=1)
        up_flow = up_flow.view(N, 2, 9, 1, 1, H, W)

        up_flow = torch.sum(mask * up_flow, dim=2)
        up_flow = up_flow.permute(0, 1, 4, 2, 5, 3)

        return up_flow.reshape(N, 2, 8*H, 8*W).to(flow.dtype)
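    # Shape walk-through for upsample_data (added for clarity; not part of the original
    # commit): mask holds 9 logits per coarse cell and per 8x8 sub-pixel position
    # (N,1,9,8,8,H,W), softmaxed over the 9 neighbors; F.unfold gathers the 3x3
    # neighborhood of coarse flow values (scaled by 8, since one coarse cell spans
    # eight fine pixels), the weighted sum blends them, and the final permute/reshape
    # scatters each 8x8 block into the (8H, 8W) output.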
    def get_T_padded_images(self, images, T, S, is_training, stride=None, pad=True):
        B,T,C,H,W = images.shape
        indices = None
        if T > 2:
            step = S // 2 if stride is None else stride
            indices = []
            start = 0
            while start + S < T:
                indices.append(start)
                start += step
            indices.append(start)
            Tpad = indices[-1]+S-T
            if pad:
                if is_training:
                    assert Tpad == 0
                else:
                    images = images.reshape(B,1,T,C*H*W)
                    if Tpad > 0:
                        padding_tensor = images[:,:,-1:,:].expand(B,1,Tpad,C*H*W)
                        images = torch.cat([images, padding_tensor], dim=2)
                    images = images.reshape(B,T+Tpad,C,H,W)
                    T = T+Tpad
        else:
            assert T == 2
        return images, T, indices
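    # Worked example for get_T_padded_images (added for clarity; not part of the
    # original commit): with T=20, S=8 and stride=None (step=4), the loop yields
    # indices=[0, 4, 8, 12] and Tpad = 12+8-20 = 0. With T=18, indices stay
    # [0, 4, 8, 12] but Tpad = 2, so at test time the last frame is repeated twice
    # and every window covers exactly S frames.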
    def get_fmaps(self, images_, B, T, sw, is_training):
        _, _, H_pad, W_pad = images_.shape # revised HW

        C, H8, W8 = self.dim*2, H_pad//8, W_pad//8
        if self.no_split:
            C = self.dim

        fmaps_chunk_size = 32
        if (not is_training) and (T > fmaps_chunk_size):
            images = images_.reshape(B,T,3,H_pad,W_pad)
            fmaps = []
            for t in range(0, T, fmaps_chunk_size):
                images_chunk = images[:, t : t + fmaps_chunk_size]
                images_chunk = images_chunk.cuda()
                if self.use_basicencoder:
                    if self.full_split:
                        fmaps_chunk1 = self.fnet(images_chunk.reshape(-1, 3, H_pad, W_pad))
                        fmaps_chunk2 = self.cnet(images_chunk.reshape(-1, 3, H_pad, W_pad))
                        fmaps_chunk = torch.cat([fmaps_chunk1, fmaps_chunk2], axis=1)
                    else:
                        fmaps_chunk = self.fnet(images_chunk.reshape(-1, 3, H_pad, W_pad))
                else:
                    fmaps_chunk = self.cnn(images_chunk.reshape(-1, 3, H_pad, W_pad))
                if t==0 and sw is not None and sw.save_this:
                    sw.summ_feat('1_model/fmap_raw', fmaps_chunk[0:1])
                fmaps_chunk = self.dot_conv(fmaps_chunk) # B*T,C,H8,W8
                T_chunk = images_chunk.shape[1]
                fmaps.append(fmaps_chunk.reshape(B, -1, C, H8, W8))
            fmaps_ = torch.cat(fmaps, dim=1).reshape(-1, C, H8, W8)
        else:
            if not is_training:
                # sometimes we need to move things to cuda here
                images_ = images_.cuda()
            if self.use_basicencoder:
                if self.full_split:
                    fmaps1_ = self.fnet(images_)
                    fmaps2_ = self.cnet(images_)
                    fmaps_ = torch.cat([fmaps1_, fmaps2_], axis=1)
                else:
                    fmaps_ = self.fnet(images_)
            else:
                fmaps_ = self.cnn(images_)
            if sw is not None and sw.save_this:
                sw.summ_feat('1_model/fmap_raw', fmaps_[0:1])
            fmaps_ = self.dot_conv(fmaps_) # B*T,C,H8,W8
        return fmaps_
    def forward(self, images, iters=4, sw=None, is_training=False, stride=None):
        B,T,C,H,W = images.shape
        S = self.seqlen
        device = images.device
        dtype = images.dtype

        print('images', images.shape)

        # images are in [0,255]
        mean = torch.as_tensor([0.485, 0.456, 0.406], device=device).reshape(1,1,3,1,1).to(images.dtype)
        std = torch.as_tensor([0.229, 0.224, 0.225], device=device).reshape(1,1,3,1,1).to(images.dtype)
        images = images / 255.0
        images = (images - mean)/std
        print("a0 torch.cuda.memory_allocated: %.1fGB"%(torch.cuda.memory_allocated(0)/1024/1024/1024))

        T_bak = T
        if stride is not None:
            pad = False
        else:
            pad = True
        images, T, indices = self.get_T_padded_images(images, T, S, is_training, stride=stride, pad=pad)

        images = images.contiguous()
        images_ = images.reshape(B*T,3,H,W)
        padder = InputPadder(images_.shape)
        images_ = padder.pad(images_)[0]

        print("a1 torch.cuda.memory_allocated: %.1fGB"%(torch.cuda.memory_allocated(0)/1024/1024/1024))

        _, _, H_pad, W_pad = images_.shape # revised HW
        C, H8, W8 = self.dim*2, H_pad//8, W_pad//8
        C2 = C//2
        if self.no_split:
            C = self.dim
            C2 = C

        fmaps = self.get_fmaps(images_, B, T, sw, is_training).reshape(B,T,C,H8,W8)
        device = fmaps.device
        print("a2 torch.cuda.memory_allocated: %.1fGB"%(torch.cuda.memory_allocated(0)/1024/1024/1024))

        fmap_anchor = fmaps[:,0]

        if T<=2 or is_training:
            # note: collecting preds can get expensive on a long video
            all_flow_preds = []
            all_visconf_preds = []
        else:
            all_flow_preds = None
            all_visconf_preds = None

        if T > 2: # multiframe tracking

            # we will store our final outputs in these tensors
            full_flows = torch.zeros((B,T,2,H,W), dtype=dtype, device=device)
            full_visconfs = torch.zeros((B,T,2,H,W), dtype=dtype, device=device)
            # 1/8 resolution
            full_flows8 = torch.zeros((B,T,2,H_pad//8,W_pad//8), dtype=dtype, device=device)
            full_visconfs8 = torch.zeros((B,T,2,H_pad//8,W_pad//8), dtype=dtype, device=device)

            if self.use_feats8:
                full_feats8 = torch.zeros((B,T,C2,H_pad//8,W_pad//8), dtype=dtype, device=device)
            visits = np.zeros((T))
            print("a3 torch.cuda.memory_allocated: %.1fGB"%(torch.cuda.memory_allocated(0)/1024/1024/1024))

            for ii, ind in enumerate(indices):
                ara = np.arange(ind,ind+S)
                print('ara', ara)
                if ii < len(indices)-1:
                    next_ind = indices[ii+1]
                    next_ara = np.arange(next_ind,next_ind+S)

                # print("torch.cuda.memory_allocated: %.1fGB"%(torch.cuda.memory_allocated(0)/1024/1024/1024), 'ara', ara)
                fmaps2 = fmaps[:,ara]
                flows8 = full_flows8[:,ara].reshape(B*(S),2,H_pad//8,W_pad//8).detach()
                visconfs8 = full_visconfs8[:,ara].reshape(B*(S),2,H_pad//8,W_pad//8).detach()

                if self.use_feats8:
                    if ind==0:
                        feats8 = None
                    else:
                        feats8 = full_feats8[:,ara].reshape(B*(S),C2,H_pad//8,W_pad//8).detach()
                else:
                    feats8 = None
                print("a4 torch.cuda.memory_allocated: %.1fGB"%(torch.cuda.memory_allocated(0)/1024/1024/1024))

                flow_predictions, visconf_predictions, flows8, visconfs8, feats8 = self.forward_window(
                    fmap_anchor, fmaps2, visconfs8, iters=iters, flowfeat=feats8, flows8=flows8,
                    is_training=is_training)
                print("a5 torch.cuda.memory_allocated: %.1fGB"%(torch.cuda.memory_allocated(0)/1024/1024/1024))

                unpad_flow_predictions = []
                unpad_visconf_predictions = []
                for i in range(len(flow_predictions)):
                    flow_predictions[i] = padder.unpad(flow_predictions[i])
                    unpad_flow_predictions.append(flow_predictions[i].reshape(B,S,2,H,W))
                    visconf_predictions[i] = padder.unpad(torch.sigmoid(visconf_predictions[i]))
                    unpad_visconf_predictions.append(visconf_predictions[i].reshape(B,S,2,H,W))
                print("a6 torch.cuda.memory_allocated: %.1fGB"%(torch.cuda.memory_allocated(0)/1024/1024/1024))

                full_flows[:,ara] = unpad_flow_predictions[-1].reshape(B,S,2,H,W)
                full_flows8[:,ara] = flows8.reshape(B,S,2,H_pad//8,W_pad//8)
                full_visconfs[:,ara] = unpad_visconf_predictions[-1].reshape(B,S,2,H,W)
                full_visconfs8[:,ara] = visconfs8.reshape(B,S,2,H_pad//8,W_pad//8)
                if self.use_feats8:
                    full_feats8[:,ara] = feats8.reshape(B,S,C2,H_pad//8,W_pad//8)
                visits[ara] += 1
                print("a7 torch.cuda.memory_allocated: %.1fGB"%(torch.cuda.memory_allocated(0)/1024/1024/1024))

                if is_training:
                    all_flow_preds.append(unpad_flow_predictions)
                    all_visconf_preds.append(unpad_visconf_predictions)
                else:
                    del unpad_flow_predictions
                    del unpad_visconf_predictions

                # for the next iter, replace empty data with nearest available preds
                invalid_idx = np.where(visits==0)[0]
                valid_idx = np.where(visits>0)[0]
                for idx in invalid_idx:
                    nearest = valid_idx[np.argmin(np.abs(valid_idx - idx))]
                    # print('replacing %d with %d' % (idx, nearest))
                    full_flows8[:,idx] = full_flows8[:,nearest]
                    full_visconfs8[:,idx] = full_visconfs8[:,nearest]
                    if self.use_feats8:
                        full_feats8[:,idx] = full_feats8[:,nearest]
                print("a8 torch.cuda.memory_allocated: %.1fGB"%(torch.cuda.memory_allocated(0)/1024/1024/1024))
        else: # flow

            flows8 = torch.zeros((B,2,H_pad//8,W_pad//8), dtype=dtype, device=device)
            visconfs8 = torch.zeros((B,2,H_pad//8,W_pad//8), dtype=dtype, device=device)

            flow_predictions, visconf_predictions, flows8, visconfs8, feats8 = self.forward_window(
                fmap_anchor, fmaps[:,1:2], visconfs8, iters=iters, flowfeat=None, flows8=flows8,
                is_training=is_training)
            unpad_flow_predictions = []
            unpad_visconf_predictions = []
            for i in range(len(flow_predictions)):
                flow_predictions[i] = padder.unpad(flow_predictions[i])
                all_flow_preds.append(flow_predictions[i].reshape(B,2,H,W))
                visconf_predictions[i] = padder.unpad(torch.sigmoid(visconf_predictions[i]))
                all_visconf_preds.append(visconf_predictions[i].reshape(B,2,H,W))
            full_flows = all_flow_preds[-1].reshape(B,2,H,W)
            full_visconfs = all_visconf_preds[-1].reshape(B,2,H,W)

        if (not is_training) and (T > 2):
            full_flows = full_flows[:,:T_bak]
            full_visconfs = full_visconfs[:,:T_bak]
        print("a9 torch.cuda.memory_allocated: %.1fGB"%(torch.cuda.memory_allocated(0)/1024/1024/1024))

        return full_flows, full_visconfs, all_flow_preds, all_visconf_preds
    def forward_sliding(self, images, iters=4, sw=None, is_training=False, window_len=None, stride=None):
        B,T,C,H,W = images.shape
        S = self.seqlen if window_len is None else window_len
        device = images.device
        dtype = images.dtype
        stride = S // 2 if stride is None else stride

        T_bak = T
        images, T, indices = self.get_T_padded_images(images, T, S, is_training, stride)
        assert stride <= S // 2

        images = images.contiguous()
        images_ = images.reshape(B*T,3,H,W)
        padder = InputPadder(images_.shape)
        images_ = padder.pad(images_)[0]

        _, _, H_pad, W_pad = images_.shape # revised HW
        C, H8, W8 = self.dim*2, H_pad//8, W_pad//8
        C2 = C//2
        if self.no_split:
            C = self.dim
            C2 = C

        all_flow_preds = None
        all_visconf_preds = None

        if T<=2:
            # note: collecting preds can get expensive on a long video
            all_flow_preds = []
            all_visconf_preds = []

            fmaps = self.get_fmaps(images_, B, T, sw, is_training).reshape(B,T,C,H8,W8)
            device = fmaps.device

            flows8 = torch.zeros((B,2,H_pad//8,W_pad//8), dtype=dtype, device=device)
            visconfs8 = torch.zeros((B,2,H_pad//8,W_pad//8), dtype=dtype, device=device)

            fmap_anchor = fmaps[:,0]

            flow_predictions, visconf_predictions, flows8, visconfs8, feats8 = self.forward_window(
                fmap_anchor, fmaps[:,1:2], visconfs8, iters=iters, flowfeat=None, flows8=flows8,
                is_training=is_training)
            unpad_flow_predictions = []
            unpad_visconf_predictions = []
            for i in range(len(flow_predictions)):
                flow_predictions[i] = padder.unpad(flow_predictions[i])
                all_flow_preds.append(flow_predictions[i].reshape(B,2,H,W))
                visconf_predictions[i] = padder.unpad(torch.sigmoid(visconf_predictions[i]))
                all_visconf_preds.append(visconf_predictions[i].reshape(B,2,H,W))
            full_flows = all_flow_preds[-1].reshape(B,2,H,W).detach().cpu()
            full_visconfs = all_visconf_preds[-1].reshape(B,2,H,W).detach().cpu()

            return full_flows, full_visconfs, all_flow_preds, all_visconf_preds

        assert T > 2 # multiframe tracking

        if is_training:
            all_flow_preds = []
            all_visconf_preds = []

        # we will store our final outputs in these cpu tensors
        full_flows = torch.zeros((B,T,2,H,W), dtype=dtype, device='cpu')
        full_visconfs = torch.zeros((B,T,2,H,W), dtype=dtype, device='cpu')

        images_ = images_.reshape(B,T,3,H_pad,W_pad)
        fmap_anchor = self.get_fmaps(images_[:,:1].reshape(-1,3,H_pad,W_pad), B, 1, sw, is_training).reshape(B,C,H8,W8)
        device = fmap_anchor.device
        full_visited = torch.zeros((T,), dtype=torch.bool, device=device)

        for ii, ind in enumerate(indices):
            ara = np.arange(ind,ind+S)
            if ii == 0:
                flows8 = torch.zeros((B,S,2,H_pad//8,W_pad//8), dtype=dtype, device=device)
                visconfs8 = torch.zeros((B,S,2,H_pad//8,W_pad//8), dtype=dtype, device=device)
                fmaps2 = self.get_fmaps(images_[:,ara].reshape(-1,3,H_pad,W_pad), B, S, sw, is_training).reshape(B,S,C,H8,W8)
            else:
                flows8 = torch.cat([flows8[:,stride:stride+S//2], flows8[:,stride+S//2-1:stride+S//2].repeat(1,S//2,1,1,1)], dim=1)
                visconfs8 = torch.cat([visconfs8[:,stride:stride+S//2], visconfs8[:,stride+S//2-1:stride+S//2].repeat(1,S//2,1,1,1)], dim=1)
                fmaps2 = torch.cat([fmaps2[:,stride:stride+S//2],
                                    self.get_fmaps(images_[:,np.arange(ind+S//2,ind+S)].reshape(-1,3,H_pad,W_pad), B, S//2, sw, is_training).reshape(B,S//2,C,H8,W8)], dim=1)

            flows8 = flows8.reshape(B*S,2,H_pad//8,W_pad//8).detach()
            visconfs8 = visconfs8.reshape(B*S,2,H_pad//8,W_pad//8).detach()

            flow_predictions, visconf_predictions, flows8, visconfs8, _ = self.forward_window(
                fmap_anchor, fmaps2, visconfs8, iters=iters, flowfeat=None, flows8=flows8,
                is_training=is_training)

            unpad_flow_predictions = []
            unpad_visconf_predictions = []
            for i in range(len(flow_predictions)):
                flow_predictions[i] = padder.unpad(flow_predictions[i])
                unpad_flow_predictions.append(flow_predictions[i].reshape(B,S,2,H,W))
                visconf_predictions[i] = padder.unpad(torch.sigmoid(visconf_predictions[i]))
                unpad_visconf_predictions.append(visconf_predictions[i].reshape(B,S,2,H,W))

            current_visiting = torch.zeros((T,), dtype=torch.bool, device=device)
            current_visiting[ara] = True

            to_fill = current_visiting & (~full_visited)
            to_fill_sum = to_fill.sum().item()
            full_flows[:,to_fill] = unpad_flow_predictions[-1].reshape(B,S,2,H,W)[:,-to_fill_sum:].detach().cpu()
            full_visconfs[:,to_fill] = unpad_visconf_predictions[-1].reshape(B,S,2,H,W)[:,-to_fill_sum:].detach().cpu()
            full_visited |= current_visiting

            if is_training:
                all_flow_preds.append(unpad_flow_predictions)
                all_visconf_preds.append(unpad_visconf_predictions)
            else:
                del unpad_flow_predictions
                del unpad_visconf_predictions

            flows8 = flows8.reshape(B,S,2,H_pad//8,W_pad//8)
            visconfs8 = visconfs8.reshape(B,S,2,H_pad//8,W_pad//8)

        if not is_training:
            full_flows = full_flows[:,:T_bak]
            full_visconfs = full_visconfs[:,:T_bak]

        return full_flows, full_visconfs, all_flow_preds, all_visconf_preds
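    # Note on forward_sliding (added for clarity; not part of the original commit):
    # after the first window, flows8/visconfs8/fmaps2 are shifted by stride and only
    # the last S//2 slots are refilled, so features are recomputed just for the new
    # frames; outputs accumulate in CPU tensors, keeping GPU memory roughly constant in T.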
    def forward_window(self, fmap1_single, fmaps2, visconfs8, iters=None, flowfeat=None, flows8=None, sw=None, is_training=False):
        B,S,C,H8,W8 = fmaps2.shape
        device = fmaps2.device
        dtype = fmaps2.dtype

        flow_predictions = []
        visconf_predictions = []

        fmap1 = fmap1_single.unsqueeze(1).repeat(1,S,1,1,1) # B,S,C,H,W
        fmap1 = fmap1.reshape(B*(S),C,H8,W8).contiguous()

        fmap2 = fmaps2.reshape(B*(S),C,H8,W8).contiguous()

        visconfs8 = visconfs8.reshape(B*(S),2,H8,W8).contiguous()

        corr_fn = CorrBlock(fmap1, fmap2, self.corr_levels, self.corr_radius)

        coords1 = self.coords_grid(B*(S), H8, W8, device=fmap1.device, dtype=dtype)

        if self.no_split:
            flowfeat, ctxfeat = fmap1.clone(), fmap1.clone()
        else:
            if flowfeat is not None:
                _, ctxfeat = torch.split(fmap1, [self.dim, self.dim], dim=1)
            else:
                flowfeat, ctxfeat = torch.split(fmap1, [self.dim, self.dim], dim=1)

        # add pos emb to ctxfeat (and not flowfeat), since ctxfeat is untouched across iters
        time_emb = self.fetch_time_embed(S, ctxfeat.dtype, is_training).reshape(1,S,self.dim,1,1).repeat(B,1,1,1,1)
        ctxfeat = ctxfeat + time_emb.reshape(B*S,self.dim,1,1)

        if self.no_ctx:
            flowfeat = flowfeat + time_emb.reshape(B*S,self.dim,1,1)

        for itr in range(iters):
            _, _, H8, W8 = flows8.shape
            flows8 = flows8.detach()
            coords2 = (coords1 + flows8).detach() # B*S,2,H,W
            corr = corr_fn(coords2).to(dtype)

            if self.use_relmotion or self.use_sinrelmotion:
                coords_ = coords2.reshape(B,S,2,H8*W8).permute(0,1,3,2) # B,S,H8*W8,2
                rel_coords_forward = coords_[:, :-1] - coords_[:, 1:]
                rel_coords_backward = coords_[:, 1:] - coords_[:, :-1]
                rel_coords_forward = torch.nn.functional.pad(
                    rel_coords_forward, (0, 0, 0, 0, 0, 1) # pad the 3rd-last dim (S) by (0,1)
                )
                rel_coords_backward = torch.nn.functional.pad(
                    rel_coords_backward, (0, 0, 0, 0, 1, 0) # pad the 3rd-last dim (S) by (1,0)
                )
                rel_coords = torch.cat([rel_coords_forward, rel_coords_backward], dim=-1) # B,S,H8*W8,4

                if self.use_sinrelmotion:
                    rel_pos_emb_input = utils.misc.posenc(
                        rel_coords,
                        min_deg=0,
                        max_deg=10,
                    ) # B,S,H*W,pdim
                    motion = rel_pos_emb_input.reshape(B*S,H8,W8,self.pdim).permute(0,3,1,2).to(dtype) # B*S,pdim,H8,W8
                else:
                    motion = rel_coords.reshape(B*S,H8,W8,4).permute(0,3,1,2).to(dtype) # B*S,4,H8,W8

            else:
                if self.use_sinmotion:
                    pos_emb_input = utils.misc.posenc(
                        flows8.reshape(B,S,H8*W8,2),
                        min_deg=0,
                        max_deg=10,
                    ) # B,S,H*W,pdim
                    motion = pos_emb_input.reshape(B*S,H8,W8,self.pdim).permute(0,3,1,2).to(dtype) # B*S,pdim,H8,W8
                else:
                    motion = flows8

            flowfeat = self.update_block(flowfeat, ctxfeat, visconfs8, corr, motion, S)
            flow_update = self.flow_head(flowfeat)
            visconf_update = self.visconf_head(flowfeat)
            weight_update = .25 * self.upsample_weight(flowfeat)
            flows8 = flows8 + flow_update
            visconfs8 = visconfs8 + visconf_update
            flow_up = self.upsample_data(flows8, weight_update)
            visconf_up = self.upsample_data(visconfs8, weight_update)
            if not is_training: # clear mem
                flow_predictions = []
                visconf_predictions = []
            flow_predictions.append(flow_up)
            visconf_predictions.append(visconf_up)

        return flow_predictions, visconf_predictions, flows8, visconfs8, flowfeat
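To make the interface above concrete, here is a minimal usage sketch (added for illustration; it is not part of the commit, and it assumes the rest of the repo, including RelUpdateBlock and the utils.misc helpers that this diff does not show, is importable, plus a CUDA device, since the inference path calls .cuda()):

import torch
from nets.alltracker import Net

# 16-frame window; the defaults select the ConvNeXt backbone and attention update block
model = Net(seqlen=16).cuda().eval()

# video in [0,255], shaped B,T,C,H,W; 256 is a multiple of 64, so InputPadder is a no-op
video = torch.rand(1, 16, 3, 256, 256, device='cuda') * 255

with torch.no_grad():
    flows, visconfs, _, _ = model(video, iters=4)

print(flows.shape)    # (1, 16, 2, 256, 256): flow from frame 0 to every frame
print(visconfs.shape) # (1, 16, 2, 256, 256): visibility and confidence maps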
nets/blocks.py
ADDED
@@ -0,0 +1,1304 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import nn, Tensor
from itertools import repeat
import collections
from typing import Any, Callable, Dict, List, NamedTuple, Optional, Sequence
from functools import partial
import einops
import math
from torchvision.ops.misc import Conv2dNormActivation, Permute
from torchvision.ops.stochastic_depth import StochasticDepth

def _ntuple(n):
    def parse(x):
        if isinstance(x, collections.abc.Iterable) and not isinstance(x, str):
            return tuple(x)
        return tuple(repeat(x, n))
    return parse

def exists(val):
    return val is not None

def default(val, d):
    return val if exists(val) else d

to_2tuple = _ntuple(2)

class InputPadder:
    """ Pads images such that dimensions are divisible by a certain stride """
    def __init__(self, dims, mode='sintel'):
        self.ht, self.wd = dims[-2:]
        pad_ht = (((self.ht // 64) + 1) * 64 - self.ht) % 64
        pad_wd = (((self.wd // 64) + 1) * 64 - self.wd) % 64
        if mode == 'sintel':
            self._pad = [pad_wd//2, pad_wd - pad_wd//2, pad_ht//2, pad_ht - pad_ht//2]
        else:
            self._pad = [pad_wd//2, pad_wd - pad_wd//2, 0, pad_ht]

    def pad(self, *inputs):
        return [F.pad(x, self._pad, mode='replicate') for x in inputs]

    def unpad(self, x):
        ht, wd = x.shape[-2:]
        c = [self._pad[2], ht-self._pad[3], self._pad[0], wd-self._pad[1]]
        return x[..., c[0]:c[1], c[2]:c[3]]
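# Usage sketch for InputPadder (added for clarity; not part of the original commit):
#   padder = InputPadder((1, 3, 436, 1024)) # Sintel-sized frames
#   x_pad, = padder.pad(x)                  # -> (1, 3, 448, 1024), replicate-padded
#   x_out = padder.unpad(x_pad)             # -> (1, 3, 436, 1024)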
def bilinear_sampler(
        input, coords,
        align_corners=True,
        padding_mode="border",
        normalize_coords=True):
    # func from mattie (oct9)
    if input.ndim not in [4, 5]:
        raise ValueError("input must be 4D or 5D.")

    if input.ndim == 4 and not coords.ndim == 4:
        raise ValueError("input is 4D, but coords is not 4D.")

    if input.ndim == 5 and not coords.ndim == 5:
        raise ValueError("input is 5D, but coords is not 5D.")

    if coords.ndim == 5:
        coords = coords[..., [1, 2, 0]] # t x y -> x y t to match what grid_sample() expects.

    if normalize_coords:
        if align_corners:
            # Normalize coordinates from [0, W/H - 1] to [-1, 1].
            coords = (
                coords
                * torch.tensor([2 / max(size - 1, 1) for size in reversed(input.shape[2:])], device=coords.device)
                - 1
            )
        else:
            # Normalize coordinates from [0, W/H] to [-1, 1].
            coords = coords * torch.tensor([2 / size for size in reversed(input.shape[2:])], device=coords.device) - 1

    return F.grid_sample(input, coords, align_corners=align_corners, padding_mode=padding_mode)
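# Note (added for clarity; not part of the original commit): bilinear_sampler takes
# coords in pixel units, ordered (x, y) for 4D inputs; with align_corners=True it
# rescales [0, W-1] x [0, H-1] onto the [-1, 1] range that F.grid_sample expects.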
class CorrBlock:
    def __init__(self, fmap1, fmap2, corr_levels, corr_radius):
        self.num_levels = corr_levels
        self.radius = corr_radius
        self.corr_pyramid = []
        # all pairs correlation
        for i in range(self.num_levels):
            corr = CorrBlock.corr(fmap1, fmap2, 1)
            batch, h1, w1, dim, h2, w2 = corr.shape
            corr = corr.reshape(batch*h1*w1, dim, h2, w2)
            fmap2 = F.interpolate(fmap2, scale_factor=0.5, mode='area')
            # print('corr', corr.shape)
            self.corr_pyramid.append(corr)

    def __call__(self, coords, dilation=None):
        r = self.radius
        coords = coords.permute(0, 2, 3, 1)
        batch, h1, w1, _ = coords.shape

        if dilation is None:
            dilation = torch.ones(batch, 1, h1, w1, device=coords.device)

        out_pyramid = []
        for i in range(self.num_levels):
            corr = self.corr_pyramid[i]
            device = coords.device
            dx = torch.linspace(-r, r, 2*r+1, device=device)
            dy = torch.linspace(-r, r, 2*r+1, device=device)
            delta = torch.stack(torch.meshgrid(dy, dx), axis=-1)
            delta_lvl = delta.view(1, 2*r+1, 2*r+1, 2)
            delta_lvl = delta_lvl * dilation.view(batch * h1 * w1, 1, 1, 1)
            centroid_lvl = coords.reshape(batch*h1*w1, 1, 1, 2) / 2**i
            coords_lvl = centroid_lvl + delta_lvl
            corr = bilinear_sampler(corr, coords_lvl)
            corr = corr.view(batch, h1, w1, -1)
            out_pyramid.append(corr)

        out = torch.cat(out_pyramid, dim=-1)
        out = out.permute(0, 3, 1, 2).contiguous().float()
        return out

    @staticmethod
    def corr(fmap1, fmap2, num_head):
        batch, dim, h1, w1 = fmap1.shape
        h2, w2 = fmap2.shape[2:]
        fmap1 = fmap1.view(batch, num_head, dim // num_head, h1*w1)
        fmap2 = fmap2.view(batch, num_head, dim // num_head, h2*w2)
        corr = fmap1.transpose(2, 3) @ fmap2
        corr = corr.reshape(batch, num_head, h1, w1, h2, w2).permute(0, 2, 3, 1, 4, 5)
        return corr / torch.sqrt(torch.tensor(dim).float())
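# Shape note (added for clarity; not part of the original commit): with L pyramid
# levels and radius r, each __call__ samples a (2r+1)x(2r+1) grid per level, giving
# L*(2r+1)**2 output channels; for corr_levels=5, corr_radius=4 that is 5*81 = 405,
# matching Net.corr_channel in nets/alltracker.py.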
def conv1x1(in_planes, out_planes, stride=1):
    """1x1 convolution without padding"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, padding=0)

def conv3x3(in_planes, out_planes, stride=1):
    """3x3 convolution with padding"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=1)

class LayerNorm2d(nn.LayerNorm):
    def forward(self, x: Tensor) -> Tensor:
        x = x.permute(0, 2, 3, 1)
        x = F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
        x = x.permute(0, 3, 1, 2)
        return x
class CNBlock1d(nn.Module):
    def __init__(
        self,
        dim,
        output_dim,
        layer_scale: float = 1e-6,
        stochastic_depth_prob: float = 0,
        norm_layer: Optional[Callable[..., nn.Module]] = None,
        dense=True,
        use_attn=True,
        use_mixer=False,
        use_conv=False,
        use_convb=False,
        use_layer_scale=True,
    ) -> None:
        super().__init__()
        self.dense = dense
        self.use_attn = use_attn
        self.use_mixer = use_mixer
        self.use_conv = use_conv
        self.use_layer_scale = use_layer_scale

        if use_attn:
            assert not use_mixer
            assert not use_conv
            assert not use_convb

        if norm_layer is None:
            norm_layer = partial(nn.LayerNorm, eps=1e-6)

        if use_attn:
            num_heads = 8
            self.block = AttnBlock(
                hidden_size=dim,
                num_heads=num_heads,
                mlp_ratio=4,
                attn_class=Attention,
            )
        elif use_mixer:
            self.block = MLPMixerBlock(
                S=16,
                dim=dim,
                depth=1,
                expansion_factor=2,
            )
        elif use_conv:
            self.block = nn.Sequential(
                nn.Conv1d(dim, dim, kernel_size=7, padding=3, groups=dim, bias=True, padding_mode='zeros'),
                Permute([0, 2, 1]),
                norm_layer(dim),
                nn.Linear(in_features=dim, out_features=4 * dim, bias=True),
                nn.GELU(),
                nn.Linear(in_features=4 * dim, out_features=dim, bias=True),
                Permute([0, 2, 1]),
            )
        elif use_convb:
            self.block = nn.Sequential(
                nn.Conv1d(dim, dim, kernel_size=3, padding=1, bias=True, padding_mode='zeros'),
                Permute([0, 2, 1]),
                norm_layer(dim),
                nn.Linear(in_features=dim, out_features=4 * dim, bias=True),
                nn.GELU(),
                nn.Linear(in_features=4 * dim, out_features=dim, bias=True),
                Permute([0, 2, 1]),
            )
        else:
            assert(False) # choose attn, mixer, or conv please

        if self.use_layer_scale:
            self.layer_scale = nn.Parameter(torch.ones(dim, 1) * layer_scale)
        else:
            self.layer_scale = 1.0

        self.stochastic_depth = StochasticDepth(stochastic_depth_prob, "row")

        if output_dim != dim:
            self.final = nn.Conv1d(dim, output_dim, kernel_size=1, padding=0)
        else:
            self.final = nn.Identity()

    def forward(self, input, S=None):
        if self.dense:
            assert S is not None
            BS,C,H,W = input.shape
            B = BS//S

            input = einops.rearrange(input, '(b s) c h w -> (b h w) c s', b=B, s=S, c=C, h=H, w=W)

            if self.use_mixer or self.use_attn:
                # mixer/transformer blocks want B,S,C
                result = self.layer_scale * self.block(input.permute(0,2,1)).permute(0,2,1)
            else:
                result = self.layer_scale * self.block(input)
            result = self.stochastic_depth(result)
            result += input
            result = self.final(result)

            result = einops.rearrange(result, '(b h w) c s -> (b s) c h w', b=B, s=S, c=C, h=H, w=W)
        else:
            B,S,C = input.shape

            if S<7:
                return input

            input = einops.rearrange(input, 'b s c -> b c s', b=B, s=S, c=C)

            result = self.layer_scale * self.block(input)
            result = self.stochastic_depth(result)
            result += input

            result = self.final(result)

            result = einops.rearrange(result, 'b c s -> b s c', b=B, s=S, c=C)

        return result
class CNBlock2d(nn.Module):
    def __init__(
        self,
        dim,
        output_dim,
        layer_scale: float = 1e-6,
        stochastic_depth_prob: float = 0,
        norm_layer: Optional[Callable[..., nn.Module]] = None,
        use_layer_scale=True,
    ) -> None:
        super().__init__()
        self.use_layer_scale = use_layer_scale
        if norm_layer is None:
            norm_layer = partial(nn.LayerNorm, eps=1e-6)

        self.block = nn.Sequential(
            nn.Conv2d(dim, dim, kernel_size=7, padding=3, groups=dim, bias=True, padding_mode='zeros'),
            Permute([0, 2, 3, 1]),
            norm_layer(dim),
            nn.Linear(in_features=dim, out_features=4 * dim, bias=True),
            nn.GELU(),
            nn.Linear(in_features=4 * dim, out_features=dim, bias=True),
            Permute([0, 3, 1, 2]),
        )
        if self.use_layer_scale:
            self.layer_scale = nn.Parameter(torch.ones(dim, 1, 1) * layer_scale)
        else:
            self.layer_scale = 1.0
        self.stochastic_depth = StochasticDepth(stochastic_depth_prob, "row")

        if output_dim != dim:
            self.final = nn.Conv2d(dim, output_dim, kernel_size=1, padding=0)
        else:
            self.final = nn.Identity()

    def forward(self, input, S=None):
        result = self.layer_scale * self.block(input)
        result = self.stochastic_depth(result)
        result += input
        result = self.final(result)
        return result
class CNBlockConfig:
    # Stores information listed at Section 3 of the ConvNeXt paper
    def __init__(
        self,
        input_channels: int,
        out_channels: Optional[int],
        num_layers: int,
        downsample: bool,
    ) -> None:
        self.input_channels = input_channels
        self.out_channels = out_channels
        self.num_layers = num_layers
        self.downsample = downsample

    def __repr__(self) -> str:
        s = self.__class__.__name__ + "("
        s += "input_channels={input_channels}"
        s += ", out_channels={out_channels}"
        s += ", num_layers={num_layers}"
        s += ", downsample={downsample}"
        s += ")"
        return s.format(**self.__dict__)
class ConvNeXt(nn.Module):
    def __init__(
            self,
            block_setting: List[CNBlockConfig],
            stochastic_depth_prob: float = 0.0,
            layer_scale: float = 1e-6,
            num_classes: int = 1000,
            block: Optional[Callable[..., nn.Module]] = None,
            norm_layer: Optional[Callable[..., nn.Module]] = None,
            init_weights=True):
        super().__init__()

        self.init_weights = init_weights

        if not block_setting:
            raise ValueError("The block_setting should not be empty")
        elif not (isinstance(block_setting, Sequence) and all([isinstance(s, CNBlockConfig) for s in block_setting])):
            raise TypeError("The block_setting should be List[CNBlockConfig]")

        if block is None:
            block = CNBlock2d

        if norm_layer is None:
            norm_layer = partial(LayerNorm2d, eps=1e-6)

        layers: List[nn.Module] = []

        # Stem
        firstconv_output_channels = block_setting[0].input_channels
        layers.append(
            Conv2dNormActivation(
                3,
                firstconv_output_channels,
                kernel_size=4,
                stride=4,
                padding=0,
                norm_layer=norm_layer,
                activation_layer=None,
                bias=True,
            )
        )

        total_stage_blocks = sum(cnf.num_layers for cnf in block_setting)
        stage_block_id = 0
        for cnf in block_setting:
            # Bottlenecks
            stage: List[nn.Module] = []
            for _ in range(cnf.num_layers):
                # adjust stochastic depth probability based on the depth of the stage block
                sd_prob = stochastic_depth_prob * stage_block_id / (total_stage_blocks - 1.0)
                stage.append(block(cnf.input_channels, cnf.input_channels, layer_scale, sd_prob))
                stage_block_id += 1
            layers.append(nn.Sequential(*stage))
            if cnf.out_channels is not None:
                if cnf.downsample:
                    layers.append(
                        nn.Sequential(
                            norm_layer(cnf.input_channels),
                            nn.Conv2d(cnf.input_channels, cnf.out_channels, kernel_size=2, stride=2),
                        )
                    )
                else:
                    # we convert the 2x2 downsampling layer into a 3x3 with dilation2 and replicate padding.
                    # replicate padding compensates for the fact that this kernel never saw zero-padding.
                    layers.append(
                        nn.Sequential(
                            norm_layer(cnf.input_channels),
                            nn.Conv2d(cnf.input_channels, cnf.out_channels, kernel_size=3, stride=1, padding=2, dilation=2, padding_mode='zeros'),
                        )
                    )

        self.features = nn.Sequential(*layers)

        # self.final_conv = conv1x1(block_setting[-1].input_channels, output_dim)

        for m in self.modules():
            if isinstance(m, (nn.Conv2d, nn.Linear)):
                nn.init.trunc_normal_(m.weight, std=0.02)
                if m.bias is not None:
                    nn.init.zeros_(m.bias)

        if self.init_weights:
            from torchvision.models import convnext_tiny, ConvNeXt_Tiny_Weights
            pretrained_dict = convnext_tiny(weights=ConvNeXt_Tiny_Weights.DEFAULT).state_dict()
            # from torchvision.models import convnext_base, ConvNeXt_Base_Weights
            # pretrained_dict = convnext_base(weights=ConvNeXt_Base_Weights.DEFAULT).state_dict()
            model_dict = self.state_dict()
            pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}

            for k, v in pretrained_dict.items():
                if k == 'features.4.1.weight': # this is the layer normally in charge of 2x2 downsampling
                    # convert to 3x3 filter
                    pretrained_dict[k] = F.interpolate(v, (3, 3), mode='bicubic', align_corners=True) * (4/9.0)

            model_dict.update(pretrained_dict)
            self.load_state_dict(model_dict, strict=False)

    def _forward_impl(self, x: Tensor) -> Tensor:
        x = self.features(x)
        # x = self.final_conv(x)
        return x

    def forward(self, x: Tensor) -> Tensor:
        return self._forward_impl(x)
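# Note (added for clarity; not part of the original commit): with the block_setting
# used in nets/alltracker.py, the 4x4 stem gives stride 4, the first stage transition
# (2x2, stride 2) gives stride 8, and the second transition is the dilated stride-1
# 3x3 built above, so a (B,3,H,W) input comes out as (B,384,H/8,W/8).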
class Mlp(nn.Module):
    """MLP as used in Vision Transformer, MLP-Mixer and related networks"""

    def __init__(
        self,
        in_features,
        hidden_features=None,
        out_features=None,
        act_layer=nn.GELU,
        norm_layer=None,
        bias=True,
        drop=0.0,
        use_conv=False,
    ):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        bias = to_2tuple(bias)
        drop_probs = to_2tuple(drop)
        linear_layer = partial(nn.Conv2d, kernel_size=1) if use_conv else nn.Linear

        self.fc1 = linear_layer(in_features, hidden_features, bias=bias[0])
        self.act = act_layer()
        self.drop1 = nn.Dropout(drop_probs[0])
        self.norm = (
            norm_layer(hidden_features) if norm_layer is not None else nn.Identity()
        )
        self.fc2 = linear_layer(hidden_features, out_features, bias=bias[1])
        self.drop2 = nn.Dropout(drop_probs[1])

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop1(x)
        x = self.fc2(x)
        x = self.drop2(x)
        return x
class Attention(nn.Module):
    def __init__(
        self, query_dim, context_dim=None, num_heads=8, dim_head=48, qkv_bias=False
    ):
        super().__init__()
        inner_dim = dim_head * num_heads
        context_dim = default(context_dim, query_dim)
        self.scale = dim_head**-0.5
        self.heads = num_heads
        self.to_q = nn.Linear(query_dim, inner_dim, bias=qkv_bias)
        self.to_kv = nn.Linear(context_dim, inner_dim * 2, bias=qkv_bias)
        self.to_out = nn.Linear(inner_dim, query_dim)

    def forward(self, x, context=None, attn_bias=None):
        # note: attn_bias is accepted for API compatibility but is not currently
        # applied; scaled_dot_product_attention runs without a mask here
        B, N1, C = x.shape
        H = self.heads
        q = self.to_q(x)
        context = default(context, x)
        k, v = self.to_kv(context).chunk(2, dim=-1)
        q, k, v = map(lambda t: einops.rearrange(t, 'b n (h d) -> b h n d', h=self.heads), (q, k, v))
        x = F.scaled_dot_product_attention(q, k, v) # scale default is already dim^-0.5
        x = einops.rearrange(x, 'b h n d -> b n (h d)')
        return self.to_out(x)

class CrossAttnBlock(nn.Module):
    def __init__(
        self, hidden_size, context_dim, num_heads=1, mlp_ratio=4.0, **block_kwargs
    ):
        super().__init__()
        self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.norm_context = nn.LayerNorm(hidden_size)
        self.cross_attn = Attention(
            hidden_size,
            context_dim=context_dim,
            num_heads=num_heads,
            qkv_bias=True,
            **block_kwargs
        )

        self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        mlp_hidden_dim = int(hidden_size * mlp_ratio)
        approx_gelu = lambda: nn.GELU(approximate="tanh")
        self.mlp = Mlp(
            in_features=hidden_size,
            hidden_features=mlp_hidden_dim,
            act_layer=approx_gelu,
            drop=0,
        )

    def forward(self, x, context, mask=None):
        attn_bias = None
        if mask is not None:
            if mask.shape[1] == x.shape[1]:
                mask = mask[:, None, :, None].expand(
                    -1, self.cross_attn.heads, -1, context.shape[1]
                )
            else:
                mask = mask[:, None, None].expand(
                    -1, self.cross_attn.heads, x.shape[1], -1
                )

            max_neg_value = -torch.finfo(x.dtype).max
            attn_bias = (~mask) * max_neg_value
        x = x + self.cross_attn(
            self.norm1(x), context=self.norm_context(context), attn_bias=attn_bias
        )
        x = x + self.mlp(self.norm2(x))
        return x

class AttnBlock(nn.Module):
    def __init__(
        self,
        hidden_size,
        num_heads,
        attn_class: Callable[..., nn.Module] = Attention,
        mlp_ratio=4.0,
        **block_kwargs
    ):
        super().__init__()
        self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.attn = attn_class(hidden_size, num_heads=num_heads, qkv_bias=True, dim_head=hidden_size//num_heads)
        self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        mlp_hidden_dim = int(hidden_size * mlp_ratio)
        approx_gelu = lambda: nn.GELU(approximate="tanh")
        self.mlp = Mlp(
            in_features=hidden_size,
            hidden_features=mlp_hidden_dim,
            act_layer=approx_gelu,
            drop=0,
        )

    def forward(self, x, mask=None):
        attn_bias = mask
        if mask is not None:
            mask = (
                (mask[:, None] * mask[:, :, None])
                .unsqueeze(1)
                .expand(-1, self.attn.heads, -1, -1)  # Attention stores the head count as .heads
            )
            max_neg_value = -torch.finfo(x.dtype).max
            attn_bias = (~mask) * max_neg_value

        x = x + self.attn(self.norm1(x), attn_bias=attn_bias)
        x = x + self.mlp(self.norm2(x))
        return x


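# Illustrative aside (not part of the diff): a quick shape check for the two block
# types above, with made-up sizes. AttnBlock is self-attention over B,N,C tokens;
# CrossAttnBlock lets one token set attend into another. This assumes the module's
# imports (torch, einops, to_2tuple, partial, default) are in scope.
blk = AttnBlock(hidden_size=384, num_heads=8)
xblk = CrossAttnBlock(hidden_size=384, context_dim=384, num_heads=8)
x = torch.randn(2, 100, 384)   # B,N,C query tokens
ctx = torch.randn(2, 64, 384)  # B,M,C context tokens
print(blk(x).shape)            # torch.Size([2, 100, 384])
print(xblk(x, ctx).shape)      # torch.Size([2, 100, 384])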
class ResidualBlock(nn.Module):
    def __init__(self, in_planes, planes, norm_fn="group", stride=1):
        super(ResidualBlock, self).__init__()

        self.conv1 = nn.Conv2d(
            in_planes,
            planes,
            kernel_size=3,
            padding=1,
            stride=stride,
            padding_mode="zeros",
        )
        self.conv2 = nn.Conv2d(
            planes, planes, kernel_size=3, padding=1, padding_mode="zeros"
        )
        self.relu = nn.ReLU(inplace=True)

        num_groups = planes // 8

        if norm_fn == "group":
            self.norm1 = nn.GroupNorm(num_groups=num_groups, num_channels=planes)
            self.norm2 = nn.GroupNorm(num_groups=num_groups, num_channels=planes)
            if not stride == 1:
                self.norm3 = nn.GroupNorm(num_groups=num_groups, num_channels=planes)

        elif norm_fn == "batch":
            self.norm1 = nn.BatchNorm2d(planes)
            self.norm2 = nn.BatchNorm2d(planes)
            if not stride == 1:
                self.norm3 = nn.BatchNorm2d(planes)

        elif norm_fn == "instance":
            self.norm1 = nn.InstanceNorm2d(planes)
            self.norm2 = nn.InstanceNorm2d(planes)
            if not stride == 1:
                self.norm3 = nn.InstanceNorm2d(planes)

        elif norm_fn == "none":
            self.norm1 = nn.Sequential()
            self.norm2 = nn.Sequential()
            if not stride == 1:
                self.norm3 = nn.Sequential()

        if stride == 1:
            self.downsample = None

        else:
            self.downsample = nn.Sequential(
                nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride), self.norm3
            )

    def forward(self, x):
        y = x
        y = self.relu(self.norm1(self.conv1(y)))
        y = self.relu(self.norm2(self.conv2(y)))

        if self.downsample is not None:
            x = self.downsample(x)

        return self.relu(x + y)


class BasicEncoder(nn.Module):
    def __init__(self, input_dim=3, output_dim=128, stride=4):
        super(BasicEncoder, self).__init__()
        self.stride = stride
        self.norm_fn = "instance"
        self.in_planes = output_dim // 2
        self.norm1 = nn.InstanceNorm2d(self.in_planes)
        self.norm2 = nn.InstanceNorm2d(output_dim * 2)

        self.conv1 = nn.Conv2d(
            input_dim,
            self.in_planes,
            kernel_size=7,
            stride=2,
            padding=3,
            padding_mode="zeros",
        )
        self.relu1 = nn.ReLU(inplace=True)
        self.layer1 = self._make_layer(output_dim // 2, stride=1)
        self.layer2 = self._make_layer(output_dim // 4 * 3, stride=2)
        self.layer3 = self._make_layer(output_dim, stride=2)
        self.layer4 = self._make_layer(output_dim, stride=2)

        self.conv2 = nn.Conv2d(
            output_dim * 3 + output_dim // 4,
            output_dim * 2,
            kernel_size=3,
            padding=1,
            padding_mode="zeros",
        )
        self.relu2 = nn.ReLU(inplace=True)
        self.conv3 = nn.Conv2d(output_dim * 2, output_dim, kernel_size=1)
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
            elif isinstance(m, (nn.InstanceNorm2d)):
                if m.weight is not None:
                    nn.init.constant_(m.weight, 1)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

    def _make_layer(self, dim, stride=1):
        layer1 = ResidualBlock(self.in_planes, dim, self.norm_fn, stride=stride)
        layer2 = ResidualBlock(dim, dim, self.norm_fn, stride=1)
        layers = (layer1, layer2)

        self.in_planes = dim
        return nn.Sequential(*layers)

    def forward(self, x):
        _, _, H, W = x.shape

        x = self.conv1(x)
        x = self.norm1(x)
        x = self.relu1(x)

        a = self.layer1(x)
        b = self.layer2(a)
        c = self.layer3(b)
        d = self.layer4(c)

        def _bilinear_interpolate(x):
            return F.interpolate(
                x,
                (H // self.stride, W // self.stride),
                mode="bilinear",
                align_corners=True,
            )

        a = _bilinear_interpolate(a)
        b = _bilinear_interpolate(b)
        c = _bilinear_interpolate(c)
        d = _bilinear_interpolate(d)

        x = self.conv2(torch.cat([a, b, c, d], dim=1))
        x = self.norm2(x)
        x = self.relu2(x)
        x = self.conv3(x)
        return x

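# Illustrative aside (not part of the diff): BasicEncoder fuses its four residual
# stages on the stride-4 grid, so a B,3,H,W image comes out as B,output_dim,H/4,W/4.
# Sizes here are made up.
enc = BasicEncoder(input_dim=3, output_dim=128, stride=4)
frames = torch.randn(2, 3, 256, 384)  # B,C,H,W
feats = enc(frames)
print(feats.shape)  # torch.Size([2, 128, 64, 96])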
class EfficientUpdateFormer(nn.Module):
    """
    Transformer model that updates track estimates.
    """

    def __init__(
        self,
        space_depth=6,
        time_depth=6,
        input_dim=320,
        hidden_size=384,
        num_heads=8,
        output_dim=130,
        mlp_ratio=4.0,
        num_virtual_tracks=64,
        add_space_attn=True,
        linear_layer_for_vis_conf=False,
        use_time_conv=False,
        use_time_mixer=False,
    ):
        super().__init__()
        self.out_channels = 2
        self.num_heads = num_heads
        self.hidden_size = hidden_size
        self.input_transform = torch.nn.Linear(input_dim, hidden_size, bias=True)
        if linear_layer_for_vis_conf:
            self.flow_head = torch.nn.Linear(hidden_size, output_dim - 2, bias=True)
            self.vis_conf_head = torch.nn.Linear(hidden_size, 2, bias=True)
        else:
            self.flow_head = torch.nn.Linear(hidden_size, output_dim, bias=True)
        self.num_virtual_tracks = num_virtual_tracks
        self.virual_tracks = nn.Parameter(  # (sic) name kept as-is for checkpoint compatibility
            torch.randn(1, num_virtual_tracks, 1, hidden_size)
        )
        self.add_space_attn = add_space_attn
        self.linear_layer_for_vis_conf = linear_layer_for_vis_conf

        if use_time_conv:
            self.time_blocks = nn.ModuleList(
                [
                    CNBlock1d(hidden_size, hidden_size, dense=False)
                    for _ in range(time_depth)
                ]
            )
        elif use_time_mixer:
            self.time_blocks = nn.ModuleList(
                [
                    MLPMixerBlock(
                        S=16,
                        dim=hidden_size,
                        depth=1,
                    )
                    for _ in range(time_depth)
                ]
            )
        else:
            self.time_blocks = nn.ModuleList(
                [
                    AttnBlock(
                        hidden_size,
                        num_heads,
                        mlp_ratio=mlp_ratio,
                        attn_class=Attention,
                    )
                    for _ in range(time_depth)
                ]
            )

        if add_space_attn:
            self.space_virtual_blocks = nn.ModuleList(
                [
                    AttnBlock(
                        hidden_size,
                        num_heads,
                        mlp_ratio=mlp_ratio,
                        attn_class=Attention,
                    )
                    for _ in range(space_depth)
                ]
            )
            self.space_point2virtual_blocks = nn.ModuleList(
                [
                    CrossAttnBlock(
                        hidden_size, hidden_size, num_heads, mlp_ratio=mlp_ratio
                    )
                    for _ in range(space_depth)
                ]
            )
            self.space_virtual2point_blocks = nn.ModuleList(
                [
                    CrossAttnBlock(
                        hidden_size, hidden_size, num_heads, mlp_ratio=mlp_ratio
                    )
                    for _ in range(space_depth)
                ]
            )
            assert len(self.time_blocks) >= len(self.space_virtual2point_blocks)
        self.initialize_weights()

    def initialize_weights(self):
        def _basic_init(module):
            if isinstance(module, nn.Linear):
                torch.nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)
            torch.nn.init.trunc_normal_(self.flow_head.weight, std=0.001)
            if self.linear_layer_for_vis_conf:
                torch.nn.init.trunc_normal_(self.vis_conf_head.weight, std=0.001)

        def _trunc_init(module):
            """ViT weight initialization, original timm impl (for reproducibility)"""
            if isinstance(module, nn.Linear):
                torch.nn.init.trunc_normal_(module.weight, std=0.02)
                if module.bias is not None:
                    nn.init.zeros_(module.bias)

        self.apply(_basic_init)

    def forward(self, input_tensor, mask=None, add_space_attn=True):
        tokens = self.input_transform(input_tensor)

        B, _, T, _ = tokens.shape
        virtual_tokens = self.virual_tracks.repeat(B, 1, T, 1)
        tokens = torch.cat([tokens, virtual_tokens], dim=1)

        _, N, _, _ = tokens.shape
        j = 0
        layers = []
        for i in range(len(self.time_blocks)):
            time_tokens = tokens.contiguous().view(B * N, T, -1) # B N T C -> (B N) T C
            time_tokens = self.time_blocks[i](time_tokens)

            tokens = time_tokens.view(B, N, T, -1) # (B N) T C -> B N T C
            if (
                add_space_attn
                and hasattr(self, "space_virtual_blocks")
                and (i % (len(self.time_blocks) // len(self.space_virtual_blocks)) == 0)
            ):
                space_tokens = (
                    tokens.permute(0, 2, 1, 3).contiguous().view(B * T, N, -1)
                ) # B N T C -> (B T) N C

                point_tokens = space_tokens[:, : N - self.num_virtual_tracks]
                virtual_tokens = space_tokens[:, N - self.num_virtual_tracks :]

                virtual_tokens = self.space_virtual2point_blocks[j](
                    virtual_tokens, point_tokens, mask=mask
                )

                virtual_tokens = self.space_virtual_blocks[j](virtual_tokens)
                point_tokens = self.space_point2virtual_blocks[j](
                    point_tokens, virtual_tokens, mask=mask
                )

                space_tokens = torch.cat([point_tokens, virtual_tokens], dim=1)
                tokens = space_tokens.view(B, T, N, -1).permute(
                    0, 2, 1, 3
                ) # (B T) N C -> B N T C
                j += 1
        tokens = tokens[:, : N - self.num_virtual_tracks]

        flow = self.flow_head(tokens)
        if self.linear_layer_for_vis_conf:
            vis_conf = self.vis_conf_head(tokens)
            flow = torch.cat([flow, vis_conf], dim=-1)

        return flow


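# Illustrative aside (not part of the diff): EfficientUpdateFormer consumes per-track
# tokens shaped B,N,T,C and returns per-track updates of width output_dim. The sizes
# below are hypothetical; input_dim must match the track-feature width used upstream.
former = EfficientUpdateFormer(input_dim=320, hidden_size=384, output_dim=130)
tokens = torch.randn(1, 256, 16, 320)  # B,N,T,C: 256 tracks over 16 frames
delta = former(tokens)
print(delta.shape)  # torch.Size([1, 256, 16, 130])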
class MMPreNormResidual(nn.Module):
    def __init__(self, dim, fn):
        super().__init__()
        self.fn = fn
        self.norm = nn.LayerNorm(dim)

    def forward(self, x):
        return self.fn(self.norm(x)) + x

def MMFeedForward(dim, expansion_factor=4, dropout=0., dense=nn.Linear):
    return nn.Sequential(
        dense(dim, dim * expansion_factor),
        nn.GELU(),
        nn.Dropout(dropout),
        dense(dim * expansion_factor, dim),
        nn.Dropout(dropout)
    )

def MLPMixer(S, input_dim, dim, output_dim, depth=6, expansion_factor=4, dropout=0., do_reduce=False):
    # input is coming in as B,S,C, as standard for mlp and transformer
    # chan_first treats S as the channel dim, and transforms it to a new S
    # chan_last treats C as the channel dim, and transforms it to a new C
    chan_first, chan_last = partial(nn.Conv1d, kernel_size=1), nn.Linear
    if do_reduce:
        return nn.Sequential(
            nn.Linear(input_dim, dim),
            *[nn.Sequential(
                MMPreNormResidual(dim, MMFeedForward(S, expansion_factor, dropout, chan_first)),
                MMPreNormResidual(dim, MMFeedForward(dim, expansion_factor, dropout, chan_last))
            ) for _ in range(depth)],
            nn.LayerNorm(dim),
            Reduce('b n c -> b c', 'mean'),
            nn.Linear(dim, output_dim)
        )
    else:
        return nn.Sequential(
            nn.Linear(input_dim, dim),
            *[nn.Sequential(
                MMPreNormResidual(dim, MMFeedForward(S, expansion_factor, dropout, chan_first)),
                MMPreNormResidual(dim, MMFeedForward(dim, expansion_factor, dropout, chan_last))
            ) for _ in range(depth)],
        )

def MLPMixerBlock(S, dim, depth=1, expansion_factor=4, dropout=0., do_reduce=False):
    # input is coming in as B,S,C, as standard for mlp and transformer
    # chan_first treats S as the channel dim, and transforms it to a new S
    # chan_last treats C as the channel dim, and transforms it to a new C
    chan_first, chan_last = partial(nn.Conv1d, kernel_size=1), nn.Linear
    return nn.Sequential(
        *[nn.Sequential(
            MMPreNormResidual(dim, MMFeedForward(S, expansion_factor, dropout, chan_first)),
            MMPreNormResidual(dim, MMFeedForward(dim, expansion_factor, dropout, chan_last))
        ) for _ in range(depth)],
    )


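# Illustrative aside (not part of the diff): MLPMixerBlock alternates token mixing
# (a kernel-size-1 Conv1d over the S axis) with channel mixing (a Linear over C) on
# B,S,C input. Because S is baked into the Conv1d, the block only accepts sequences
# of exactly that length. Sizes here are made up.
mixer = MLPMixerBlock(S=16, dim=384, depth=1)
x = torch.randn(2, 16, 384)  # B,S,C
print(mixer(x).shape)        # torch.Size([2, 16, 384])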
class MlpUpdateFormer(nn.Module):
    """
    Transformer model that updates track estimates.
    """

    def __init__(
        self,
        space_depth=6,
        time_depth=6,
        input_dim=320,
        hidden_size=384,
        num_heads=8,
        output_dim=130,
        mlp_ratio=4.0,
        num_virtual_tracks=64,
        add_space_attn=True,
        linear_layer_for_vis_conf=False,
    ):
        super().__init__()
        self.out_channels = 2
        self.num_heads = num_heads
        self.hidden_size = hidden_size
        self.input_transform = torch.nn.Linear(input_dim, hidden_size, bias=True)
        if linear_layer_for_vis_conf:
            self.flow_head = torch.nn.Linear(hidden_size, output_dim - 2, bias=True)
            self.vis_conf_head = torch.nn.Linear(hidden_size, 2, bias=True)
        else:
            self.flow_head = torch.nn.Linear(hidden_size, output_dim, bias=True)
        self.num_virtual_tracks = num_virtual_tracks
        self.virual_tracks = nn.Parameter(
            torch.randn(1, num_virtual_tracks, 1, hidden_size)
        )
        self.add_space_attn = add_space_attn
        self.linear_layer_for_vis_conf = linear_layer_for_vis_conf
        self.time_blocks = nn.ModuleList(
            [
                MLPMixer(
                    S=16,
                    input_dim=hidden_size,
                    dim=hidden_size,
                    output_dim=hidden_size,
                    depth=1,
                )
                for _ in range(time_depth)
            ]
        )

        if add_space_attn:
            self.space_virtual_blocks = nn.ModuleList(
                [
                    AttnBlock(
                        hidden_size,
                        num_heads,
                        mlp_ratio=mlp_ratio,
                        attn_class=Attention,
                    )
                    for _ in range(space_depth)
                ]
            )
            self.space_point2virtual_blocks = nn.ModuleList(
                [
                    CrossAttnBlock(
                        hidden_size, hidden_size, num_heads, mlp_ratio=mlp_ratio
                    )
                    for _ in range(space_depth)
                ]
            )
            self.space_virtual2point_blocks = nn.ModuleList(
                [
                    CrossAttnBlock(
                        hidden_size, hidden_size, num_heads, mlp_ratio=mlp_ratio
                    )
                    for _ in range(space_depth)
                ]
            )
            assert len(self.time_blocks) >= len(self.space_virtual2point_blocks)
        self.initialize_weights()

    def initialize_weights(self):
        def _basic_init(module):
            if isinstance(module, nn.Linear):
                torch.nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)
            torch.nn.init.trunc_normal_(self.flow_head.weight, std=0.001)
            if self.linear_layer_for_vis_conf:
                torch.nn.init.trunc_normal_(self.vis_conf_head.weight, std=0.001)

        def _trunc_init(module):
            """ViT weight initialization, original timm impl (for reproducibility)"""
            if isinstance(module, nn.Linear):
                torch.nn.init.trunc_normal_(module.weight, std=0.02)
                if module.bias is not None:
                    nn.init.zeros_(module.bias)

        self.apply(_basic_init)

    def forward(self, input_tensor, mask=None, add_space_attn=True):
        tokens = self.input_transform(input_tensor)

        B, _, T, _ = tokens.shape
        virtual_tokens = self.virual_tracks.repeat(B, 1, T, 1)
        tokens = torch.cat([tokens, virtual_tokens], dim=1)

        _, N, _, _ = tokens.shape
        j = 0
        layers = []
        for i in range(len(self.time_blocks)):
            time_tokens = tokens.contiguous().view(B * N, T, -1) # B N T C -> (B N) T C
            time_tokens = self.time_blocks[i](time_tokens)

            tokens = time_tokens.view(B, N, T, -1) # (B N) T C -> B N T C
            if (
                add_space_attn
                and hasattr(self, "space_virtual_blocks")
                and (i % (len(self.time_blocks) // len(self.space_virtual_blocks)) == 0)
            ):
                space_tokens = (
                    tokens.permute(0, 2, 1, 3).contiguous().view(B * T, N, -1)
                ) # B N T C -> (B T) N C

                point_tokens = space_tokens[:, : N - self.num_virtual_tracks]
                virtual_tokens = space_tokens[:, N - self.num_virtual_tracks :]

                virtual_tokens = self.space_virtual2point_blocks[j](
                    virtual_tokens, point_tokens, mask=mask
                )

                virtual_tokens = self.space_virtual_blocks[j](virtual_tokens)
                point_tokens = self.space_point2virtual_blocks[j](
                    point_tokens, virtual_tokens, mask=mask
                )

                space_tokens = torch.cat([point_tokens, virtual_tokens], dim=1)
                tokens = space_tokens.view(B, T, N, -1).permute(
                    0, 2, 1, 3
                ) # (B T) N C -> B N T C
                j += 1
        tokens = tokens[:, : N - self.num_virtual_tracks]

        flow = self.flow_head(tokens)
        if self.linear_layer_for_vis_conf:
            vis_conf = self.vis_conf_head(tokens)
            flow = torch.cat([flow, vis_conf], dim=-1)

        return flow

class BasicMotionEncoder(nn.Module):
    def __init__(self, corr_channel, dim=128, pdim=2):
        super(BasicMotionEncoder, self).__init__()
        self.pdim = pdim
        self.convc1 = nn.Conv2d(corr_channel, dim*4, 1, padding=0)
        self.convc2 = nn.Conv2d(dim*4, dim+dim//2, 3, padding=1)
        if pdim==2 or pdim==4:
            self.convf1 = nn.Conv2d(pdim, dim*2, 5, padding=2)
            self.convf2 = nn.Conv2d(dim*2, dim//2, 3, padding=1)
            self.conv = nn.Conv2d(dim*2, dim-pdim, 3, padding=1)
        else:
            self.conv = nn.Conv2d(dim+dim//2+pdim, dim, 3, padding=1)

    def forward(self, flow, corr):
        cor = F.relu(self.convc1(corr))
        cor = F.relu(self.convc2(cor))
        if self.pdim==2 or self.pdim==4:
            flo = F.relu(self.convf1(flow))
            flo = F.relu(self.convf2(flo))
            cor_flo = torch.cat([cor, flo], dim=1)
            out = F.relu(self.conv(cor_flo))
            return torch.cat([out, flow], dim=1)
        else:
            # the flow is already encoded to something nice
            cor_flo = torch.cat([cor, flow], dim=1)
            return F.relu(self.conv(cor_flo))
            # return torch.cat([out, flow], dim=1)

def conv133_encoder(input_dim, dim, expansion_factor=4):
    return nn.Sequential(
        nn.Conv2d(input_dim, dim*expansion_factor, kernel_size=1),
        nn.GELU(),
        nn.Conv2d(dim*expansion_factor, dim*expansion_factor, kernel_size=3, padding=1),
        nn.GELU(),
        nn.Conv2d(dim*expansion_factor, dim, kernel_size=3, padding=1),
    )

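# Illustrative aside (not part of the diff): BasicMotionEncoder compresses a
# correlation volume plus the current flow into a dim-channel feature, with the
# raw flow re-appended as the last pdim channels. corr_channel below is hypothetical.
corr_channel = 196
menc = BasicMotionEncoder(corr_channel, dim=128, pdim=2)
flow = torch.randn(2, 2, 64, 96)             # B,2,H,W current flow estimate
corr = torch.randn(2, corr_channel, 64, 96)  # B,corr_channel,H,W
feat = menc(flow, corr)
print(feat.shape)  # torch.Size([2, 128, 64, 96])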
class BasicUpdateBlock(nn.Module):
    def __init__(self, corr_channel, num_blocks, hdim=128, cdim=128):
        # flowfeat is hdim; ctxfeat is cdim. typically hdim==cdim.
        super(BasicUpdateBlock, self).__init__()
        self.encoder = BasicMotionEncoder(corr_channel, dim=cdim)
        self.compressor = conv1x1(2*cdim+hdim, hdim)

        self.refine = []
        for i in range(num_blocks):
            self.refine.append(CNBlock1d(hdim, hdim))
            self.refine.append(CNBlock2d(hdim, hdim))
        self.refine = nn.ModuleList(self.refine)

    def forward(self, flowfeat, ctxfeat, corr, flow, S, upsample=True):
        BS,C,H,W = flowfeat.shape
        B = BS//S

        # with torch.no_grad():
        motion_features = self.encoder(flow, corr)
        flowfeat = self.compressor(torch.cat([flowfeat, ctxfeat, motion_features], dim=1))

        for blk in self.refine:
            flowfeat = blk(flowfeat, S)
        return flowfeat

class FullUpdateBlock(nn.Module):
    def __init__(self, corr_channel, num_blocks, hdim=128, cdim=128, pdim=2, use_attn=False):
        # flowfeat is hdim; ctxfeat is cdim. typically hdim==cdim.
        super(FullUpdateBlock, self).__init__()
        self.pdim = pdim
        self.encoder = BasicMotionEncoder(corr_channel, dim=cdim, pdim=pdim)

        # note we have hdim==cdim
        # compressor chans:
        # dim for flowfeat
        # dim for ctxfeat
        # dim for motion_features
        # pdim for flow (if pdim > 2, e.g. if we give sincos(relflow))
        # 2 for visconf

        if pdim==2:
            # hdim==cdim
            # dim for flowfeat
            # dim for ctxfeat
            # dim for motion_features
            # 2 for visconf
            self.compressor = conv1x1(2*cdim+hdim+2, hdim)
        else:
            # we concatenate the flow info again, to not lose it (e.g., from the relu)
            self.compressor = conv1x1(2*cdim+hdim+2+pdim, hdim)

        self.refine = []
        for i in range(num_blocks):
            self.refine.append(CNBlock1d(hdim, hdim, use_attn=use_attn))
            self.refine.append(CNBlock2d(hdim, hdim))
        self.refine = nn.ModuleList(self.refine)

    def forward(self, flowfeat, ctxfeat, visconf, corr, flow, S, upsample=True):
        BS,C,H,W = flowfeat.shape
        B = BS//S
        motion_features = self.encoder(flow, corr)
        if self.pdim==2:
            flowfeat = self.compressor(torch.cat([flowfeat, ctxfeat, motion_features, visconf], dim=1))
        else:
            # re-concatenate the raw flow so the input matches the pdim>2 channel count above
            flowfeat = self.compressor(torch.cat([flowfeat, ctxfeat, motion_features, visconf, flow], dim=1))
        for blk in self.refine:
            flowfeat = blk(flowfeat, S)
        return flowfeat

class MixerUpdateBlock(nn.Module):
    def __init__(self, corr_channel, num_blocks, hdim=128, cdim=128):
        # flowfeat is hdim; ctxfeat is cdim. typically hdim==cdim.
        super(MixerUpdateBlock, self).__init__()
        self.encoder = BasicMotionEncoder(corr_channel, dim=cdim)
        self.compressor = conv1x1(2*cdim+hdim, hdim)

        self.refine = []
        for i in range(num_blocks):
            self.refine.append(CNBlock1d(hdim, hdim, use_mixer=True))
            self.refine.append(CNBlock2d(hdim, hdim))
        self.refine = nn.ModuleList(self.refine)

    def forward(self, flowfeat, ctxfeat, corr, flow, S, upsample=True):
        BS,C,H,W = flowfeat.shape
        B = BS//S

        # with torch.no_grad():
        motion_features = self.encoder(flow, corr)
        flowfeat = self.compressor(torch.cat([flowfeat, ctxfeat, motion_features], dim=1))

        for ii, blk in enumerate(self.refine):
            flowfeat = blk(flowfeat, S)
        return flowfeat

class FacUpdateBlock(nn.Module):
    def __init__(self, corr_channel, num_blocks, hdim=128, cdim=128, pdim=84, use_attn=False):
        super(FacUpdateBlock, self).__init__()
        self.corr_encoder = conv133_encoder(corr_channel, cdim)
        # note we have hdim==cdim
        # compressor chans:
        # dim for flowfeat
        # dim for ctxfeat
        # dim for corr
        # pdim for flow
        # 2 for visconf
        self.compressor = conv1x1(2*cdim+hdim+2+pdim, hdim)
        self.refine = []
        for i in range(num_blocks):
            self.refine.append(CNBlock1d(hdim, hdim, use_attn=use_attn))
            self.refine.append(CNBlock2d(hdim, hdim))
        self.refine = nn.ModuleList(self.refine)

    def forward(self, flowfeat, ctxfeat, visconf, corr, flow, S, upsample=True):
        BS,C,H,W = flowfeat.shape
        B = BS//S
        corr = self.corr_encoder(corr)
        flowfeat = self.compressor(torch.cat([flowfeat, ctxfeat, corr, visconf, flow], dim=1))
        for blk in self.refine:
            flowfeat = blk(flowfeat, S)
        return flowfeat

class CleanUpdateBlock(nn.Module):
    def __init__(self, corr_channel, num_blocks, cdim=128, hdim=256, pdim=84, use_attn=False, use_layer_scale=True):
        super(CleanUpdateBlock, self).__init__()
        self.corr_encoder = conv133_encoder(corr_channel, cdim)
        # compressor chans:
        # cdim for flowfeat
        # cdim for ctxfeat
        # cdim for corrfeat
        # pdim for flow
        # 2 for visconf
        self.compressor = conv1x1(3*cdim+pdim+2, hdim)
        self.refine = []
        for i in range(num_blocks):
            self.refine.append(CNBlock1d(hdim, hdim, use_attn=use_attn, use_layer_scale=use_layer_scale))
            self.refine.append(CNBlock2d(hdim, hdim, use_layer_scale=use_layer_scale))
        self.refine = nn.ModuleList(self.refine)
        self.final_conv = conv1x1(hdim, cdim)

    def forward(self, flowfeat, ctxfeat, visconf, corr, flow, S, upsample=True):
        BS,C,H,W = flowfeat.shape
        B = BS//S
        corrfeat = self.corr_encoder(corr)
        flowfeat = self.compressor(torch.cat([flowfeat, ctxfeat, corrfeat, flow, visconf], dim=1))
        for blk in self.refine:
            flowfeat = blk(flowfeat, S)
        flowfeat = self.final_conv(flowfeat)
        return flowfeat

class RelUpdateBlock(nn.Module):
    def __init__(self, corr_channel, num_blocks, cdim=128, hdim=128, pdim=4, use_attn=True, use_mixer=False, use_conv=False, use_convb=False, use_layer_scale=True, no_time=False, no_space=False, no_ctx=False):
        super(RelUpdateBlock, self).__init__()
        self.motion_encoder = BasicMotionEncoder(corr_channel, dim=hdim, pdim=pdim) # B,hdim,H,W
        self.no_ctx = no_ctx
        if no_ctx:
            self.compressor = conv1x1(cdim+hdim+2, hdim)
        else:
            self.compressor = conv1x1(2*cdim+hdim+2, hdim)
        self.refine = []
        for i in range(num_blocks):
            if not no_time:
                self.refine.append(CNBlock1d(hdim, hdim, use_attn=use_attn, use_mixer=use_mixer, use_conv=use_conv, use_convb=use_convb, use_layer_scale=use_layer_scale))
            if not no_space:
                self.refine.append(CNBlock2d(hdim, hdim, use_layer_scale=use_layer_scale))
        self.refine = nn.ModuleList(self.refine)
        self.final_conv = conv1x1(hdim, cdim)

    def forward(self, flowfeat, ctxfeat, visconf, corr, flow, S, upsample=True):
        BS,C,H,W = flowfeat.shape
        B = BS//S
        motion_features = self.motion_encoder(flow, corr)
        if self.no_ctx:
            flowfeat = self.compressor(torch.cat([flowfeat, motion_features, visconf], dim=1))
        else:
            flowfeat = self.compressor(torch.cat([flowfeat, ctxfeat, motion_features, visconf], dim=1))
        for blk in self.refine:
            flowfeat = blk(flowfeat, S)
        flowfeat = self.final_conv(flowfeat)
        return flowfeat
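# Illustrative aside (not part of the diff): the update blocks all share one calling
# pattern; RelUpdateBlock is sketched here with hypothetical sizes. It relies on
# CNBlock1d/CNBlock2d/conv1x1 from earlier in this file, and the batch is packed
# as B*S along dim 0 with S passed separately so the temporal blocks can unpack time.
S = 16
corr_channel = 196
upd = RelUpdateBlock(corr_channel, num_blocks=3, cdim=128, hdim=128, pdim=4)
flowfeat = torch.randn(S, 128, 64, 96)       # B*S,cdim,H,W (B=1 here)
ctxfeat = torch.randn(S, 128, 64, 96)
visconf = torch.randn(S, 2, 64, 96)
corr = torch.randn(S, corr_channel, 64, 96)
flow = torch.randn(S, 4, 64, 96)             # pdim channels, e.g. sincos-encoded flow
print(upd(flowfeat, ctxfeat, visconf, corr, flow, S).shape)  # torch.Size([16, 128, 64, 96])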
utils/basic.py ADDED
@@ -0,0 +1,144 @@
import torch
import numpy as np
import os
EPS = 1e-6

def sub2ind(height, width, y, x):
    return y*width + x

def ind2sub(height, width, ind):
    y = ind // width
    x = ind % width
    return y, x

def get_lr_str(lr):
    lrn = "%.1e" % lr # e.g., 5.0e-04
    lrn = lrn[0] + lrn[3:5] + lrn[-1] # e.g., 5e-4
    return lrn

def strnum(x):
    s = '%g' % x
    if '.' in s:
        if x < 1.0:
            s = s[s.index('.'):]
        s = s[:min(len(s),4)]
    return s

def assert_same_shape(t1, t2):
    for (x, y) in zip(list(t1.shape), list(t2.shape)):
        assert(x==y)

def mkdir(path):
    if not os.path.exists(path):
        os.makedirs(path)

def print_stats(name, tensor):
    shape = tensor.shape
    tensor = tensor.detach().cpu().numpy()
    print('%s (%s) min = %.2f, mean = %.2f, max = %.2f' % (name, tensor.dtype, np.min(tensor), np.mean(tensor), np.max(tensor)), shape)

def normalize_single(d):
    # d is a torch tensor of any shape
    dmin = torch.min(d)
    dmax = torch.max(d)
    d = (d-dmin)/(EPS+(dmax-dmin))
    return d

def normalize(d):
    # d is B x whatever. normalize within each element of the batch
    out = torch.zeros(d.size(), dtype=d.dtype, device=d.device)
    B = list(d.size())[0]
    for b in list(range(B)):
        out[b] = normalize_single(d[b])
    return out

def meshgrid2d(B, Y, X, stack=False, norm=False, device='cuda', on_chans=False):
    # returns a meshgrid sized B x Y x X

    grid_y = torch.linspace(0.0, Y-1, Y, device=torch.device(device))
    grid_y = torch.reshape(grid_y, [1, Y, 1])
    grid_y = grid_y.repeat(B, 1, X)

    grid_x = torch.linspace(0.0, X-1, X, device=torch.device(device))
    grid_x = torch.reshape(grid_x, [1, 1, X])
    grid_x = grid_x.repeat(B, Y, 1)

    if norm:
        # note: normalize_grid2d is assumed to be available in scope when norm=True
        grid_y, grid_x = normalize_grid2d(
            grid_y, grid_x, Y, X)

    if stack:
        # note we stack in xy order
        # (see https://pytorch.org/docs/stable/nn.functional.html#torch.nn.functional.grid_sample)
        if on_chans:
            grid = torch.stack([grid_x, grid_y], dim=1)
        else:
            grid = torch.stack([grid_x, grid_y], dim=-1)
        return grid
    else:
        return grid_y, grid_x

def gridcloud2d(B, Y, X, norm=False, device='cuda'):
    # we want to sample for each location in the grid
    grid_y, grid_x = meshgrid2d(B, Y, X, norm=norm, device=device)
    x = torch.reshape(grid_x, [B, -1])
    y = torch.reshape(grid_y, [B, -1])
    # these are B x N
    xy = torch.stack([x, y], dim=2)
    # this is B x N x 2
    return xy

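# Illustrative aside (not part of the diff): gridcloud2d flattens the meshgrid into
# per-pixel (x,y) coordinates, stacked in xy order to match grid_sample conventions.
xy = gridcloud2d(1, 2, 3, device='cpu')  # B=1, Y=2, X=3
print(xy.shape)  # torch.Size([1, 6, 2])
print(xy[0])     # rows: (0,0),(1,0),(2,0),(0,1),(1,1),(2,1)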
def reduce_masked_mean(x, mask, dim=None, keepdim=False, broadcast=False):
    # x and mask are the same shape (silent broadcasting is error-prone,
    # so it is only allowed when broadcast=True)
    # returns a scalar when dim is None
    # dim can be a list of axes
    if not broadcast:
        for (a,b) in zip(x.size(), mask.size()):
            if not a==b:
                print('some shape mismatch:', x.shape, mask.shape)
            assert(a==b) # some shape mismatch!
        # assert(x.size() == mask.size())
    prod = x*mask
    if dim is None:
        numer = torch.sum(prod)
        denom = EPS+torch.sum(mask)
    else:
        numer = torch.sum(prod, dim=dim, keepdim=keepdim)
        denom = EPS+torch.sum(mask, dim=dim, keepdim=keepdim)
    mean = numer/denom
    return mean

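# Illustrative aside (not part of the diff): reduce_masked_mean computes
# sum(x*mask)/(EPS+sum(mask)), i.e. a mean over only the masked-in entries.
x = torch.tensor([1.0, 2.0, 3.0, 100.0])
mask = torch.tensor([1.0, 1.0, 1.0, 0.0])
print(reduce_masked_mean(x, mask))  # ~2.0; the masked-out 100.0 is ignored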
def reduce_masked_median(x, mask, keep_batch=False):
    # x and mask are the same shape
    assert(x.size() == mask.size())
    device = x.device

    B = list(x.shape)[0]
    x = x.detach().cpu().numpy()
    mask = mask.detach().cpu().numpy()

    if keep_batch:
        x = np.reshape(x, [B, -1])
        mask = np.reshape(mask, [B, -1])
        meds = np.zeros([B], np.float32)
        for b in list(range(B)):
            xb = x[b]
            mb = mask[b]
            if np.sum(mb) > 0:
                xb = xb[mb > 0]
                meds[b] = np.median(xb)
            else:
                meds[b] = np.nan
        meds = torch.from_numpy(meds).to(device)
        return meds.float()
    else:
        x = np.reshape(x, [-1])
        mask = np.reshape(mask, [-1])
        if np.sum(mask) > 0:
            x = x[mask > 0]
            med = np.median(x)
        else:
            med = np.nan
        med = np.array([med], np.float32)
        med = torch.from_numpy(med).to(device)
        return med.float()
utils/data.py ADDED
@@ -0,0 +1,96 @@
import torch
import dataclasses
import torch.nn.functional as F
from dataclasses import dataclass
from typing import Any, Optional, Dict


@dataclass(eq=False)
class VideoData:
    """
    Dataclass for storing video tracks data.
    """

    video: torch.Tensor # B,S,C,H,W
    trajs: torch.Tensor # B,S,N,2
    visibs: torch.Tensor # B,S,N
    valids: Optional[torch.Tensor] = None # B,S,N
    seq_name: Optional[str] = None
    dname: Optional[str] = None
    aug_video: Optional[torch.Tensor] = None


def collate_fn(batch):
    """
    Collate function for video tracks data.
    """
    video = torch.stack([b.video for b in batch], dim=0)
    trajs = torch.stack([b.trajs for b in batch], dim=0)
    visibs = torch.stack([b.visibs for b in batch], dim=0)
    seq_name = [b.seq_name for b in batch]
    dname = [b.dname for b in batch]

    return VideoData(
        video=video,
        trajs=trajs,
        visibs=visibs,
        seq_name=seq_name,
        dname=dname,
    )


def collate_fn_train(batch):
    """
    Collate function for video tracks data during training.
    """
    gotit = [gotit for _, gotit in batch]
    video = torch.stack([b.video for b, _ in batch], dim=0)
    trajs = torch.stack([b.trajs for b, _ in batch], dim=0)
    visibs = torch.stack([b.visibs for b, _ in batch], dim=0)
    valids = torch.stack([b.valids for b, _ in batch], dim=0)
    seq_name = [b.seq_name for b, _ in batch]
    dname = [b.dname for b, _ in batch]

    return (
        VideoData(
            video=video,
            trajs=trajs,
            visibs=visibs,
            valids=valids,
            seq_name=seq_name,
            dname=dname,
        ),
        gotit,
    )

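# Illustrative aside (not part of the diff): collate_fn plugs into a standard
# DataLoader whose dataset items are per-sample VideoData (no batch dim yet).
# The toy dataset below is hypothetical.
from torch.utils.data import DataLoader, Dataset

class ToyTracks(Dataset):
    def __len__(self):
        return 4
    def __getitem__(self, idx):
        S, C, H, W, N = 8, 3, 64, 64, 16
        return VideoData(
            video=torch.zeros(S, C, H, W),
            trajs=torch.zeros(S, N, 2),
            visibs=torch.ones(S, N),
            seq_name='toy_%d' % idx,
            dname='toy',
        )

loader = DataLoader(ToyTracks(), batch_size=2, collate_fn=collate_fn)
batch = next(iter(loader))
print(batch.video.shape, batch.seq_name)  # torch.Size([2, 8, 3, 64, 64]) ['toy_0', 'toy_1']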
def try_to_cuda(t: Any) -> Any:
    """
    Try to move the input variable `t` to a cuda device.

    Args:
        t: Input.

    Returns:
        t_cuda: `t` moved to a cuda device, if supported.
    """
    try:
        t = t.float().cuda()
    except AttributeError:
        pass
    return t


def dataclass_to_cuda_(obj):
    """
    Move all contents of a dataclass to cuda inplace if supported.

    Args:
        batch: Input dataclass.

    Returns:
        batch_cuda: `batch` moved to a cuda device, if supported.
    """
    for f in dataclasses.fields(obj):
        setattr(obj, f.name, try_to_cuda(getattr(obj, f.name)))
    return obj
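# Illustrative aside (not part of the diff): continuing the toy batch from the
# sketch above, the whole dataclass moves to the GPU in one call; non-tensor
# fields like seq_name pass through try_to_cuda untouched.
if torch.cuda.is_available():
    batch = dataclass_to_cuda_(batch)
    print(batch.video.device)  # cuda:0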
utils/improc.py ADDED
@@ -0,0 +1,1103 @@
import torch
import numpy as np
import utils.basic
import utils.py
from sklearn.decomposition import PCA
from matplotlib import cm
import matplotlib.pyplot as plt
import cv2
import torch.nn.functional as F
EPS = 1e-6

from skimage.color import (
    rgb2lab, rgb2yuv, rgb2ycbcr, lab2rgb, yuv2rgb, ycbcr2rgb,
    rgb2hsv, hsv2rgb, rgb2xyz, xyz2rgb, rgb2hed, hed2rgb)

def _convert(input_, type_):
    return {
        'float': input_.float(),
        'double': input_.double(),
    }.get(type_, input_)

def _generic_transform_sk_3d(transform, in_type='', out_type=''):
    def apply_transform_individual(input_):
        device = input_.device
        input_ = input_.cpu()
        input_ = _convert(input_, in_type)

        input_ = input_.permute(1, 2, 0).detach().numpy()
        transformed = transform(input_)
        output = torch.from_numpy(transformed).float().permute(2, 0, 1)
        output = _convert(output, out_type)
        return output.to(device)

    def apply_transform(input_):
        to_stack = []
        for image in input_:
            to_stack.append(apply_transform_individual(image))
        return torch.stack(to_stack)
    return apply_transform

hsv_to_rgb = _generic_transform_sk_3d(hsv2rgb)

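# Illustrative aside (not part of the diff): _generic_transform_sk_3d lifts any
# per-image skimage converter to batched B,C,H,W torch tensors; hsv_to_rgb above
# is one instance, and the other imported converters can be wrapped the same way.
rgb_to_lab = _generic_transform_sk_3d(rgb2lab, in_type='float', out_type='float')
img = torch.rand(2, 3, 32, 32)  # B,C,H,W in [0,1]
print(rgb_to_lab(img).shape)    # torch.Size([2, 3, 32, 32])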
def flow2color(flow, clip=0.0):
    B, C, H, W = list(flow.size())
    assert(C==2)
    flow = flow[0:1].detach()
    if clip==0:
        clip = torch.max(torch.abs(flow)).item()
    flow = torch.clamp(flow, -clip, clip)/clip
    radius = torch.sqrt(torch.sum(flow**2, dim=1, keepdim=True)) # B,1,H,W
    radius_clipped = torch.clamp(radius, 0.0, 1.0)
    angle = torch.atan2(-flow[:, 1:2], -flow[:, 0:1]) / np.pi # B,1,H,W
    hue = torch.clamp((angle + 1.0) / 2.0, 0.0, 1.0)
    saturation = torch.ones_like(hue) * 0.75
    value = radius_clipped
    hsv = torch.cat([hue, saturation, value], dim=1) # B,3,H,W
    flow = hsv_to_rgb(hsv)
    flow = (flow*255.0).type(torch.ByteTensor)
    return flow

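# Illustrative aside (not part of the diff): flow2color maps direction to hue and
# magnitude to brightness, returning a uint8 image for the first batch element.
flow = torch.zeros(1, 2, 8, 8)
flow[:, 0] = 3.0             # uniform rightward motion
vis = flow2color(flow)
print(vis.shape, vis.dtype)  # torch.Size([1, 3, 8, 8]) torch.uint8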
COLORMAP_FILE = "./utils/bremm.png"
class ColorMap2d:
    def __init__(self, filename=None):
        self._colormap_file = filename or COLORMAP_FILE
        self._img = (plt.imread(self._colormap_file)*255).astype(np.uint8)

        self._height = self._img.shape[0]
        self._width = self._img.shape[1]

    def __call__(self, X):
        assert len(X.shape) == 2
        output = np.zeros((X.shape[0], 3), dtype=np.uint8)
        for i in range(X.shape[0]):
            x, y = X[i, :]
            xp = int((self._width-1) * x)
            yp = int((self._height-1) * y)
            xp = np.clip(xp, 0, self._width-1)
            yp = np.clip(yp, 0, self._height-1)
            output[i, :] = self._img[yp, xp]
        return output

def get_2d_colors(xys, H, W):
    N,D = xys.shape
    assert(D==2)
    bremm = ColorMap2d()
    xys[:,0] /= float(W-1)
    xys[:,1] /= float(H-1)
    colors = bremm(xys)
    # print('colors', colors)
    # colors = (colors[0]*255).astype(np.uint8)
    # colors = (int(colors[0]),int(colors[1]),int(colors[2]))
    return colors


def get_n_colors(N, sequential=False):
    label_colors = []
    for ii in range(N):
        if sequential:
            rgb = cm.winter(ii/(N-1))
            rgb = (np.array(rgb) * 255).astype(np.uint8)[:3]
        else:
            rgb = np.zeros(3)
            while np.sum(rgb) < 128: # ensure min brightness
                rgb = np.random.randint(0,256,3)
        label_colors.append(rgb)
    return label_colors

def pca_embed(emb, keep, valid=None):
    # helper function for reduce_emb
    # emb is B,C,H,W
    # keep is the number of principal components to keep
    emb = emb + EPS
    emb = emb.permute(0, 2, 3, 1).cpu().detach().numpy() # this is B x H x W x C

    B, H, W, C = np.shape(emb)

    if valid is not None:
        # boolean mask over the H*W pixels; only these are used to fit the PCA
        valid = valid.cpu().detach().numpy().reshape((H*W)) > 0

    emb_reduced = list()

    for img in emb:
        if np.isnan(img).any():
            emb_reduced.append(np.zeros([H, W, keep]))
            continue

        pixels_kd = np.reshape(img, (H*W, C))

        if valid is not None:
            pixels_kd_pca = pixels_kd[valid]
        else:
            pixels_kd_pca = pixels_kd

        P = PCA(keep)
        P.fit(pixels_kd_pca)

        if valid is not None:
            pixels3d = P.transform(pixels_kd) * valid[:, None]
        else:
            pixels3d = P.transform(pixels_kd)

        out_img = np.reshape(pixels3d, [H,W,keep]).astype(np.float32)
        if np.isnan(out_img).any():
            emb_reduced.append(np.zeros([H, W, keep]))
            continue

        emb_reduced.append(out_img)

    emb_reduced = np.stack(emb_reduced, axis=0).astype(np.float32)

    return torch.from_numpy(emb_reduced).permute(0, 3, 1, 2)

def pca_embed_together(emb, keep):
|
| 153 |
+
# emb is B,C,H,W
|
| 154 |
+
# keep is the number of principal components to keep
|
| 155 |
+
emb = emb + EPS
|
| 156 |
+
emb = emb.permute(0, 2, 3, 1).cpu().detach().float().numpy() #this is B x H x W x C
|
| 157 |
+
|
| 158 |
+
B, H, W, C = np.shape(emb)
|
| 159 |
+
if np.isnan(emb).any():
|
| 160 |
+
return torch.zeros(B, keep, H, W)
|
| 161 |
+
|
| 162 |
+
pixelskd = np.reshape(emb, (B*H*W, C))
|
| 163 |
+
P = PCA(keep)
|
| 164 |
+
P.fit(pixelskd)
|
| 165 |
+
pixels3d = P.transform(pixelskd)
|
| 166 |
+
out_img = np.reshape(pixels3d, [B,H,W,keep]).astype(np.float32)
|
| 167 |
+
|
| 168 |
+
if np.isnan(out_img).any():
|
| 169 |
+
return torch.zeros(B, keep, H, W)
|
| 170 |
+
|
| 171 |
+
return torch.from_numpy(out_img).permute(0, 3, 1, 2)
|
| 172 |
+
|
| 173 |
+
def reduce_emb(emb, valid=None, inbound=None, together=False):
|
| 174 |
+
S, C, H, W = list(emb.size())
|
| 175 |
+
keep = 4
|
| 176 |
+
|
| 177 |
+
if together:
|
| 178 |
+
reduced_emb = pca_embed_together(emb, keep)
|
| 179 |
+
else:
|
| 180 |
+
reduced_emb = pca_embed(emb, keep, valid) #not im
|
| 181 |
+
|
| 182 |
+
reduced_emb = reduced_emb[:,1:]
|
| 183 |
+
reduced_emb = utils.basic.normalize(reduced_emb) - 0.5
|
| 184 |
+
if inbound is not None:
|
| 185 |
+
emb_inbound = emb*inbound
|
| 186 |
+
else:
|
| 187 |
+
emb_inbound = None
|
| 188 |
+
|
| 189 |
+
return reduced_emb, emb_inbound
|
| 190 |
+
|
| 191 |
+
def get_feat_pca(feat, valid=None):
|
| 192 |
+
B, C, D, W = list(feat.size())
|
| 193 |
+
pca, _ = reduce_emb(feat, valid=valid,inbound=None, together=True)
|
| 194 |
+
return pca
|
| 195 |
+
|
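As a sketch of how these PCA helpers get used (assuming sklearn's PCA and the EPS constant come from imports at the top of this file, outside this excerpt): get_feat_pca keeps 4 components, drops the first, and returns a 3-channel RGB-like map.

    feat = torch.randn(2, 64, 32, 32)   # B, C, H, W feature map
    feat_pca = get_feat_pca(feat)       # B, 3, H, W, roughly in [-0.5, 0.5]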
def gif_and_tile(ims, just_gif=False):
    S = len(ims)
    # each im is B x C x H x W
    # i want a gif in the left, and the tiled frames on the right
    # for the gif tool, this means making a B x S x C x H x W tensor
    # where the leftmost part is sequential and the rest is tiled
    gif = torch.stack(ims, dim=1)
    if just_gif:
        return gif
    til = torch.cat(ims, dim=2)
    til = til.unsqueeze(dim=1).repeat(1, S, 1, 1, 1)
    im = torch.cat([gif, til], dim=3)
    return im

def preprocess_color(x):
    if isinstance(x, np.ndarray):
        return x.astype(np.float32) * 1./255 - 0.5
    else:
        return x.float() * 1./255 - 0.5

def back2color(i, blacken_zeros=False):
    if blacken_zeros:
        const = torch.tensor([-0.5])
        i = torch.where(i==0.0, const.cuda() if i.is_cuda else const, i)
        return back2color(i)
    else:
        return ((i+0.5)*255).type(torch.ByteTensor)
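preprocess_color and back2color are inverses up to float rounding; a minimal round-trip sketch:

    raw = torch.randint(0, 256, (1, 3, 8, 8), dtype=torch.uint8)
    zero_centered = preprocess_color(raw)   # float32, roughly in [-0.5, 0.5]
    restored = back2color(zero_centered)    # uint8 again (up to rounding)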
def draw_frame_id_on_vis(vis, frame_id, scale=0.5, left=5, top=20, shadow=True):

    rgb = vis.detach().cpu().numpy()[0]
    rgb = np.transpose(rgb, [1, 2, 0]) # put channels last
    rgb = cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)
    color = (255, 255, 255)
    # print('putting frame id', frame_id)

    frame_str = utils.basic.strnum(frame_id)

    text_color_bg = (0,0,0)
    font = cv2.FONT_HERSHEY_SIMPLEX
    text_size, _ = cv2.getTextSize(frame_str, font, scale, 1)
    text_w, text_h = text_size
    if shadow:
        cv2.rectangle(rgb, (left, top-text_h), (left + text_w, top+1), text_color_bg, -1)

    cv2.putText(
        rgb,
        frame_str,
        (left, top), # from left, from top
        font,
        scale, # font scale (float)
        color,
        1) # font thickness (int)
    rgb = cv2.cvtColor(rgb.astype(np.uint8), cv2.COLOR_BGR2RGB)
    vis = torch.from_numpy(rgb).permute(2, 0, 1).unsqueeze(0)
    return vis

def draw_frame_str_on_vis(vis, frame_str, scale=0.5, left=5, top=40, shadow=True):
    # same as draw_frame_id_on_vis, but takes the string directly

    rgb = vis.detach().cpu().numpy()[0]
    rgb = np.transpose(rgb, [1, 2, 0]) # put channels last
    rgb = cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)
    color = (255, 255, 255)

    text_color_bg = (0,0,0)
    font = cv2.FONT_HERSHEY_SIMPLEX
    text_size, _ = cv2.getTextSize(frame_str, font, scale, 1)
    text_w, text_h = text_size
    if shadow:
        cv2.rectangle(rgb, (left, top-text_h), (left + text_w, top+1), text_color_bg, -1)

    cv2.putText(
        rgb,
        frame_str,
        (left, top), # from left, from top
        font,
        scale, # font scale (float)
        color,
        1) # font thickness (int)
    rgb = cv2.cvtColor(rgb.astype(np.uint8), cv2.COLOR_BGR2RGB)
    vis = torch.from_numpy(rgb).permute(2, 0, 1).unsqueeze(0)
    return vis
class Summ_writer(object):
    def __init__(self, writer, global_step, log_freq=10, fps=8, scalar_freq=100, just_gif=False):
        self.writer = writer
        self.global_step = global_step
        self.log_freq = log_freq
        self.scalar_freq = scalar_freq
        self.fps = fps
        self.just_gif = just_gif
        self.maxwidth = 10000
        self.save_this = (self.global_step % self.log_freq == 0)
        self.scalar_freq = max(scalar_freq,1)
        self.save_scalar = (self.global_step % self.scalar_freq == 0)
        if self.save_this:
            self.save_scalar = True

    def summ_gif(self, name, tensor, blacken_zeros=False):
        # tensor should be in B x S x C x H x W

        assert tensor.dtype in {torch.uint8,torch.float32}
        shape = list(tensor.shape)

        if tensor.dtype == torch.float32:
            tensor = back2color(tensor, blacken_zeros=blacken_zeros)

        video_to_write = tensor[0:1]

        S = video_to_write.shape[1]
        if S==1:
            # video_to_write is 1 x 1 x C x H x W
            self.writer.add_image(name, video_to_write[0,0], global_step=self.global_step)
        else:
            self.writer.add_video(name, video_to_write, fps=self.fps, global_step=self.global_step)

        return video_to_write
    def summ_rgbs(self, name, ims, frame_ids=None, frame_strs=None, blacken_zeros=False, only_return=False):
        if self.save_this:

            ims = gif_and_tile(ims, just_gif=self.just_gif)
            vis = ims

            assert vis.dtype in {torch.uint8,torch.float32}

            if vis.dtype == torch.float32:
                vis = back2color(vis, blacken_zeros)

            B, S, C, H, W = list(vis.shape)

            if frame_ids is not None:
                assert(len(frame_ids)==S)
                for s in range(S):
                    vis[:,s] = draw_frame_id_on_vis(vis[:,s], frame_ids[s])

            if frame_strs is not None:
                assert(len(frame_strs)==S)
                for s in range(S):
                    vis[:,s] = draw_frame_str_on_vis(vis[:,s], frame_strs[s])

            if int(W) > self.maxwidth:
                vis = vis[:,:,:,:self.maxwidth]

            if only_return:
                return vis
            else:
                return self.summ_gif(name, vis, blacken_zeros)

    def summ_rgb(self, name, ims, blacken_zeros=False, frame_id=None, frame_str=None, only_return=False, halfres=False, shadow=True):
        if self.save_this:
            assert ims.dtype in {torch.uint8,torch.float32}

            if ims.dtype == torch.float32:
                ims = back2color(ims, blacken_zeros)

            # ims is B x C x H x W
            vis = ims[0:1] # just the first one
            B, C, H, W = list(vis.shape)

            if halfres:
                vis = F.interpolate(vis, scale_factor=0.5)

            if frame_id is not None:
                vis = draw_frame_id_on_vis(vis, frame_id, shadow=shadow)

            if frame_str is not None:
                vis = draw_frame_str_on_vis(vis, frame_str, shadow=shadow)

            if int(W) > self.maxwidth:
                vis = vis[:,:,:,:self.maxwidth]

            if only_return:
                return vis
            else:
                return self.summ_gif(name, vis.unsqueeze(1), blacken_zeros)

    def flow2color(self, flow, clip=0.0):
        B, C, H, W = list(flow.size())
        assert(C==2)
        flow = flow[0:1].detach()

        if False: # alternative path through utils.py.flow_to_image, kept disabled in the source
            flow = flow[0].detach().cpu().permute(1,2,0).numpy() # H,W,2
            if clip > 0:
                clip_flow = clip
            else:
                clip_flow = None
            im = utils.py.flow_to_image(flow, clip_flow=clip_flow, convert_to_bgr=True)
            # im = utils.py.flow_to_image(flow, convert_to_bgr=True)
            im = torch.from_numpy(im).permute(2,0,1).unsqueeze(0).byte() # 1,3,H,W
            im = torch.flip(im, dims=[1]).clone() # BGR

            # # i prefer black bkg
            # white_pixels = (im == 255).all(dim=1, keepdim=True)
            # im[white_pixels.expand(-1, 3, -1, -1)] = 0

            return im

        # flow_abs = torch.abs(flow)
        # flow_mean = flow_abs.mean(dim=[1,2,3])
        # flow_std = flow_abs.std(dim=[1,2,3])
        if clip==0:
            clip = torch.max(torch.abs(flow)).item()

        # if clip:
        flow = torch.clamp(flow, -clip, clip)/clip
        # else:
        #     # Apply some kind of normalization. Divide by the perceived maximum (mean + std*2)
        #     # flow_max = flow_mean + flow_std*2 + 1e-10
        #     flow_max = torch.max(flow_abs[b])
        #     for b in range(B):
        #         flow[b] = flow[b].clamp(-flow_max.item(), flow_max.item()) / flow_max[b].clamp(min=1)

        radius = torch.sqrt(torch.sum(flow**2, dim=1, keepdim=True)) # B x 1 x H x W
        radius_clipped = torch.clamp(radius, 0.0, 1.0)

        angle = torch.atan2(-flow[:, 1:2], -flow[:, 0:1]) / np.pi # B x 1 x H x W

        hue = torch.clamp((angle + 1.0) / 2.0, 0.0, 1.0)
        # hue = torch.mod(angle / (2 * np.pi) + 1.0, 1.0)

        saturation = torch.ones_like(hue) * 0.75
        value = radius_clipped
        hsv = torch.cat([hue, saturation, value], dim=1) # B x 3 x H x W

        # flow = tf.image.hsv_to_rgb(hsv)
        flow = hsv_to_rgb(hsv)
        flow = (flow*255.0).type(torch.ByteTensor)
        # flow = torch.flip(flow, dims=[1]).clone() # BGR
        return flow

    def summ_flow(self, name, im, clip=0.0, only_return=False, frame_id=None, frame_str=None, shadow=True):
        # flow is B x C x H x W
        if self.save_this:
            return self.summ_rgb(name, self.flow2color(im, clip=clip), only_return=only_return, frame_id=frame_id, frame_str=frame_str, shadow=shadow)
        else:
            return None

    def summ_oneds(self, name, ims, frame_ids=None, frame_strs=None, bev=False, fro=False, logvis=False, reduce_max=False, max_val=0.0, norm=True, only_return=False, do_colorize=False):
        if self.save_this:
            if bev:
                B, C, H, _, W = list(ims[0].shape)
                if reduce_max:
                    ims = [torch.max(im, dim=3)[0] for im in ims]
                else:
                    ims = [torch.mean(im, dim=3) for im in ims]
            elif fro:
                B, C, _, H, W = list(ims[0].shape)
                if reduce_max:
                    ims = [torch.max(im, dim=2)[0] for im in ims]
                else:
                    ims = [torch.mean(im, dim=2) for im in ims]

            if len(ims) != 1: # sequence
                im = gif_and_tile(ims, just_gif=self.just_gif)
            else:
                im = torch.stack(ims, dim=1) # single frame

            B, S, C, H, W = list(im.shape)

            if logvis and max_val:
                max_val = np.log(max_val)
                im = torch.log(torch.clamp(im, 0)+1.0)
                im = torch.clamp(im, 0, max_val)
                im = im/max_val
                norm = False
            elif max_val:
                im = torch.clamp(im, 0, max_val)
                im = im/max_val
                norm = False

            if norm:
                # normalize before oned2inferno,
                # so that the ranges are similar within B across S
                im = utils.basic.normalize(im)

            im = im.view(B*S, C, H, W)
            vis = oned2inferno(im, norm=norm, do_colorize=do_colorize)
            vis = vis.view(B, S, 3, H, W)

            if frame_ids is not None:
                assert(len(frame_ids)==S)
                for s in range(S):
                    vis[:,s] = draw_frame_id_on_vis(vis[:,s], frame_ids[s])

            if frame_strs is not None:
                assert(len(frame_strs)==S)
                for s in range(S):
                    vis[:,s] = draw_frame_str_on_vis(vis[:,s], frame_strs[s])

            if W > self.maxwidth:
                vis = vis[...,:self.maxwidth]

            if only_return:
                return vis
            else:
                self.summ_gif(name, vis)

    def summ_oned(self, name, im, bev=False, fro=False, logvis=False, max_val=0, max_along_y=False, norm=True, frame_id=None, frame_str=None, only_return=False, shadow=True):
        if self.save_this:

            if bev:
                B, C, H, _, W = list(im.shape)
                if max_along_y:
                    im = torch.max(im, dim=3)[0]
                else:
                    im = torch.mean(im, dim=3)
            elif fro:
                B, C, _, H, W = list(im.shape)
                if max_along_y:
                    im = torch.max(im, dim=2)[0]
                else:
                    im = torch.mean(im, dim=2)
            else:
                B, C, H, W = list(im.shape)

            im = im[0:1] # just the first one
            assert(C==1)

            if logvis and max_val:
                max_val = np.log(max_val)
                im = torch.log(im)
                im = torch.clamp(im, 0, max_val)
                im = im/max_val
                norm = False
            elif max_val:
                im = torch.clamp(im, 0, max_val)/max_val
                norm = False

            vis = oned2inferno(im, norm=norm)
            if W > self.maxwidth:
                vis = vis[...,:self.maxwidth]
            return self.summ_rgb(name, vis, blacken_zeros=False, frame_id=frame_id, frame_str=frame_str, only_return=only_return, shadow=shadow)
    def summ_feats(self, name, feats, valids=None, pca=True, fro=False, only_return=False, frame_ids=None, frame_strs=None):
        if self.save_this:
            if valids is not None:
                valids = torch.stack(valids, dim=1)

            feats = torch.stack(feats, dim=1)
            # feats leads with B x S x C

            if feats.ndim==6:

                # feats is B x S x C x D x H x W
                if fro:
                    reduce_dim = 3
                else:
                    reduce_dim = 4

                if valids is None:
                    feats = torch.mean(feats, dim=reduce_dim)
                else:
                    valids = valids.repeat(1, 1, feats.size()[2], 1, 1, 1)
                    feats = utils.basic.reduce_masked_mean(feats, valids, dim=reduce_dim)

            B, S, C, D, W = list(feats.size())

            if not pca:
                # feats leads with B x S x C
                feats = torch.mean(torch.abs(feats), dim=2, keepdims=True)
                # feats leads with B x S x 1
                feats = torch.unbind(feats, dim=1)
                return self.summ_oneds(name=name, ims=feats, norm=True, only_return=only_return, frame_ids=frame_ids, frame_strs=frame_strs)

            else:
                __p = lambda x: utils.basic.pack_seqdim(x, B)
                __u = lambda x: utils.basic.unpack_seqdim(x, B)

                feats_ = __p(feats)

                if valids is None:
                    feats_pca_ = get_feat_pca(feats_)
                else:
                    valids_ = __p(valids)
                    feats_pca_ = get_feat_pca(feats_, valids_) # pass the packed valids (the source passed the unpacked tensor)

                feats_pca = __u(feats_pca_)

                return self.summ_rgbs(name=name, ims=torch.unbind(feats_pca, dim=1), only_return=only_return, frame_ids=frame_ids, frame_strs=frame_strs)

    def summ_feat(self, name, feat, valid=None, pca=True, only_return=False, bev=False, fro=False, frame_id=None, frame_str=None):
        if self.save_this:
            if feat.ndim==5: # B x C x D x H x W

                if bev:
                    reduce_axis = 3
                elif fro:
                    reduce_axis = 2
                else:
                    # default to bev
                    reduce_axis = 3

                if valid is None:
                    feat = torch.mean(feat, dim=reduce_axis)
                else:
                    valid = valid.repeat(1, feat.size()[1], 1, 1, 1)
                    feat = utils.basic.reduce_masked_mean(feat, valid, dim=reduce_axis)

            B, C, D, W = list(feat.shape)

            if not pca:
                feat = torch.mean(torch.abs(feat), dim=1, keepdims=True)
                # feat is B x 1 x D x W
                return self.summ_oned(name=name, im=feat, norm=True, only_return=only_return, frame_id=frame_id, frame_str=frame_str)
            else:
                feat_pca = get_feat_pca(feat, valid)
                return self.summ_rgb(name, feat_pca, only_return=only_return, frame_id=frame_id, frame_str=frame_str)

    def summ_scalar(self, name, value):
        if (not (isinstance(value, int) or isinstance(value, float) or isinstance(value, np.float32))) and ('torch' in value.type()):
            value = value.detach().cpu().numpy()
        if not np.isnan(value):
            if (self.log_freq == 1):
                self.writer.add_scalar(name, value, global_step=self.global_step)
            elif self.save_this or self.save_scalar:
                self.writer.add_scalar(name, value, global_step=self.global_step)
    def summ_traj2ds_on_rgbs(self, name, trajs, rgbs, visibs=None, valids=None, frame_ids=None, frame_strs=None, only_return=False, show_dots=True, cmap='coolwarm', vals=None, linewidth=1, max_show=1024):
        # trajs is B, S, N, 2
        # rgbs is B, S, C, H, W
        B, S, C, H, W = rgbs.shape
        B, S2, N, D = trajs.shape
        assert(S==S2)

        rgbs = rgbs[0] # S, C, H, W
        trajs = trajs[0] # S, N, 2
        if valids is None:
            valids = torch.ones_like(trajs[:,:,0]) # S, N
        else:
            valids = valids[0]

        if visibs is None:
            visibs = torch.ones_like(trajs[:,:,0]) # S, N
        else:
            visibs = visibs[0]

        if vals is not None:
            vals = vals[0] # N

        if N > max_show:
            inds = np.random.choice(N, max_show)
            trajs = trajs[:,inds]
            valids = valids[:,inds]
            visibs = visibs[:,inds]
            if vals is not None:
                vals = vals[inds]
            N = trajs.shape[1]

        trajs = trajs.clamp(-16, W+16)

        rgbs_color = []
        for rgb in rgbs:
            rgb = back2color(rgb).detach().cpu().numpy()
            rgb = np.transpose(rgb, [1, 2, 0]) # put channels last
            rgbs_color.append(rgb) # each element H x W x 3

        for i in range(min(N, max_show)):
            if cmap=='onediff' and i==0:
                cmap_ = 'spring'
            elif cmap=='onediff':
                cmap_ = 'winter'
            else:
                cmap_ = cmap
            traj = trajs[:,i].long().detach().cpu().numpy() # S, 2
            valid = valids[:,i].long().detach().cpu().numpy() # S

            if vals is not None:
                val = vals[i].float().detach().cpu().numpy() # []
            else:
                val = None

            for t in range(S):
                if valid[t]:
                    rgbs_color[t] = self.draw_traj_on_image_py(rgbs_color[t], traj[:t+1], S=S, show_dots=show_dots, cmap=cmap_, val=val, linewidth=linewidth)

        for i in range(min(N, max_show)):
            if cmap=='onediff' and i==0:
                cmap_ = 'spring'
            elif cmap=='onediff':
                cmap_ = 'winter'
            else:
                cmap_ = cmap
            traj = trajs[:,i] # S,2
            vis = visibs[:,i].round() # S
            valid = valids[:,i] # S
            rgbs_color = self.draw_circ_on_images_py(rgbs_color, traj, vis, S=S, show_dots=show_dots, cmap=cmap_, linewidth=linewidth)

        rgbs = []
        for rgb in rgbs_color:
            rgb = torch.from_numpy(rgb).permute(2, 0, 1).unsqueeze(0)
            rgbs.append(preprocess_color(rgb))

        return self.summ_rgbs(name, rgbs, only_return=only_return, frame_ids=frame_ids, frame_strs=frame_strs)

    def summ_traj2ds_on_rgbs2(self, name, trajs, visibles, rgbs, valids=None, frame_ids=None, frame_strs=None, only_return=False, show_dots=True, cmap=None, linewidth=1, max_show=1024):
        # trajs is B, S, N, 2
        # rgbs is B, S, C, H, W
        B, S, C, H, W = rgbs.shape
        B, S2, N, D = trajs.shape
        assert(S==S2)

        rgbs = rgbs[0] # S, C, H, W
        trajs = trajs[0] # S, N, 2
        visibles = visibles[0] # S, N
        if valids is None:
            valids = torch.ones_like(trajs[:,:,0]) # S, N
        else:
            valids = valids[0]

        rgbs_color = []
        for rgb in rgbs:
            rgb = back2color(rgb).detach().cpu().numpy()
            rgb = np.transpose(rgb, [1, 2, 0]) # put channels last
            rgbs_color.append(rgb) # each element H x W x 3

        trajs = trajs.long().detach().cpu().numpy() # S, N, 2
        visibles = visibles.float().detach().cpu().numpy() # S, N
        valids = valids.long().detach().cpu().numpy() # S, N

        for i in range(min(N, max_show)):
            if cmap=='onediff' and i==0:
                cmap_ = 'spring'
            elif cmap=='onediff':
                cmap_ = 'winter'
            else:
                cmap_ = cmap
            traj = trajs[:,i] # S,2
            vis = visibles[:,i] # S
            valid = valids[:,i] # S
            rgbs_color = self.draw_traj_on_images_py(rgbs_color, traj, S=S, show_dots=show_dots, cmap=cmap_, linewidth=linewidth)

        for i in range(min(N, max_show)):
            if cmap=='onediff' and i==0:
                cmap_ = 'spring'
            elif cmap=='onediff':
                cmap_ = 'winter'
            else:
                cmap_ = cmap
            traj = trajs[:,i] # S,2
            vis = visibles[:,i] # S
            valid = valids[:,i] # S
            rgbs_color = self.draw_circ_on_images_py(rgbs_color, traj, vis, S=S, show_dots=show_dots, cmap=None, linewidth=linewidth)

        rgbs = []
        for rgb in rgbs_color:
            rgb = torch.from_numpy(rgb).permute(2, 0, 1).unsqueeze(0)
            rgbs.append(preprocess_color(rgb))

        return self.summ_rgbs(name, rgbs, only_return=only_return, frame_ids=frame_ids, frame_strs=frame_strs)

    def summ_traj2ds_on_rgb(self, name, trajs, rgb, valids=None, show_dots=True, show_lines=True, frame_id=None, frame_str=None, only_return=False, cmap='coolwarm', linewidth=1, max_show=1024):
        # trajs is B, S, N, 2
        # rgb is B, C, H, W
        B, C, H, W = rgb.shape
        B, S, N, D = trajs.shape

        rgb = rgb[0] # C, H, W
        trajs = trajs[0] # S, N, 2

        if valids is None:
            valids = torch.ones_like(trajs[:,:,0])
        else:
            valids = valids[0]

        rgb_color = back2color(rgb).detach().cpu().numpy()
        rgb_color = np.transpose(rgb_color, [1, 2, 0]) # put channels last

        # using maxdist will dampen the colors for short motions
        # norms = torch.sqrt(1e-4 + torch.sum((trajs[-1] - trajs[0])**2, dim=1)) # N
        # maxdist = torch.quantile(norms, 0.95).detach().cpu().numpy()
        maxdist = None
        trajs = trajs.long().detach().cpu().numpy() # S, N, 2
        valids = valids.long().detach().cpu().numpy() # S, N

        if N > max_show:
            inds = np.random.choice(N, max_show)
            trajs = trajs[:,inds]
            valids = valids[:,inds]
            N = trajs.shape[1]

        for i in range(min(N, max_show)):
            if cmap=='onediff' and i==0:
                cmap_ = 'spring'
            elif cmap=='onediff':
                cmap_ = 'winter'
            else:
                cmap_ = cmap
            traj = trajs[:,i] # S, 2
            valid = valids[:,i] # S
            if valid[0]==1:
                traj = traj[valid>0]
                rgb_color = self.draw_traj_on_image_py(
                    rgb_color, traj, S=S, show_dots=show_dots, show_lines=show_lines, cmap=cmap_, maxdist=maxdist, linewidth=linewidth)

        rgb_color = torch.from_numpy(rgb_color).permute(2, 0, 1).unsqueeze(0)
        rgb = preprocess_color(rgb_color)
        return self.summ_rgb(name, rgb, only_return=only_return, frame_id=frame_id, frame_str=frame_str)
    def draw_traj_on_image_py(self, rgb, traj, S=50, linewidth=1, show_dots=False, show_lines=True, cmap='coolwarm', val=None, maxdist=None):
        # all inputs are numpy tensors
        # rgb is H x W x 3
        # traj is S x 2

        H, W, C = rgb.shape
        assert(C==3)

        rgb = rgb.astype(np.uint8).copy()

        S1, D = traj.shape
        assert(D==2)

        color_map = cm.get_cmap(cmap)

        for s in range(S1):
            if val is not None:
                color = np.array(color_map(val)[:3]) * 255 # rgb
            else:
                if maxdist is not None:
                    val = (np.sqrt(np.sum((traj[s]-traj[0])**2))/maxdist).clip(0,1)
                    color = np.array(color_map(val)[:3]) * 255 # rgb
                else:
                    color = np.array(color_map((s)/max(1,float(S-2)))[:3]) * 255 # rgb

            if show_lines and s<(S1-1):
                cv2.line(rgb,
                         (int(traj[s,0]), int(traj[s,1])),
                         (int(traj[s+1,0]), int(traj[s+1,1])),
                         color,
                         linewidth,
                         cv2.LINE_AA)
            if show_dots:
                cv2.circle(rgb, (int(traj[s,0]), int(traj[s,1])), linewidth, color, -1)

        # if maxdist is not None:
        #     val = (np.sqrt(np.sum((traj[-1]-traj[0])**2))/maxdist).clip(0,1)
        #     color = np.array(color_map(val)[:3]) * 255 # rgb
        # else:
        #     # draw the endpoint of traj, using the next color (which may be the last color)
        #     color = np.array(color_map((S1-1)/max(1,float(S-2)))[:3]) * 255 # rgb

        # # emphasize endpoint
        # cv2.circle(rgb, (traj[-1,0], traj[-1,1]), linewidth*2, color, -1)

        return rgb

    def draw_traj_on_images_py(self, rgbs, traj, S=50, linewidth=1, show_dots=False, cmap='coolwarm', maxdist=None):
        # all inputs are numpy tensors
        # rgbs is a list of H,W,3
        # traj is S,2
        H, W, C = rgbs[0].shape
        assert(C==3)

        rgbs = [rgb.astype(np.uint8).copy() for rgb in rgbs]

        S1, D = traj.shape
        assert(D==2)

        x = int(np.clip(traj[0,0], 0, W-1))
        y = int(np.clip(traj[0,1], 0, H-1))
        color = rgbs[0][y,x]
        color = (int(color[0]),int(color[1]),int(color[2]))
        for s in range(S):
            # bak_color = np.array(color_map(1.0)[:3]) * 255 # rgb
            # cv2.circle(rgbs[s], (traj[s,0], traj[s,1]), linewidth*4, bak_color, -1)
            cv2.polylines(rgbs[s],
                          [traj[:s+1]],
                          False,
                          color,
                          linewidth,
                          cv2.LINE_AA)
        return rgbs

    def draw_circs_on_image_py(self, rgb, xy, colors=None, linewidth=10, radius=3, show_dots=False, maxdist=None):
        # all inputs are numpy tensors
        # rgb is H,W,3
        # xy is N,2
        H, W, C = rgb.shape
        assert(C==3)

        rgb = rgb.astype(np.uint8).copy()

        N, D = xy.shape
        assert(D==2)

        xy = xy.astype(np.float32)
        xy[:,0] = np.clip(xy[:,0], 0, W-1)
        xy[:,1] = np.clip(xy[:,1], 0, H-1)
        xy = xy.astype(np.int32)

        if colors is None:
            colors = get_n_colors(N)

        for n in range(N):
            color = colors[n]
            # color = (color[0]*255).astype(np.uint8)
            color = (int(color[0]),int(color[1]),int(color[2]))

            # x = int(np.clip(xy[0,0], 0, W-1))
            # y = int(np.clip(xy[0,1], 0, H-1))
            # color_ = rgbs[0][y,x]
            # color_ = (int(color_[0]),int(color_[1]),int(color_[2]))

            cv2.circle(rgb, (int(xy[n,0]), int(xy[n,1])), linewidth, color, 3)
            # vis_color = int(np.squeeze(vis[s])*255)
            # vis_color = (vis_color,vis_color,vis_color)
            # cv2.circle(rgbs[s], (traj[s,0], traj[s,1]), linewidth+1, vis_color, -1)
        return rgb

    def draw_circ_on_images_py(self, rgbs, traj, vis, S=50, linewidth=1, show_dots=False, cmap=None, maxdist=None):
        # all inputs are numpy tensors
        # rgbs is a list of H,W,3
        # traj is S,2
        H, W, C = rgbs[0].shape
        assert(C==3)

        rgbs = [rgb.astype(np.uint8).copy() for rgb in rgbs]

        S1, D = traj.shape
        assert(D==2)

        if cmap is None:
            bremm = ColorMap2d()
            traj_ = traj[0:1].astype(np.float32)
            traj_[:,0] /= float(W)
            traj_[:,1] /= float(H)
            color = bremm(traj_)
            color = (color[0]*255).astype(np.uint8)
            color = (int(color[0]),int(color[1]),int(color[2]))

        for s in range(S):
            if cmap is not None:
                color_map = cm.get_cmap(cmap)
                # color = np.array(color_map(s/(S-1))[:3]) * 255 # rgb
                color = np.array(color_map((s)/max(1,float(S-2)))[:3]) * 255 # rgb

            cv2.circle(rgbs[s], (int(traj[s,0]), int(traj[s,1])), linewidth+2, color, -1)
            vis_color = int(np.squeeze(vis[s])*255)
            vis_color = (vis_color,vis_color,vis_color)
            cv2.circle(rgbs[s], (int(traj[s,0]), int(traj[s,1])), linewidth+1, vis_color, -1)

        return rgbs
    def summ_pts_on_rgb(self, name, trajs, rgb, visibs=None, valids=None, frame_id=None, frame_str=None, only_return=False, show_dots=True, colors=None, cmap='coolwarm', linewidth=1, max_show=1024, already_sorted=False):
        # trajs is B, S, N, 2
        # rgb is B, C, H, W
        B, C, H, W = rgb.shape
        B, S, N, D = trajs.shape

        rgb = rgb[0] # C, H, W
        trajs = trajs[0] # S, N, 2
        if valids is None:
            valids = torch.ones_like(trajs[:,:,0]) # S, N
        else:
            valids = valids[0]
        if visibs is None:
            visibs = torch.ones_like(trajs[:,:,0]) # S, N
        else:
            visibs = visibs[0]

        trajs = trajs.clamp(-16, W+16)

        if N > max_show:
            inds = np.random.choice(N, max_show)
            trajs = trajs[:,inds]
            valids = valids[:,inds]
            visibs = visibs[:,inds]
            N = trajs.shape[1]

        if not already_sorted:
            inds = torch.argsort(torch.mean(trajs[:,:,1], dim=0))
            trajs = trajs[:,inds]
            valids = valids[:,inds]
            visibs = visibs[:,inds]

        rgb = back2color(rgb).detach().cpu().numpy()
        rgb = np.transpose(rgb, [1, 2, 0]) # put channels last

        trajs = trajs.long().detach().cpu().numpy() # S, N, 2
        valids = valids.long().detach().cpu().numpy() # S, N
        visibs = visibs.long().detach().cpu().numpy() # S, N

        rgb = rgb.astype(np.uint8).copy()

        for i in range(min(N, max_show)):
            if cmap=='onediff' and i==0:
                cmap_ = 'spring'
            elif cmap=='onediff':
                cmap_ = 'winter'
            else:
                cmap_ = cmap
            traj = trajs[:,i] # S,2
            valid = valids[:,i] # S
            visib = visibs[:,i] # S

            if colors is None:
                ii = i/(1e-4+N-1.0)
                color_map = cm.get_cmap(cmap)
                color = np.array(color_map(ii)[:3]) * 255 # rgb
            else:
                color = np.array(colors[i]).astype(np.int64)
            color = (int(color[0]),int(color[1]),int(color[2]))

            for s in range(S):
                if valid[s]:
                    if visib[s]:
                        thickness = -1 # filled circle for visible points
                    else:
                        thickness = 2  # ring for occluded points
                    cv2.circle(rgb, (int(traj[s,0]), int(traj[s,1])), linewidth, color, thickness)
        rgb = torch.from_numpy(rgb).permute(2,0,1).unsqueeze(0)
        rgb = preprocess_color(rgb)
        return self.summ_rgb(name, rgb, only_return=only_return, frame_id=frame_id, frame_str=frame_str)

    def summ_pts_on_rgbs(self, name, trajs, rgbs, visibs=None, valids=None, frame_ids=None, only_return=False, show_dots=True, cmap='coolwarm', colors=None, linewidth=1, max_show=1024, frame_strs=None):
        # trajs is B, S, N, 2
        # rgbs is B, S, C, H, W
        B, S, C, H, W = rgbs.shape
        B, S2, N, D = trajs.shape
        assert(S==S2)

        rgbs = rgbs[0] # S, C, H, W
        trajs = trajs[0] # S, N, 2
        if valids is None:
            valids = torch.ones_like(trajs[:,:,0]) # S, N
        else:
            valids = valids[0]
        if visibs is None:
            visibs = torch.ones_like(trajs[:,:,0]) # S, N
        else:
            visibs = visibs[0]

        if N > max_show:
            inds = np.random.choice(N, max_show)
            trajs = trajs[:,inds]
            valids = valids[:,inds]
            visibs = visibs[:,inds]
            N = trajs.shape[1]
        inds = torch.argsort(torch.mean(trajs[:,:,1], dim=0))
        trajs = trajs[:,inds]
        valids = valids[:,inds]
        visibs = visibs[:,inds]

        rgbs_color = []
        for rgb in rgbs:
            rgb = back2color(rgb).detach().cpu().numpy()
            rgb = np.transpose(rgb, [1, 2, 0]) # put channels last
            rgbs_color.append(rgb) # each element H x W x 3

        trajs = trajs.long().detach().cpu().numpy() # S, N, 2
        valids = valids.long().detach().cpu().numpy() # S, N
        visibs = visibs.long().detach().cpu().numpy() # S, N

        rgbs_color = [rgb.astype(np.uint8).copy() for rgb in rgbs_color]

        for i in range(min(N, max_show)):
            traj = trajs[:,i] # S,2
            valid = valids[:,i] # S
            visib = visibs[:,i] # S

            if colors is None:
                ii = i/(1e-4+N-1.0)
                color_map = cm.get_cmap(cmap)
                color = np.array(color_map(ii)[:3]) * 255 # rgb
            else:
                color = np.array(colors[i]).astype(np.int64)
            color = (int(color[0]),int(color[1]),int(color[2]))

            for s in range(S):
                if valid[s]:
                    if visib[s]:
                        thickness = -1 # filled circle for visible points
                    else:
                        thickness = 2  # ring for occluded points
                    cv2.circle(rgbs_color[s], (int(traj[s,0]), int(traj[s,1])), int(linewidth), color, thickness)
        rgbs = []
        for rgb in rgbs_color:
            rgb = torch.from_numpy(rgb).permute(2, 0, 1).unsqueeze(0)
            rgbs.append(preprocess_color(rgb))

        return self.summ_rgbs(name, rgbs, only_return=only_return, frame_ids=frame_ids, frame_strs=frame_strs)
utils/loss.py ADDED
@@ -0,0 +1,220 @@
import torch
import torch.nn.functional as F
import torch.nn as nn
from typing import List
import utils.basic


def sequence_loss(
    flow_preds,
    flow_gt,
    valids,
    vis=None,
    gamma=0.8,
    use_huber_loss=False,
    loss_only_for_visible=False,
):
    """Loss function defined over sequence of flow predictions"""
    total_flow_loss = 0.0
    for j in range(len(flow_gt)):
        B, S, N, D = flow_gt[j].shape
        B, S2, N = valids[j].shape
        assert S == S2
        n_predictions = len(flow_preds[j])
        flow_loss = 0.0
        for i in range(n_predictions):
            i_weight = gamma ** (n_predictions - i - 1) # later refinement iterations get exponentially more weight
            flow_pred = flow_preds[j][i]
            if use_huber_loss:
                i_loss = huber_loss(flow_pred, flow_gt[j], delta=6.0)
            else:
                i_loss = (flow_pred - flow_gt[j]).abs() # B, S, N, 2
            i_loss = torch.mean(i_loss, dim=3) # B, S, N
            valid_ = valids[j].clone()
            if loss_only_for_visible:
                valid_ = valid_ * vis[j]
            flow_loss += i_weight * utils.basic.reduce_masked_mean(i_loss, valid_)
        flow_loss = flow_loss / n_predictions
        total_flow_loss += flow_loss
    return total_flow_loss / len(flow_gt)
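A shape sketch for sequence_loss (hypothetical tensors, not from the source): predictions are nested per-sequence, per-iteration lists, and reduce_masked_mean comes from this repo's utils.basic.

    B, S, N = 1, 8, 16
    flow_gt = [torch.randn(B, S, N, 2)]                         # one sequence
    valids = [torch.ones(B, S, N)]
    flow_preds = [[torch.randn(B, S, N, 2) for _ in range(4)]]  # 4 refinement iterations
    loss = sequence_loss(flow_preds, flow_gt, valids, gamma=0.8)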
def sequence_loss_dense(
    flow_preds,
    flow_gt,
    valids,
    vis=None,
    gamma=0.8,
    use_huber_loss=False,
    loss_only_for_visible=False,
):
    """Loss function defined over sequence of dense flow predictions"""
    total_flow_loss = 0.0
    for j in range(len(flow_gt)):
        B, S, D, H, W = flow_gt[j].shape
        B, S2, _, H, W = valids[j].shape
        assert S == S2
        n_predictions = len(flow_preds[j])
        flow_loss = 0.0
        for i in range(n_predictions):
            i_weight = gamma ** (n_predictions - i - 1)
            flow_pred = flow_preds[j][i] # B,S,2,H,W
            if use_huber_loss:
                i_loss = huber_loss(flow_pred, flow_gt[j], delta=6.0) # B,S,2,H,W
            else:
                i_loss = (flow_pred - flow_gt[j]).abs() # B,S,2,H,W
            i_loss_ = torch.mean(i_loss, dim=2) # B,S,H,W
            valid_ = valids[j].reshape(B,S,H,W)
            if loss_only_for_visible:
                valid_ = valid_ * vis[j].reshape(B,-1,H,W) # usually B,S,H,W, but maybe B,1,H,W
            flow_loss += i_weight * utils.basic.reduce_masked_mean(i_loss_, valid_, broadcast=True)
        flow_loss = flow_loss / n_predictions
        total_flow_loss += flow_loss
    return total_flow_loss / len(flow_gt)


def huber_loss(x, y, delta=1.0):
    """Calculate element-wise Huber loss between x and y"""
    diff = x - y
    abs_diff = diff.abs()
    flag = (abs_diff <= delta).float()
    return flag * 0.5 * diff**2 + (1 - flag) * delta * (abs_diff - 0.5 * delta)

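A small numeric check of huber_loss: inside delta the loss is quadratic (0.5*diff^2), outside it is linear.

    x = torch.tensor([0.5, 3.0])
    y = torch.zeros(2)
    print(huber_loss(x, y, delta=1.0))  # tensor([0.1250, 2.5000])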
def sequence_BCE_loss(vis_preds, vis_gts, valids=None, use_logits=False):
    total_bce_loss = 0.0
    # all_vis_preds = [torch.stack(vp) for vp in vis_preds]
    # all_vis_preds = torch.stack(all_vis_preds)
    # utils.basic.print_stats('all_vis_preds', all_vis_preds)
    for j in range(len(vis_preds)):
        n_predictions = len(vis_preds[j])
        bce_loss = 0.0
        for i in range(n_predictions):
            if use_logits:
                loss = F.binary_cross_entropy_with_logits(vis_preds[j][i], vis_gts[j], reduction='none')
            else:
                loss = F.binary_cross_entropy(vis_preds[j][i], vis_gts[j], reduction='none')
            if valids is None:
                bce_loss += loss.mean()
            else:
                bce_loss += (loss * valids[j]).mean()
        bce_loss = bce_loss / n_predictions
        total_bce_loss += bce_loss
    return total_bce_loss / len(vis_preds)


# def sequence_BCE_loss_dense(vis_preds, vis_gts):
#     total_bce_loss = 0.0
#     for j in range(len(vis_preds)):
#         n_predictions = len(vis_preds[j])
#         bce_loss = 0.0
#         for i in range(n_predictions):
#             vis_e = vis_preds[j][i]
#             vis_g = vis_gts[j]
#             print('vis_e', vis_e.shape, 'vis_g', vis_g.shape)
#             vis_loss = F.binary_cross_entropy(vis_e, vis_g)
#             bce_loss += vis_loss
#         bce_loss = bce_loss / n_predictions
#         total_bce_loss += bce_loss
#     return total_bce_loss / len(vis_preds)


def sequence_prob_loss(
    tracks: torch.Tensor,
    confidence: torch.Tensor,
    target_points: torch.Tensor,
    visibility: torch.Tensor,
    expected_dist_thresh: float = 12.0,
    use_logits=False,
):
    """Loss for classifying if a point is within pixel threshold of its target."""
    # Points with an error larger than 12 pixels are likely to be useless; marking
    # them as occluded will actually improve Jaccard metrics and give
    # qualitatively better results.
    total_logprob_loss = 0.0
    for j in range(len(tracks)):
        n_predictions = len(tracks[j])
        logprob_loss = 0.0
        for i in range(n_predictions):
            err = torch.sum((tracks[j][i].detach() - target_points[j]) ** 2, dim=-1)
            valid = (err <= expected_dist_thresh**2).float()
            if use_logits:
                loss = F.binary_cross_entropy_with_logits(confidence[j][i], valid, reduction="none")
            else:
                loss = F.binary_cross_entropy(confidence[j][i], valid, reduction="none")
            loss *= visibility[j]
            loss = torch.mean(loss, dim=[1, 2])
            logprob_loss += loss
        logprob_loss = logprob_loss / n_predictions
        total_logprob_loss += logprob_loss
    return total_logprob_loss / len(tracks)

def sequence_prob_loss_dense(
    tracks: torch.Tensor,
    confidence: torch.Tensor,
    target_points: torch.Tensor,
    visibility: torch.Tensor,
    expected_dist_thresh: float = 12.0,
    use_logits=False,
):
    """Loss for classifying if a point is within pixel threshold of its target."""
    # Points with an error larger than 12 pixels are likely to be useless; marking
    # them as occluded will actually improve Jaccard metrics and give
    # qualitatively better results.
    total_logprob_loss = 0.0
    for j in range(len(tracks)):
        n_predictions = len(tracks[j])
        logprob_loss = 0.0
        for i in range(n_predictions):
            err = torch.sum((tracks[j][i].detach() - target_points[j]) ** 2, dim=2)
            positive = (err <= expected_dist_thresh**2).float()
            if use_logits:
                loss = F.binary_cross_entropy_with_logits(confidence[j][i].squeeze(2), positive, reduction="none")
            else:
                loss = F.binary_cross_entropy(confidence[j][i].squeeze(2), positive, reduction="none")
            loss *= visibility[j].squeeze(2) # B,S,H,W
            loss = torch.mean(loss, dim=[1,2,3])
            logprob_loss += loss
        logprob_loss = logprob_loss / n_predictions
        total_logprob_loss += logprob_loss
    return total_logprob_loss / len(tracks)


def masked_mean(data, mask, dim):
    if mask is None:
        return data.mean(dim=dim, keepdim=True)
    mask = mask.float()
    mask_sum = torch.sum(mask, dim=dim, keepdim=True)
    mask_mean = torch.sum(data * mask, dim=dim, keepdim=True) / torch.clamp(
        mask_sum, min=1.0
    )
    return mask_mean


def masked_mean_var(data: torch.Tensor, mask: torch.Tensor, dim: List[int]):
    if mask is None:
        return data.mean(dim=dim, keepdim=True), data.var(dim=dim, keepdim=True)
    mask = mask.float()
    mask_sum = torch.sum(mask, dim=dim, keepdim=True)
    mask_mean = torch.sum(data * mask, dim=dim, keepdim=True) / torch.clamp(
        mask_sum, min=1.0
    )
    mask_var = torch.sum(
        mask * (data - mask_mean) ** 2, dim=dim, keepdim=True
    ) / torch.clamp(mask_sum, min=1.0)
    return mask_mean.squeeze(dim), mask_var.squeeze(dim)
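A sketch of masked_mean_var (hypothetical tensors), computing per-sample statistics over masked spatial locations:

    data = torch.randn(2, 1, 4, 4)
    mask = (torch.rand(2, 1, 4, 4) > 0.5).float()
    mean, var = masked_mean_var(data, mask, dim=[2, 3])  # masked dims squeezed out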
utils/misc.py ADDED
@@ -0,0 +1,100 @@
import torch
import numpy as np

def get_1d_sincos_pos_embed_from_grid(embed_dim, positions):
    assert embed_dim % 2 == 0
    omega = torch.arange(embed_dim // 2, dtype=torch.double)
    omega /= embed_dim / 2.0
    omega = 1.0 / 10000**omega # (D/2,)

    positions = positions.reshape(-1) # (M,)
    out = torch.einsum("m,d->md", positions, omega) # (M, D/2), outer product

    emb_sin = torch.sin(out) # (M, D/2)
    emb_cos = torch.cos(out) # (M, D/2)

    emb = torch.cat([emb_sin, emb_cos], dim=1) # (M, D)
    return emb[None].float()

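A usage sketch for get_1d_sincos_pos_embed_from_grid: positions can be any 1D tensor of (possibly fractional) coordinates; they are passed as double here to match the double-precision omega inside the function.

    positions = torch.arange(16, dtype=torch.double)
    pe = get_1d_sincos_pos_embed_from_grid(64, positions)  # (1, 16, 64), float32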
class SimplePool():
    def __init__(self, pool_size, version='pt', min_size=1):
        self.pool_size = pool_size
        self.version = version
        self.items = []
        self.min_size = min_size

        if not (version=='pt' or version=='np'):
            print('version = %s; please choose pt or np' % version) # the source dropped the % argument
            assert(False) # please choose pt or np

    def __len__(self):
        return len(self.items)

    def mean(self, min_size=None):
        if min_size is None:
            pool_size_thresh = self.min_size
        elif min_size=='half':
            pool_size_thresh = self.pool_size/2
        else:
            pool_size_thresh = min_size

        if self.version=='np':
            if len(self.items) >= pool_size_thresh:
                return np.sum(self.items)/float(len(self.items))
            else:
                return np.nan
        if self.version=='pt':
            if len(self.items) >= pool_size_thresh:
                return torch.sum(torch.stack(self.items))/float(len(self.items)) # stack first; torch.sum does not take a list
            else:
                return torch.tensor(np.nan) # torch.from_numpy requires an ndarray

    def sample(self, with_replacement=True):
        idx = np.random.randint(len(self.items))
        if with_replacement:
            return self.items[idx]
        else:
            return self.items.pop(idx)

    def fetch(self, num=None):
        if self.version=='pt':
            item_array = torch.stack(self.items)
        elif self.version=='np':
            item_array = np.stack(self.items)
        if num is not None:
            # there better be some items
            assert(len(self.items) >= num)

            # if there are not that many elements just return however many there are
            if len(self.items) < num:
                return item_array
            else:
                idxs = np.random.randint(len(self.items), size=num)
                return item_array[idxs]
        else:
            return item_array

    def is_full(self):
        full = len(self.items)==self.pool_size
        return full

    def empty(self):
        self.items = []

    def have_min_size(self):
        return len(self.items) >= self.min_size

    def update(self, items):
        for item in items:
            if len(self.items) < self.pool_size:
                # the pool is not full, so let's add this in
                self.items.append(item)
            else:
                # the pool is full
                # pop from the front
                self.items.pop(0)
                # add to the back
                self.items.append(item)
        return self.items
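A usage sketch of SimplePool as a running-average meter (hypothetical values, using the 'np' version with scalar losses):

    pool = SimplePool(pool_size=100, version='np')
    for step in range(250):
        loss = 1.0 / (step + 1)   # stand-in for a training loss
        pool.update([loss])
    print(pool.mean())            # mean over the last 100 entries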
utils/py.py ADDED
@@ -0,0 +1,755 @@
import glob, math
import numpy as np
# from scipy import misc
# from scipy import linalg
from PIL import Image
import io
import matplotlib.pyplot as plt
EPS = 1e-6


XMIN = -64.0 # right (neg is left)
XMAX = 64.0 # right
YMIN = -64.0 # down (neg is up)
YMAX = 64.0 # down
ZMIN = -64.0 # forward
ZMAX = 64.0 # forward

def print_stats(name, tensor):
    tensor = tensor.astype(np.float32)
    print('%s min = %.2f, mean = %.2f, max = %.2f' % (name, np.min(tensor), np.mean(tensor), np.max(tensor)), tensor.shape)

def reduce_masked_mean(x, mask, axis=None, keepdims=False):
    # x and mask are the same shape
    # axis can be an int or a list of axes
    # returns the mean of x over the given axes, weighted by mask
    prod = x*mask
    numer = np.sum(prod, axis=axis, keepdims=keepdims)
    denom = EPS+np.sum(mask, axis=axis, keepdims=keepdims)
    mean = numer/denom
    return mean

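# Usage sketch (illustration, not part of the original file): entries where
# the mask is zero are ignored in the statistic, e.g.
#   x = np.array([1.0, 2.0, 100.0], np.float32)
#   m = np.array([1.0, 1.0, 0.0], np.float32)
#   reduce_masked_mean(x, m)  # ~1.5; the masked-out 100.0 does not contribute
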
def reduce_masked_sum(x, mask, axis=None, keepdims=False):
    # x and mask are the same shape
    # axis can be an int or a list of axes
    # returns the sum of x over the given axes, restricted to where mask is nonzero
    prod = x*mask
    numer = np.sum(prod, axis=axis, keepdims=keepdims)
    return numer

def reduce_masked_median(x, mask, keep_batch=False):
    # x and mask are the same shape
    # returns the median of x at locations where mask is nonzero

    if not (x.shape == mask.shape):
        print('reduce_masked_median: these shapes should match:', x.shape, mask.shape)
        assert(False)
    # assert(x.shape == mask.shape)

    B = list(x.shape)[0]

    if keep_batch:
        x = np.reshape(x, [B, -1])
        mask = np.reshape(mask, [B, -1])
        meds = np.zeros([B], np.float32)
        for b in list(range(B)):
            xb = x[b]
            mb = mask[b]
            if np.sum(mb) > 0:
                xb = xb[mb > 0]
                meds[b] = np.median(xb)
            else:
                meds[b] = np.nan
        return meds
    else:
        x = np.reshape(x, [-1])
        mask = np.reshape(mask, [-1])
        if np.sum(mask) > 0:
            x = x[mask > 0]
            med = np.median(x)
        else:
            med = np.nan
        med = np.array([med], np.float32)
        return med

def get_nFiles(path):
    return len(glob.glob(path))

def get_file_list(path):
    return glob.glob(path)

def rotm2eul(R):
    # R is 3x3
    sy = math.sqrt(R[0,0] * R[0,0] + R[1,0] * R[1,0])
    if sy > 1e-6: # not singular
        x = math.atan2(R[2,1], R[2,2])
        y = math.atan2(-R[2,0], sy)
        z = math.atan2(R[1,0], R[0,0])
    else: # singular (gimbal lock)
        x = math.atan2(-R[1,2], R[1,1])
        y = math.atan2(-R[2,0], sy)
        z = 0
    return x, y, z

def rad2deg(rad):
    return rad*180.0/np.pi

def deg2rad(deg):
    return deg/180.0*np.pi

def eul2rotm(rx, ry, rz):
    # copy of matlab, but order of inputs is different
    # R = [ cy*cz  sy*sx*cz-sz*cx  sy*cx*cz+sz*sx
    #       cy*sz  sy*sx*sz+cz*cx  sy*cx*sz-cz*sx
    #       -sy    cy*sx           cy*cx ]
    sinz = np.sin(rz)
    siny = np.sin(ry)
    sinx = np.sin(rx)
    cosz = np.cos(rz)
    cosy = np.cos(ry)
    cosx = np.cos(rx)
    r11 = cosy*cosz
    r12 = sinx*siny*cosz - cosx*sinz
    r13 = cosx*siny*cosz + sinx*sinz
    r21 = cosy*sinz
    r22 = sinx*siny*sinz + cosx*cosz
    r23 = cosx*siny*sinz - sinx*cosz
    r31 = -siny
    r32 = sinx*cosy
    r33 = cosx*cosy
    r1 = np.stack([r11,r12,r13],axis=-1)
    r2 = np.stack([r21,r22,r23],axis=-1)
    r3 = np.stack([r31,r32,r33],axis=-1)
    r = np.stack([r1,r2,r3],axis=0)
    return r

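# Sanity-check sketch (illustration, not part of the original file): for
# non-singular angles, rotm2eul approximately inverts eul2rotm:
#   rx, ry, rz = 0.1, -0.2, 0.3
#   R = eul2rotm(rx, ry, rz)
#   rotm2eul(R)  # ~ (0.1, -0.2, 0.3)
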
def wrap2pi(rad_angle):
    # rad_angle can be any shape
    # puts the angle into the range [-pi, pi]
    return np.arctan2(np.sin(rad_angle), np.cos(rad_angle))

def rot2view(rx,ry,rz,x,y,z):
    # takes rot angles and 3d position as input
    # returns viewpoint angles as output
    # (all in radians)
    # it will perform strangely if z <= 0
    az = wrap2pi(ry - (-np.arctan2(z, x) - 1.5*np.pi))
    el = -wrap2pi(rx - (-np.arctan2(z, y) - 1.5*np.pi))
    th = -rz
    return az, el, th

def invAxB(a,b):
    """
    Compute the relative 3d transformation between a and b.

    Input:
    a -- first pose (homogeneous 4x4 matrix)
    b -- second pose (homogeneous 4x4 matrix)

    Output:
    Relative 3d transformation from a to b.
    """
    return np.dot(np.linalg.inv(a),b)

def merge_rt(r, t):
    # r is 3 x 3
    # t is 3 or maybe 3 x 1
    t = np.reshape(t, [3, 1])
    rt = np.concatenate((r,t), axis=1)
    # rt is 3 x 4
    br = np.reshape(np.array([0,0,0,1], np.float32), [1, 4])
    # br is 1 x 4
    rt = np.concatenate((rt, br), axis=0)
    # rt is 4 x 4
    return rt

def split_rt(rt):
    r = rt[:3,:3]
    t = rt[:3,3]
    r = np.reshape(r, [3, 3])
    t = np.reshape(t, [3, 1])
    return r, t

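# Roundtrip sketch (illustration, not part of the original file):
#   r = eul2rotm(0.0, 0.5, 0.0)
#   t = np.array([1.0, 2.0, 3.0], np.float32)
#   rt = merge_rt(r, t)    # 4 x 4
#   r2, t2 = split_rt(rt)  # recovers r (3 x 3) and t (3 x 1)
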
def split_intrinsics(K):
    # K is 3 x 4 or 4 x 4
    fx = K[0,0]
    fy = K[1,1]
    x0 = K[0,2]
    y0 = K[1,2]
    return fx, fy, x0, y0

def merge_intrinsics(fx, fy, x0, y0):
    # inputs are scalars
    K = np.eye(4)
    K[0,0] = fx
    K[1,1] = fy
    K[0,2] = x0
    K[1,2] = y0
    # K is shaped 4 x 4
    return K

def scale_intrinsics(K, sx, sy):
    fx, fy, x0, y0 = split_intrinsics(K)
    fx *= sx
    fy *= sy
    x0 *= sx
    y0 *= sy
    return merge_intrinsics(fx, fy, x0, y0)

# def meshgrid(H, W):
#     x = np.linspace(0, W-1, W)
#     y = np.linspace(0, H-1, H)
#     xv, yv = np.meshgrid(x, y)
#     return xv, yv

def compute_distance(transform):
    """
    Compute the norm of the translational component of a 4x4 homogeneous matrix.
    """
    return np.linalg.norm(transform[0:3,3])

def radian_l1_dist(e, g):
    # if our angles are in [0, 2pi] we can follow this stack overflow answer:
    # https://gamedev.stackexchange.com/questions/4467/comparing-angles-and-working-out-the-difference
    # wrap2pi brings the angles to [-pi, pi]; adding pi puts them in [0, 2pi]
    e = wrap2pi(e)+np.pi
    g = wrap2pi(g)+np.pi
    l = np.abs(np.pi - np.abs(np.abs(e-g) - np.pi))
    return l

def apply_pix_T_cam(pix_T_cam, xyz):
    fx, fy, x0, y0 = split_intrinsics(pix_T_cam)
    # xyz is shaped N x 3
    # returns xy, shaped N x 2
    N, C = xyz.shape
    x, y, z = np.split(xyz, 3, axis=-1)
    EPS = 1e-4
    z = np.clip(z, EPS, None)
    x = (x*fx)/(z)+x0
    y = (y*fy)/(z)+y0
    xy = np.concatenate([x, y], axis=-1)
    return xy

def apply_4x4(RT, XYZ):
    # RT is 4 x 4
    # XYZ is N x 3

    # put into homogeneous coords
    X, Y, Z = np.split(XYZ, 3, axis=1)
    ones = np.ones_like(X)
    XYZ1 = np.concatenate([X, Y, Z, ones], axis=1)
    # XYZ1 is N x 4

    XYZ1_t = np.transpose(XYZ1)
    # this is 4 x N

    XYZ2_t = np.dot(RT, XYZ1_t)
    # this is 4 x N

    XYZ2 = np.transpose(XYZ2_t)
    # this is N x 4

    XYZ2 = XYZ2[:,:3]
    # this is N x 3

    return XYZ2

def Ref2Mem(xyz, Z, Y, X):
    # xyz is N x 3, in ref coordinates
    # transforms ref coordinates into mem coordinates
    N, C = xyz.shape
    assert(C==3)
    mem_T_ref = get_mem_T_ref(Z, Y, X)
    xyz = apply_4x4(mem_T_ref, xyz)
    return xyz

# def Mem2Ref(xyz_mem, MH, MW, MD):
#     # xyz is B x N x 3, in mem coordinates
#     # transforms mem coordinates into ref coordinates
#     B, N, C = xyz_mem.get_shape().as_list()
#     ref_T_mem = get_ref_T_mem(B, MH, MW, MD)
#     xyz_ref = utils_geom.apply_4x4(ref_T_mem, xyz_mem)
#     return xyz_ref

def get_mem_T_ref(Z, Y, X):
    # sometimes we want the mat itself
    # note this is not a rigid transform

    # for interpretability, let's construct this in two steps...

    # translation
    center_T_ref = np.eye(4, dtype=np.float32)
    center_T_ref[0,3] = -XMIN
    center_T_ref[1,3] = -YMIN
    center_T_ref[2,3] = -ZMIN

    VOX_SIZE_X = (XMAX-XMIN)/float(X)
    VOX_SIZE_Y = (YMAX-YMIN)/float(Y)
    VOX_SIZE_Z = (ZMAX-ZMIN)/float(Z)

    # scaling
    mem_T_center = np.eye(4, dtype=np.float32)
    mem_T_center[0,0] = 1./VOX_SIZE_X
    mem_T_center[1,1] = 1./VOX_SIZE_Y
    mem_T_center[2,2] = 1./VOX_SIZE_Z

    mem_T_ref = np.dot(mem_T_center, center_T_ref)
    return mem_T_ref

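# Usage sketch (illustration, not part of the original file): with the default
# +/-64 bounds, the ref-frame origin lands at the center of a 128^3 grid:
#   mem_T_ref = get_mem_T_ref(Z=128, Y=128, X=128)
#   apply_4x4(mem_T_ref, np.zeros((1, 3), np.float32))  # -> [[64., 64., 64.]]
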
def safe_inverse(a):
    r, t = split_rt(a)
    t = np.reshape(t, [3, 1])
    r_transpose = r.T
    inv = np.concatenate([r_transpose, -np.matmul(r_transpose, t)], 1)
    bottom_row = a[3:4, :] # this is [0, 0, 0, 1]
    inv = np.concatenate([inv, bottom_row], 0)
    return inv

def get_ref_T_mem(Z, Y, X):
    mem_T_ref = get_mem_T_ref(Z, Y, X)
    # note safe_inverse is inapplicable here,
    # since the transform is nonrigid
    ref_T_mem = np.linalg.inv(mem_T_ref)
    return ref_T_mem

def voxelize_xyz(xyz_ref, Z, Y, X):
    # xyz_ref is N x 3
    xyz_mem = Ref2Mem(xyz_ref, Z, Y, X)
    # this is N x 3
    voxels = get_occupancy(xyz_mem, Z, Y, X)
    voxels = np.reshape(voxels, [Z, Y, X, 1])
    return voxels

def get_inbounds(xyz, Z, Y, X, already_mem=False):
    # xyz is N x 3

    if not already_mem:
        xyz = Ref2Mem(xyz, Z, Y, X)

    x_valid = np.logical_and(
        np.greater_equal(xyz[:,0], -0.5),
        np.less(xyz[:,0], float(X)-0.5))
    y_valid = np.logical_and(
        np.greater_equal(xyz[:,1], -0.5),
        np.less(xyz[:,1], float(Y)-0.5))
    z_valid = np.logical_and(
        np.greater_equal(xyz[:,2], -0.5),
        np.less(xyz[:,2], float(Z)-0.5))
    inbounds = np.logical_and(np.logical_and(x_valid, y_valid), z_valid)
    return inbounds

def sub2ind3d_zyx(depth, height, width, d, h, w):
    # same as sub2ind3d, but inputs in zyx order
    # when gathering/scattering with these inds, the tensor should be Z x Y x X
    return d*height*width + h*width + w

def sub2ind3d_yxz(height, width, depth, h, w, d):
    return h*width*depth + w*depth + d

# def ind2sub(height, width, ind):
#     # int input
#     y = int(ind / height)
#     x = ind % height
#     return y, x

def get_occupancy(xyz_mem, Z, Y, X):
    # xyz_mem is N x 3
    # we want to fill a voxel tensor with 1's at these inds

    inbounds = get_inbounds(xyz_mem, Z, Y, X, already_mem=True)
    inds = np.where(inbounds)

    xyz_mem = np.reshape(xyz_mem[inds], [-1, 3])
    # xyz_mem is N x 3

    # this is more accurate than a cast/floor, but runs into issues when Y==0
    xyz_mem = np.round(xyz_mem).astype(np.int32)
    x = xyz_mem[:,0]
    y = xyz_mem[:,1]
    z = xyz_mem[:,2]

    voxels = np.zeros([Z, Y, X], np.float32)
    voxels[z, y, x] = 1.0

    return voxels

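# Usage sketch (illustration, not part of the original file): voxelize a point
# cloud into the default +/-64 bounds at 128^3 resolution:
#   xyz = np.random.uniform(-60, 60, size=(1000, 3)).astype(np.float32)
#   vox = voxelize_xyz(xyz, Z=128, Y=128, X=128)  # (128, 128, 128, 1), 0/1 occupancy
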
def pixels2camera(x,y,z,fx,fy,x0,y0):
    # x and y are locations in pixel coordinates, z is a depth image in meters
    # their shapes are H x W
    # fx, fy, x0, y0 are scalar camera intrinsics
    # returns xyz, shaped H*W x 3

    H, W = z.shape

    fx = np.reshape(fx, [1,1])
    fy = np.reshape(fy, [1,1])
    x0 = np.reshape(x0, [1,1])
    y0 = np.reshape(y0, [1,1])

    # unproject
    x = ((z+EPS)/fx)*(x-x0)
    y = ((z+EPS)/fy)*(y-y0)

    x = np.reshape(x, [-1])
    y = np.reshape(y, [-1])
    z = np.reshape(z, [-1])
    xyz = np.stack([x,y,z], axis=1)
    return xyz

def depth2pointcloud(z, pix_T_cam):
    H = z.shape[0]
    W = z.shape[1]
    y, x = meshgrid2d(H, W)
    z = np.reshape(z, [H, W])

    fx, fy, x0, y0 = split_intrinsics(pix_T_cam)
    xyz = pixels2camera(x, y, z, fx, fy, x0, y0)
    return xyz

def meshgrid2d(Y, X):
    grid_y = np.linspace(0.0, Y-1, Y)
    grid_y = np.reshape(grid_y, [Y, 1])
    grid_y = np.tile(grid_y, [1, X])

    grid_x = np.linspace(0.0, X-1, X)
    grid_x = np.reshape(grid_x, [1, X])
    grid_x = np.tile(grid_x, [Y, 1])

    # outputs are Y x X
    return grid_y, grid_x

def gridcloud3d(Y, X, Z):
    x_ = np.linspace(0, X-1, X)
    y_ = np.linspace(0, Y-1, Y)
    z_ = np.linspace(0, Z-1, Z)
    y, x, z = np.meshgrid(y_, x_, z_, indexing='ij')
    x = np.reshape(x, [-1])
    y = np.reshape(y, [-1])
    z = np.reshape(z, [-1])
    xyz = np.stack([x,y,z], axis=1).astype(np.float32)
    return xyz

def gridcloud2d(Y, X):
    x_ = np.linspace(0, X-1, X)
    y_ = np.linspace(0, Y-1, Y)
    y, x = np.meshgrid(y_, x_, indexing='ij')
    x = np.reshape(x, [-1])
    y = np.reshape(y, [-1])
    xy = np.stack([x,y], axis=1).astype(np.float32)
    return xy

def normalize(im):
    # rescale to [0,1]; EPS guards against an all-constant input
    im = im - np.min(im)
    im = im / (np.max(im) + EPS)
    return im

def convert_occ_to_height(occ):
    Z, Y, X, C = occ.shape
    assert(C==1)

    height = np.linspace(float(Y), 1.0, Y)
    height = np.reshape(height, [1, Y, 1, 1])
    height = np.max(occ*height, axis=1)/float(Y)
    height = np.reshape(height, [Z, X, C])
    return height

def create_depth_image(xy, Z, H, W):

    # turn the xy coordinates into image inds
    xy = np.round(xy)

    # lidar reports a sphere of measurements
    # only use the inds that are within the image bounds
    # also, only use forward-pointing depths (Z > 0)
    valid = (xy[:,0] < W-1) & (xy[:,1] < H-1) & (xy[:,0] >= 0) & (xy[:,1] >= 0) & (Z[:] > 0)

    # gather these up
    xy = xy[valid]
    Z = Z[valid]

    # flatten (row, col) into raster-order indices
    inds = (xy[:,1]*W + xy[:,0]).astype(np.int32)
    depth = np.zeros((H*W), np.float32)

    for (index, replacement) in zip(inds, Z):
        depth[index] = replacement
    depth[np.where(depth == 0.0)] = 70.0
    depth = np.reshape(depth, [H, W])

    return depth

def vis_depth(depth, maxdepth=80.0, log_vis=True):
    depth[depth<=0.0] = maxdepth
    if log_vis:
        depth = np.log(depth)
        depth = np.clip(depth, 0, np.log(maxdepth))
        depth = depth/np.log(maxdepth) # normalize to [0,1] before the uint8 cast
    else:
        depth = np.clip(depth, 0, maxdepth)
        depth = depth/maxdepth # normalize to [0,1] before the uint8 cast
    depth = (depth*255.0).astype(np.uint8)
    return depth

def preprocess_color(x):
    return x.astype(np.float32) * 1./255 - 0.5

def convert_box_to_ref_T_obj(boxes):
    shape = boxes.shape
    boxes = boxes.reshape(-1,9)
    rots = [eul2rotm(rx,ry,rz)
            for rx,ry,rz in boxes[:,6:]]
    rots = np.stack(rots,axis=0)
    trans = boxes[:,:3]
    ref_T_objs = [merge_rt(rot,tran)
                  for rot,tran in zip(rots,trans)]
    ref_T_objs = np.stack(ref_T_objs,axis=0)
    ref_T_objs = ref_T_objs.reshape(shape[:-1]+(4,4))
    ref_T_objs = ref_T_objs.astype(np.float32)
    return ref_T_objs

def get_rot_from_delta(delta, yaw_only=False):
    dx = delta[:,0]
    dy = delta[:,1]
    dz = delta[:,2]

    bot_hyp = np.sqrt(dz**2 + dx**2)
    # top_hyp = np.sqrt(bot_hyp**2 + dy**2)

    pitch = -np.arctan2(dy, bot_hyp)
    yaw = np.arctan2(dz, dx)

    if yaw_only:
        rot = [eul2rotm(0,y,0) for y in yaw]
    else:
        rot = [eul2rotm(0,y,p) for (p,y) in zip(pitch,yaw)]

    rot = np.stack(rot)
    # rot is B x 3 x 3
    return rot

def im2col(im, psize):
    n_channels = 1 if len(im.shape) == 2 else im.shape[0]
    (n_channels, rows, cols) = (1,) * (3 - len(im.shape)) + im.shape

    im_pad = np.zeros((n_channels,
                       int(math.ceil(1.0 * rows / psize) * psize),
                       int(math.ceil(1.0 * cols / psize) * psize)))
    im_pad[:, 0:rows, 0:cols] = im

    final = np.zeros((im_pad.shape[1], im_pad.shape[2], n_channels,
                      psize, psize))
    for c in np.arange(n_channels):
        for x in np.arange(psize):
            for y in np.arange(psize):
                im_shift = np.vstack(
                    (im_pad[c, x:], im_pad[c, :x]))
                im_shift = np.column_stack(
                    (im_shift[:, y:], im_shift[:, :y]))
                final[x::psize, y::psize, c] = np.swapaxes(
                    im_shift.reshape(int(im_pad.shape[1] / psize), psize,
                                     int(im_pad.shape[2] / psize), psize), 1, 2)

    return np.squeeze(final[0:rows - psize + 1, 0:cols - psize + 1])

def filter_discontinuities(depth, filter_size=9, thresh=10):
    H, W = list(depth.shape)

    # Ensure that filter sizes are okay
    assert filter_size % 2 == 1, "Can only use odd filter sizes."

    # Compute discontinuities
    offset = int((filter_size - 1) / 2)
    patches = 1.0 * im2col(depth, filter_size)
    mids = patches[:, :, offset, offset]
    mins = np.min(patches, axis=(2, 3))
    maxes = np.max(patches, axis=(2, 3))

    discont = np.maximum(np.abs(mins - mids),
                         np.abs(maxes - mids))
    mark = discont > thresh

    # Account for offsets
    final_mark = np.zeros((H, W), dtype=np.uint16)
    final_mark[offset:offset + mark.shape[0],
               offset:offset + mark.shape[1]] = mark

    return depth * (1 - final_mark)

def argmax2d(tensor):
    Y, X = list(tensor.shape)
    # flatten the tensor along the height and width axes
    flat_tensor = tensor.reshape(-1)
    # argmax of the flat tensor
    argmax = np.argmax(flat_tensor)

    # convert the index into 2d coordinates
    argmax_y = argmax // X # row
    argmax_x = argmax % X # col

    return argmax_y, argmax_x

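# Usage sketch (illustration, not part of the original file):
#   heat = np.zeros((4, 5), np.float32); heat[2, 3] = 1.0
#   argmax2d(heat)  # -> (2, 3)
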
def plot_traj_3d(traj):
    # traj is S x 3

    S, C = list(traj.shape)
    assert(C==3)

    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')

    colors = [plt.cm.RdYlBu(i) for i in np.linspace(0,1,S)]

    xs = traj[:,0]
    ys = -traj[:,1]
    zs = traj[:,2]

    ax.scatter(xs, zs, ys, s=30, c=colors, marker='o', alpha=1.0, edgecolors=(0,0,0))

    ax.set_xlabel('X')
    ax.set_ylabel('Z')
    ax.set_zlabel('Y')

    ax.set_xlim(0,1)
    ax.set_ylim(0,1) # this is really Z
    ax.set_zlim(-1,0) # this is really Y

    # render the figure to an image array
    buf = io.BytesIO()
    plt.savefig(buf, format='png')
    buf.seek(0)
    image = np.array(Image.open(buf)) # H x W x 4
    image = image[:,:,:3]

    plt.close()
    return image

def camera2pixels(xyz, pix_T_cam):
    # xyz is shaped N x 3
    # returns xy, shaped N x 2

    fx, fy, x0, y0 = split_intrinsics(pix_T_cam)
    x, y, z = xyz[:,0], xyz[:,1], xyz[:,2]

    EPS = 1e-4
    z = np.clip(z, EPS, None)
    x = (x*fx)/z + x0
    y = (y*fy)/z + y0
    xy = np.stack([x, y], axis=-1)
    return xy

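# Roundtrip sketch (illustration, not part of the original file): projecting
# an unprojected depth map lands back on the pixel grid (up to the EPS terms):
#   K = merge_intrinsics(500.0, 500.0, 320.0, 240.0)  # fx, fy, x0, y0
#   xyz = depth2pointcloud(depth, K)  # depth is H x W, in meters
#   xy = camera2pixels(xyz, K)        # ~ the original pixel coordinates
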
def make_colorwheel():
    """
    Generates a color wheel for optical flow visualization as presented in:
    Baker et al. "A Database and Evaluation Methodology for Optical Flow" (ICCV, 2007)
    URL: http://vision.middlebury.edu/flow/flowEval-iccv07.pdf

    Code follows the original C++ source code of Daniel Scharstein
    and the Matlab source code of Deqing Sun.

    Returns:
        np.ndarray: Color wheel
    """

    RY = 15
    YG = 6
    GC = 4
    CB = 11
    BM = 13
    MR = 6

    ncols = RY + YG + GC + CB + BM + MR
    colorwheel = np.zeros((ncols, 3))
    col = 0

    # RY
    colorwheel[0:RY, 0] = 255
    colorwheel[0:RY, 1] = np.floor(255*np.arange(0,RY)/RY)
    col = col+RY
    # YG
    colorwheel[col:col+YG, 0] = 255 - np.floor(255*np.arange(0,YG)/YG)
    colorwheel[col:col+YG, 1] = 255
    col = col+YG
    # GC
    colorwheel[col:col+GC, 1] = 255
    colorwheel[col:col+GC, 2] = np.floor(255*np.arange(0,GC)/GC)
    col = col+GC
    # CB
    colorwheel[col:col+CB, 1] = 255 - np.floor(255*np.arange(CB)/CB)
    colorwheel[col:col+CB, 2] = 255
    col = col+CB
    # BM
    colorwheel[col:col+BM, 2] = 255
    colorwheel[col:col+BM, 0] = np.floor(255*np.arange(0,BM)/BM)
    col = col+BM
    # MR
    colorwheel[col:col+MR, 2] = 255 - np.floor(255*np.arange(MR)/MR)
    colorwheel[col:col+MR, 0] = 255
    return colorwheel

def flow_uv_to_colors(u, v, convert_to_bgr=False):
    """
    Applies the flow color wheel to (possibly clipped) flow components u and v.

    Follows the C++ source code of Daniel Scharstein
    and the Matlab source code of Deqing Sun.

    Args:
        u (np.ndarray): Input horizontal flow of shape [H,W]
        v (np.ndarray): Input vertical flow of shape [H,W]
        convert_to_bgr (bool, optional): Convert output image to BGR. Defaults to False.

    Returns:
        np.ndarray: Flow visualization image of shape [H,W,3]
    """
    flow_image = np.zeros((u.shape[0], u.shape[1], 3), np.uint8)
    colorwheel = make_colorwheel() # shape [55x3]
    ncols = colorwheel.shape[0]
    rad = np.sqrt(np.square(u) + np.square(v))
    a = np.arctan2(-v, -u)/np.pi
    fk = (a+1) / 2 * (ncols-1)
    k0 = np.floor(fk).astype(np.int32)
    k1 = k0 + 1
    k1[k1 == ncols] = 0
    f = fk - k0
    for i in range(colorwheel.shape[1]):
        tmp = colorwheel[:,i]
        col0 = tmp[k0] / 255.0
        col1 = tmp[k1] / 255.0
        col = (1-f)*col0 + f*col1
        idx = (rad <= 1)
        col[idx] = 1 - rad[idx] * (1-col[idx])
        col[~idx] = col[~idx] * 0.75 # out of range
        # Note the 2-i => BGR instead of RGB
        ch_idx = 2-i if convert_to_bgr else i
        flow_image[:,:,ch_idx] = np.floor(255 * col)
    return flow_image

def flow_to_image(flow_uv, clip_flow=None, convert_to_bgr=False):
    """
    Expects a two-channel flow image of shape [H,W,2].

    Args:
        flow_uv (np.ndarray): Flow UV image of shape [H,W,2]
        clip_flow (float, optional): Clip maximum of flow values. Defaults to None.
        convert_to_bgr (bool, optional): Convert output image to BGR. Defaults to False.

    Returns:
        np.ndarray: Flow visualization image of shape [H,W,3]
    """
    assert flow_uv.ndim == 3, 'input flow must have three dimensions'
    assert flow_uv.shape[2] == 2, 'input flow must have shape [H,W,2]'
    if clip_flow is not None:
        flow_uv = np.clip(flow_uv, -clip_flow, clip_flow) / clip_flow

    u = flow_uv[:,:,0]
    v = flow_uv[:,:,1]
    rad = np.sqrt(np.square(u) + np.square(v))
    rad_max = np.max(rad)
    epsilon = 1e-5
    u = u / (rad_max + epsilon)
    v = v / (rad_max + epsilon)
    return flow_uv_to_colors(u, v, convert_to_bgr)
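# Usage sketch (illustration, not part of the original file): visualize a
# synthetic flow field as an RGB image:
#   flow = np.zeros((64, 64, 2), np.float32)
#   flow[:, :, 0] = 2.0        # uniform rightward motion
#   rgb = flow_to_image(flow)  # (64, 64, 3) uint8
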
utils/samp.py
ADDED
@@ -0,0 +1,213 @@
import torch
import utils.basic
import torch.nn.functional as F

def bilinear_sampler(input, coords, align_corners=True, padding_mode="border"):
    r"""Sample a tensor using bilinear interpolation

    `bilinear_sampler(input, coords)` samples a tensor :attr:`input` at
    coordinates :attr:`coords` using bilinear interpolation. It is the same
    as `torch.nn.functional.grid_sample()` but with a different coordinate
    convention.

    The input tensor is assumed to be of shape :math:`(B, C, H, W)`, where
    :math:`B` is the batch size, :math:`C` is the number of channels,
    :math:`H` is the height of the image, and :math:`W` is the width of the
    image. The tensor :attr:`coords` of shape :math:`(B, H_o, W_o, 2)` is
    interpreted as an array of 2D point coordinates :math:`(x_i,y_i)`.

    Alternatively, the input tensor can be of size :math:`(B, C, T, H, W)`,
    in which case sample points are triplets :math:`(t_i,x_i,y_i)`. Note
    that in this case the order of the components is slightly different
    from `grid_sample()`, which would expect :math:`(x_i,y_i,t_i)`.

    If `align_corners` is `True`, the coordinate :math:`x` is assumed to be
    in the range :math:`[0,W-1]`, with 0 corresponding to the center of the
    left-most image pixel and :math:`W-1` to the center of the right-most
    pixel.

    If `align_corners` is `False`, the coordinate :math:`x` is assumed to
    be in the range :math:`[0,W]`, with 0 corresponding to the left edge of
    the left-most pixel and :math:`W` to the right edge of the right-most
    pixel.

    Similar conventions apply to the :math:`y` coordinate for the ranges
    :math:`[0,H-1]` and :math:`[0,H]`, and to :math:`t` for the ranges
    :math:`[0,T-1]` and :math:`[0,T]`.

    Args:
        input (Tensor): batch of input images.
        coords (Tensor): batch of coordinates.
        align_corners (bool, optional): Coordinate convention. Defaults to `True`.
        padding_mode (str, optional): Padding mode. Defaults to `"border"`.

    Returns:
        Tensor: sampled points.
    """

    sizes = input.shape[2:]

    assert len(sizes) in [2, 3]

    if len(sizes) == 3:
        # t x y -> x y t to match dimensions T H W in grid_sample
        coords = coords[..., [1, 2, 0]]

    # map coordinates into grid_sample's [-1, 1] convention
    if align_corners:
        coords = coords * torch.tensor(
            [2 / max(size - 1, 1) for size in reversed(sizes)], device=coords.device
        )
    else:
        coords = coords * torch.tensor(
            [2 / size for size in reversed(sizes)], device=coords.device
        )

    coords -= 1

    return F.grid_sample(
        input, coords, align_corners=align_corners, padding_mode=padding_mode
    )


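# Usage sketch (illustration, not part of the original file): grid_sample
# semantics, but with pixel coordinates instead of [-1, 1] coordinates:
#   input = torch.randn(1, 16, 32, 32)
#   coords = torch.rand(1, 10, 10, 2) * 31.0  # (x, y) in [0, 31]
#   out = bilinear_sampler(input, coords)     # -> (1, 16, 10, 10)
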
def sample_features4d(input, coords):
    r"""Sample spatial features

    `sample_features4d(input, coords)` samples the spatial features
    :attr:`input` represented by a 4D tensor :math:`(B, C, H, W)`.

    The field is sampled at coordinates :attr:`coords` using bilinear
    interpolation. :attr:`coords` is assumed to be of shape :math:`(B, R,
    2)`, where each sample has the format :math:`(x_i, y_i)`. This uses the
    same convention as :func:`bilinear_sampler` with `align_corners=True`.

    The output tensor has one feature per point, and has shape :math:`(B,
    R, C)`.

    Args:
        input (Tensor): spatial features.
        coords (Tensor): points.

    Returns:
        Tensor: sampled features.
    """

    B, _, _, _ = input.shape

    # B R 2 -> B R 1 2
    coords = coords.unsqueeze(2)

    # B C R 1
    feats = bilinear_sampler(input, coords)

    return feats.permute(0, 2, 1, 3).view(
        B, -1, feats.shape[1] * feats.shape[3]
    )  # B C R 1 -> B R C


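# Usage sketch (illustration, not part of the original file):
#   fmap = torch.randn(2, 64, 48, 64)     # B C H W
#   pts = torch.rand(2, 100, 2) * 47.0    # (x, y) in pixel coords
#   feats = sample_features4d(fmap, pts)  # -> (2, 100, 64)
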
def sample_features5d(input, coords):
    r"""Sample spatio-temporal features

    `sample_features5d(input, coords)` works in the same way as
    :func:`sample_features4d` but for spatio-temporal features and points:
    :attr:`input` is a 5D tensor :math:`(B, T, C, H, W)`, :attr:`coords` is
    a :math:`(B, R1, R2, 3)` tensor of spatio-temporal points :math:`(t_i,
    x_i, y_i)`. The output tensor has shape :math:`(B, R1, R2, C)`.

    Args:
        input (Tensor): spatio-temporal features.
        coords (Tensor): spatio-temporal points.

    Returns:
        Tensor: sampled features.
    """

    B, T, _, _, _ = input.shape

    # B T C H W -> B C T H W
    input = input.permute(0, 2, 1, 3, 4)

    # B R1 R2 3 -> B R1 R2 1 3
    coords = coords.unsqueeze(3)

    # B C R1 R2 1
    feats = bilinear_sampler(input, coords)

    return feats.permute(0, 2, 3, 1, 4).view(
        B, feats.shape[2], feats.shape[3], feats.shape[1]
    )  # B C R1 R2 1 -> B R1 R2 C


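# Usage sketch (illustration, not part of the original file):
#   fmaps = torch.randn(1, 8, 32, 48, 64)  # B T C H W
#   pts = torch.zeros(1, 8, 100, 3)        # (t, x, y) per point
#   feats = sample_features5d(fmaps, pts)  # -> (1, 8, 100, 32)
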
def bilinear_sample2d(im, x, y, return_inbounds=False):
    # x and y are each B, N
    # output is B, C, N
    B, C, H, W = list(im.shape)
    N = list(x.shape)[1]

    x = x.float()
    y = y.float()
    H_f = torch.tensor(H, dtype=torch.float32)
    W_f = torch.tensor(W, dtype=torch.float32)

    # inbound_mask = (x>-0.5).float()*(y>-0.5).float()*(x<W_f+0.5).float()*(y<H_f+0.5).float()

    max_y = (H_f - 1).int()
    max_x = (W_f - 1).int()

    x0 = torch.floor(x).int()
    x1 = x0 + 1
    y0 = torch.floor(y).int()
    y1 = y0 + 1

    x0_clip = torch.clamp(x0, 0, max_x)
    x1_clip = torch.clamp(x1, 0, max_x)
    y0_clip = torch.clamp(y0, 0, max_y)
    y1_clip = torch.clamp(y1, 0, max_y)
    dim2 = W
    dim1 = W * H

    base = torch.arange(0, B, dtype=torch.int64, device=x.device)*dim1
    base = torch.reshape(base, [B, 1]).repeat([1, N])

    base_y0 = base + y0_clip * dim2
    base_y1 = base + y1_clip * dim2

    idx_y0_x0 = base_y0 + x0_clip
    idx_y0_x1 = base_y0 + x1_clip
    idx_y1_x0 = base_y1 + x0_clip
    idx_y1_x1 = base_y1 + x1_clip

    # use the indices to lookup pixels in the flat image
    # im is B x C x H x W
    # move C out to last dim
    im_flat = (im.permute(0, 2, 3, 1)).reshape(B*H*W, C)
    i_y0_x0 = im_flat[idx_y0_x0.long()]
    i_y0_x1 = im_flat[idx_y0_x1.long()]
    i_y1_x0 = im_flat[idx_y1_x0.long()]
    i_y1_x1 = im_flat[idx_y1_x1.long()]

    # finally, calculate the interpolated values
    x0_f = x0.float()
    x1_f = x1.float()
    y0_f = y0.float()
    y1_f = y1.float()

    w_y0_x0 = ((x1_f - x) * (y1_f - y)).unsqueeze(2)
    w_y0_x1 = ((x - x0_f) * (y1_f - y)).unsqueeze(2)
    w_y1_x0 = ((x1_f - x) * (y - y0_f)).unsqueeze(2)
    w_y1_x1 = ((x - x0_f) * (y - y0_f)).unsqueeze(2)

    output = w_y0_x0 * i_y0_x0 + w_y0_x1 * i_y0_x1 + \
             w_y1_x0 * i_y1_x0 + w_y1_x1 * i_y1_x1
    # output is B*N x C
    output = output.view(B, -1, C)
    output = output.permute(0, 2, 1)
    # output is B x C x N

    if return_inbounds:
        x_valid = (x > -0.5).byte() & (x < float(W_f - 0.5)).byte()
        y_valid = (y > -0.5).byte() & (y < float(H_f - 0.5)).byte()
        inbounds = (x_valid & y_valid).float()
        inbounds = inbounds.reshape(B, N) # something seems wrong here for B>1; i'm getting an error here (or downstream if i put -1)
        return output, inbounds

    return output # B, C, N
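# Usage sketch (illustration, not part of the original file):
#   im = torch.randn(2, 3, 100, 100)
#   x = torch.rand(2, 50) * 99.0
#   y = torch.rand(2, 50) * 99.0
#   samples = bilinear_sample2d(im, x, y)  # -> (2, 3, 50)
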
utils/saveload.py
ADDED
@@ -0,0 +1,65 @@
import pathlib
import os
import torch

def save(ckpt_dir, module, optimizer, scheduler, global_step, keep_latest=2, model_name='model'):
    pathlib.Path(ckpt_dir).mkdir(exist_ok=True, parents=True)
    # drop old checkpoints so that at most keep_latest remain after this save
    prev_ckpts = list(pathlib.Path(ckpt_dir).glob('%s-*pth' % model_name))
    prev_ckpts.sort(key=lambda p: p.stat().st_mtime, reverse=True)
    if len(prev_ckpts) > keep_latest-1:
        for f in prev_ckpts[keep_latest-1:]:
            f.unlink()
    save_path = '%s/%s-%09d.pth' % (ckpt_dir, model_name, global_step)
    save_dict = {
        "model": module.state_dict(),
        "optimizer": optimizer.state_dict(),
        "global_step": global_step,
    }
    if scheduler is not None:
        save_dict['scheduler'] = scheduler.state_dict()
    print(f"saving {save_path}")
    torch.save(save_dict, save_path)
    return False

def load(fabric, ckpt_path, model, optimizer=None, scheduler=None, model_ema=None, step=0, model_name='model', ignore_load=None, strict=True, verbose=True, weights_only=False):
    if verbose:
        print('reading ckpt from %s' % ckpt_path)
    if not os.path.exists(ckpt_path):
        print('...there is no full checkpoint in %s' % ckpt_path)
        print('-- note this function no longer appends "saved_checkpoints/" before the ckpt_path --')
        assert(False)
    else:
        if os.path.isfile(ckpt_path):
            # ckpt_path points directly at a checkpoint file
            path = ckpt_path
            print('...found checkpoint %s' % (path))
        else:
            # ckpt_path is a directory; take the most recent checkpoint in it
            prev_ckpts = list(pathlib.Path(ckpt_path).glob('%s-*pth' % model_name))
            prev_ckpts.sort(key=lambda p: p.stat().st_mtime, reverse=True)
            if len(prev_ckpts):
                path = prev_ckpts[0]
                # e.g., './checkpoints/2Ai4_5e-4_base18_1539/model-000050000.pth'
                # OR ./whatever.pth
                step = int(str(path).split('-')[-1].split('.')[0])
                if verbose:
                    print('...found checkpoint %s; (parsed step %d from path)' % (path, step))
            else:
                print('...there is no full checkpoint here!')
                return 0
        if fabric is not None:
            checkpoint = fabric.load(path)
        else:
            checkpoint = torch.load(path, weights_only=weights_only)
        if optimizer is not None:
            optimizer.load_state_dict(checkpoint['optimizer'])
        if scheduler is not None:
            scheduler.load_state_dict(checkpoint['scheduler'])
        assert ignore_load is None # not ready yet
        if 'model' in checkpoint:
            state_dict = checkpoint['model']
        else:
            state_dict = checkpoint
        model.load_state_dict(state_dict, strict=strict)
    return step
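# Usage sketch (illustration, not part of the original file): checkpoints are
# written as <ckpt_dir>/<model_name>-<step>.pth and reloaded by directory:
#   save('./checkpoints/run1', model, optimizer, scheduler=None, global_step=1000)
#   step = load(None, './checkpoints/run1', model, optimizer=optimizer)  # -> 1000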