# ------------------------------------------------------------------------ # RF-DETR # Copyright (c) 2025 Roboflow. All Rights Reserved. # Licensed under the Apache License, Version 2.0 [see LICENSE for details] # ------------------------------------------------------------------------ # Modified from LW-DETR (https://github.com/Atten4Vis/LW-DETR) # Copyright (c) 2024 Baidu. All Rights Reserved. # ------------------------------------------------------------------------ # Modified from Conditional DETR (https://github.com/Atten4Vis/ConditionalDETR) # Copyright (c) 2021 Microsoft. All Rights Reserved. # ------------------------------------------------------------------------ # Copied from DETR (https://github.com/facebookresearch/detr) # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. # ------------------------------------------------------------------------ """ Transforms and data augmentation for both image + bbox. """ import random import PIL import numpy as np try: from collections.abc import Sequence except Exception: from collections import Sequence from numbers import Number import torch import torchvision.transforms as T # from detectron2.data import transforms as DT import torchvision.transforms.functional as F from rfdetr.util.box_ops import box_xyxy_to_cxcywh from rfdetr.util.misc import interpolate def crop(image, target, region): cropped_image = F.crop(image, *region) target = target.copy() i, j, h, w = region # should we do something wrt the original size? target["size"] = torch.tensor([h, w]) fields = ["labels", "area", "iscrowd"] if "boxes" in target: boxes = target["boxes"] max_size = torch.as_tensor([w, h], dtype=torch.float32) cropped_boxes = boxes - torch.as_tensor([j, i, j, i]) cropped_boxes = torch.min(cropped_boxes.reshape(-1, 2, 2), max_size) cropped_boxes = cropped_boxes.clamp(min=0) area = (cropped_boxes[:, 1, :] - cropped_boxes[:, 0, :]).prod(dim=1) target["boxes"] = cropped_boxes.reshape(-1, 4) target["area"] = area fields.append("boxes") if "masks" in target: # FIXME should we update the area here if there are no boxes? target['masks'] = target['masks'][:, i:i + h, j:j + w] fields.append("masks") # remove elements for which the boxes or masks that have zero area if "boxes" in target or "masks" in target: # favor boxes selection when defining which elements to keep # this is compatible with previous implementation if "boxes" in target: cropped_boxes = target['boxes'].reshape(-1, 2, 2) keep = torch.all(cropped_boxes[:, 1, :] > cropped_boxes[:, 0, :], dim=1) else: keep = target['masks'].flatten(1).any(1) for field in fields: target[field] = target[field][keep] return cropped_image, target def hflip(image, target): flipped_image = F.hflip(image) w, h = image.size target = target.copy() if "boxes" in target: boxes = target["boxes"] boxes = boxes[:, [2, 1, 0, 3]] * torch.as_tensor([-1, 1, -1, 1]) + torch.as_tensor([w, 0, w, 0]) target["boxes"] = boxes if "masks" in target: target['masks'] = target['masks'].flip(-1) return flipped_image, target def resize(image, target, size, max_size=None): # size can be min_size (scalar) or (w, h) tuple def get_size_with_aspect_ratio(image_size, size, max_size=None): w, h = image_size if max_size is not None: min_original_size = float(min((w, h))) max_original_size = float(max((w, h))) if max_original_size / min_original_size * size > max_size: size = int(round(max_size * min_original_size / max_original_size)) if (w <= h and w == size) or (h <= w and h == size): return (h, w) if w < h: ow = size oh = int(size * h / w) else: oh = size ow = int(size * w / h) return (oh, ow) def get_size(image_size, size, max_size=None): if isinstance(size, (list, tuple)): return size[::-1] else: return get_size_with_aspect_ratio(image_size, size, max_size) size = get_size(image.size, size, max_size) rescaled_image = F.resize(image, size) if target is None: return rescaled_image, None ratios = tuple( float(s) / float(s_orig) for s, s_orig in zip(rescaled_image.size, image.size)) ratio_width, ratio_height = ratios target = target.copy() if "boxes" in target: boxes = target["boxes"] scaled_boxes = boxes * torch.as_tensor( [ratio_width, ratio_height, ratio_width, ratio_height]) target["boxes"] = scaled_boxes if "area" in target: area = target["area"] scaled_area = area * (ratio_width * ratio_height) target["area"] = scaled_area h, w = size target["size"] = torch.tensor([h, w]) if "masks" in target: target['masks'] = interpolate( target['masks'][:, None].float(), size, mode="nearest")[:, 0] > 0.5 return rescaled_image, target def pad(image, target, padding): # assumes that we only pad on the bottom right corners padded_image = F.pad(image, (0, 0, padding[0], padding[1])) if target is None: return padded_image, None target = target.copy() # should we do something wrt the original size? target["size"] = torch.tensor(padded_image.size[::-1]) if "masks" in target: target['masks'] = torch.nn.functional.pad( target['masks'], (0, padding[0], 0, padding[1])) return padded_image, target class RandomCrop(object): def __init__(self, size): self.size = size def __call__(self, img, target): region = T.RandomCrop.get_params(img, self.size) return crop(img, target, region) class RandomSizeCrop(object): def __init__(self, min_size: int, max_size: int): self.min_size = min_size self.max_size = max_size def __call__(self, img: PIL.Image.Image, target: dict): w = random.randint(self.min_size, min(img.width, self.max_size)) h = random.randint(self.min_size, min(img.height, self.max_size)) region = T.RandomCrop.get_params(img, [h, w]) return crop(img, target, region) class CenterCrop(object): def __init__(self, size): self.size = size def __call__(self, img, target): image_width, image_height = img.size crop_height, crop_width = self.size crop_top = int(round((image_height - crop_height) / 2.)) crop_left = int(round((image_width - crop_width) / 2.)) return crop(img, target, (crop_top, crop_left, crop_height, crop_width)) class RandomHorizontalFlip(object): def __init__(self, p=0.5): self.p = p def __call__(self, img, target): if random.random() < self.p: return hflip(img, target) return img, target class RandomResize(object): def __init__(self, sizes, max_size=None): assert isinstance(sizes, (list, tuple)) self.sizes = sizes self.max_size = max_size def __call__(self, img, target=None): size = random.choice(self.sizes) return resize(img, target, size, self.max_size) class SquareResize(object): def __init__(self, sizes): assert isinstance(sizes, (list, tuple)) self.sizes = sizes def __call__(self, img, target=None): size = random.choice(self.sizes) rescaled_img=F.resize(img, (size, size)) w, h = rescaled_img.size if target is None: return rescaled_img, None ratios = tuple( float(s) / float(s_orig) for s, s_orig in zip(rescaled_img.size, img.size)) ratio_width, ratio_height = ratios target = target.copy() if "boxes" in target: boxes = target["boxes"] scaled_boxes = boxes * torch.as_tensor( [ratio_width, ratio_height, ratio_width, ratio_height]) target["boxes"] = scaled_boxes if "area" in target: area = target["area"] scaled_area = area * (ratio_width * ratio_height) target["area"] = scaled_area target["size"] = torch.tensor([h, w]) return rescaled_img, target class RandomPad(object): def __init__(self, max_pad): self.max_pad = max_pad def __call__(self, img, target): pad_x = random.randint(0, self.max_pad) pad_y = random.randint(0, self.max_pad) return pad(img, target, (pad_x, pad_y)) class PILtoNdArray(object): def __call__(self, img, target): return np.asarray(img), target class NdArraytoPIL(object): def __call__(self, img, target): return F.to_pil_image(img.astype('uint8')), target class Pad(object): def __init__(self, size=None, size_divisor=32, pad_mode=0, offsets=None, fill_value=(127.5, 127.5, 127.5)): """ Pad image to a specified size or multiple of size_divisor. Args: size (int, Sequence): image target size, if None, pad to multiple of size_divisor, default None size_divisor (int): size divisor, default 32 pad_mode (int): pad mode, currently only supports four modes [-1, 0, 1, 2]. if -1, use specified offsets if 0, only pad to right and bottom. if 1, pad according to center. if 2, only pad left and top offsets (list): [offset_x, offset_y], specify offset while padding, only supported pad_mode=-1 fill_value (bool): rgb value of pad area, default (127.5, 127.5, 127.5) """ if not isinstance(size, (int, Sequence)): raise TypeError( "Type of target_size is invalid when random_size is True. \ Must be List, now is {}".format(type(size))) if isinstance(size, int): size = [size, size] assert pad_mode in [ -1, 0, 1, 2 ], 'currently only supports four modes [-1, 0, 1, 2]' if pad_mode == -1: assert offsets, 'if pad_mode is -1, offsets should not be None' self.size = size self.size_divisor = size_divisor self.pad_mode = pad_mode self.fill_value = fill_value self.offsets = offsets def apply_bbox(self, bbox, offsets): return bbox + np.array(offsets * 2, dtype=np.float32) def apply_image(self, image, offsets, im_size, size): x, y = offsets im_h, im_w = im_size h, w = size canvas = np.ones((h, w, 3), dtype=np.float32) canvas *= np.array(self.fill_value, dtype=np.float32) canvas[y:y + im_h, x:x + im_w, :] = image.astype(np.float32) return canvas def __call__(self, im, target): im_h, im_w = im.shape[:2] if self.size: h, w = self.size assert ( im_h <= h and im_w <= w ), '(h, w) of target size should be greater than (im_h, im_w)' else: h = int(np.ceil(im_h / self.size_divisor) * self.size_divisor) w = int(np.ceil(im_w / self.size_divisor) * self.size_divisor) if h == im_h and w == im_w: return im.astype(np.float32), target if self.pad_mode == -1: offset_x, offset_y = self.offsets elif self.pad_mode == 0: offset_y, offset_x = 0, 0 elif self.pad_mode == 1: offset_y, offset_x = (h - im_h) // 2, (w - im_w) // 2 else: offset_y, offset_x = h - im_h, w - im_w offsets, im_size, size = [offset_x, offset_y], [im_h, im_w], [h, w] im = self.apply_image(im, offsets, im_size, size) if self.pad_mode == 0: target["size"] = torch.tensor([h, w]) return im, target if 'boxes' in target and len(target['boxes']) > 0: boxes = np.asarray(target["boxes"]) target["boxes"] = torch.from_numpy(self.apply_bbox(boxes, offsets)) target["size"] = torch.tensor([h, w]) return im, target class RandomExpand(object): """Random expand the canvas. Args: ratio (float): maximum expansion ratio. prob (float): probability to expand. fill_value (list): color value used to fill the canvas. in RGB order. """ def __init__(self, ratio=4., prob=0.5, fill_value=(127.5, 127.5, 127.5)): assert ratio > 1.01, "expand ratio must be larger than 1.01" self.ratio = ratio self.prob = prob assert isinstance(fill_value, (Number, Sequence)), \ "fill value must be either float or sequence" if isinstance(fill_value, Number): fill_value = (fill_value, ) * 3 if not isinstance(fill_value, tuple): fill_value = tuple(fill_value) self.fill_value = fill_value def __call__(self, img, target): if np.random.uniform(0., 1.) < self.prob: return img, target height, width = img.shape[:2] ratio = np.random.uniform(1., self.ratio) h = int(height * ratio) w = int(width * ratio) if not h > height or not w > width: return img, target y = np.random.randint(0, h - height) x = np.random.randint(0, w - width) offsets, size = [x, y], [h, w] pad = Pad(size, pad_mode=-1, offsets=offsets, fill_value=self.fill_value) return pad(img, target) class RandomSelect(object): """ Randomly selects between transforms1 and transforms2, with probability p for transforms1 and (1 - p) for transforms2 """ def __init__(self, transforms1, transforms2, p=0.5): self.transforms1 = transforms1 self.transforms2 = transforms2 self.p = p def __call__(self, img, target): if random.random() < self.p: return self.transforms1(img, target) return self.transforms2(img, target) class ToTensor(object): def __call__(self, img, target): return F.to_tensor(img), target class RandomErasing(object): def __init__(self, *args, **kwargs): self.eraser = T.RandomErasing(*args, **kwargs) def __call__(self, img, target): return self.eraser(img), target class Normalize(object): def __init__(self, mean, std): self.mean = mean self.std = std def __call__(self, image, target=None): image = F.normalize(image, mean=self.mean, std=self.std) if target is None: return image, None target = target.copy() h, w = image.shape[-2:] if "boxes" in target: boxes = target["boxes"] boxes = box_xyxy_to_cxcywh(boxes) boxes = boxes / torch.tensor([w, h, w, h], dtype=torch.float32) target["boxes"] = boxes return image, target class Compose(object): def __init__(self, transforms): self.transforms = transforms def __call__(self, image, target): for t in self.transforms: image, target = t(image, target) return image, target def __repr__(self): format_string = self.__class__.__name__ + "(" for t in self.transforms: format_string += "\n" format_string += " {0}".format(t) format_string += "\n)" return format_string