# Copyright (c) OpenMMLab. All rights reserved.
from typing import List, Optional, Tuple

import torch
import torch.nn as nn
from mmcv.ops import DeformConv2d, MaskedConv2d
from mmengine.model import BaseModule
from mmengine.structures import InstanceData
from torch import Tensor

from mmdet.registry import MODELS, TASK_UTILS
from mmdet.utils import (ConfigType, InstanceList, MultiConfig, OptConfigType,
                         OptInstanceList)
from ..layers import multiclass_nms
from ..task_modules.prior_generators import anchor_inside_flags, calc_region
from ..task_modules.samplers import PseudoSampler
from ..utils import images_to_levels, multi_apply, unmap
from .anchor_head import AnchorHead


class FeatureAdaption(BaseModule):
    """Feature Adaption Module.

    Feature Adaption Module is implemented based on DCN v1.
    It uses anchor shape prediction rather than feature maps to
    predict offsets of the deform conv layer.

    Args:
        in_channels (int): Number of channels in the input feature map.
        out_channels (int): Number of channels in the output feature map.
        kernel_size (int): Deformable conv kernel size. Defaults to 3.
        deform_groups (int): Deformable conv group size. Defaults to 4.
        init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or \
            list[dict], optional): Initialization config dict.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int = 3,
        deform_groups: int = 4,
        init_cfg: MultiConfig = dict(
            type='Normal',
            layer='Conv2d',
            std=0.1,
            override=dict(type='Normal', name='conv_adaption', std=0.01))
    ) -> None:
        super().__init__(init_cfg=init_cfg)
        offset_channels = kernel_size * kernel_size * 2
        self.conv_offset = nn.Conv2d(
            2, deform_groups * offset_channels, 1, bias=False)
        self.conv_adaption = DeformConv2d(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            padding=(kernel_size - 1) // 2,
            deform_groups=deform_groups)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x: Tensor, shape: Tensor) -> Tensor:
        offset = self.conv_offset(shape.detach())
        x = self.relu(self.conv_adaption(x, offset))
        return x


@MODELS.register_module()
class GuidedAnchorHead(AnchorHead):
    """Guided-Anchor-based head (GA-RPN, GA-RetinaNet, etc.).

    This GuidedAnchorHead will predict high-quality feature guided
    anchors and locations where anchors will be kept in inference.
    There are mainly 3 categories of bounding-boxes.

    - Sampled 9 pairs for target assignment. (approxes)
    - The square boxes where the predicted anchors are based on. (squares)
    - Guided anchors.

    Please refer to https://arxiv.org/abs/1901.03278 for more details.

    Args:
        num_classes (int): Number of classes.
        in_channels (int): Number of channels in the input feature map.
        feat_channels (int): Number of hidden channels. Defaults to 256.
        approx_anchor_generator (:obj:`ConfigDict` or dict): Config dict
            for approx generator
        square_anchor_generator (:obj:`ConfigDict` or dict): Config dict
            for square generator
        anchor_coder (:obj:`ConfigDict` or dict): Config dict for anchor coder
        bbox_coder (:obj:`ConfigDict` or dict): Config dict for bbox coder
        reg_decoded_bbox (bool): If true, the regression loss would be
            applied directly on decoded bounding boxes, converting both
            the predicted boxes and regression targets to absolute
            coordinates format. Defaults to False. It should be `True` when
            using `IoULoss`, `GIoULoss`, or `DIoULoss` in the bbox head.
        deform_groups (int): Group number of DCN in FeatureAdaption module.
            Defaults to 4.
        loc_filter_thr (float): Threshold to filter out unconcerned regions.
            Defaults to 0.01.
        loss_loc (:obj:`ConfigDict` or dict): Config of location loss.
        loss_shape (:obj:`ConfigDict` or dict): Config of anchor shape loss.
        loss_cls (:obj:`ConfigDict` or dict): Config of classification loss.
        loss_bbox (:obj:`ConfigDict` or dict): Config of bbox regression loss.
        init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or \
            list[dict], optional): Initialization config dict.
    """

    def __init__(
        self,
        num_classes: int,
        in_channels: int,
        feat_channels: int = 256,
        approx_anchor_generator: ConfigType = dict(
            type='AnchorGenerator',
            octave_base_scale=8,
            scales_per_octave=3,
            ratios=[0.5, 1.0, 2.0],
            strides=[4, 8, 16, 32, 64]),
        square_anchor_generator: ConfigType = dict(
            type='AnchorGenerator',
            ratios=[1.0],
            scales=[8],
            strides=[4, 8, 16, 32, 64]),
        anchor_coder: ConfigType = dict(
            type='DeltaXYWHBBoxCoder',
            target_means=[.0, .0, .0, .0],
            target_stds=[1.0, 1.0, 1.0, 1.0]),
        bbox_coder: ConfigType = dict(
            type='DeltaXYWHBBoxCoder',
            target_means=[.0, .0, .0, .0],
            target_stds=[1.0, 1.0, 1.0, 1.0]),
        reg_decoded_bbox: bool = False,
        deform_groups: int = 4,
        loc_filter_thr: float = 0.01,
        train_cfg: OptConfigType = None,
        test_cfg: OptConfigType = None,
        loss_loc: ConfigType = dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=1.0),
        loss_shape: ConfigType = dict(
            type='BoundedIoULoss', beta=0.2, loss_weight=1.0),
        loss_cls: ConfigType = dict(
            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
        loss_bbox: ConfigType = dict(
            type='SmoothL1Loss', beta=1.0, loss_weight=1.0),
        init_cfg: MultiConfig = dict(
            type='Normal',
            layer='Conv2d',
            std=0.01,
            override=dict(
                type='Normal', name='conv_loc', std=0.01, bias_prob=0.01))
    ) -> None:
        super(AnchorHead, self).__init__(init_cfg=init_cfg)
        self.in_channels = in_channels
        self.num_classes = num_classes
        self.feat_channels = feat_channels
        self.deform_groups = deform_groups
        self.loc_filter_thr = loc_filter_thr

        # build approx_anchor_generator and square_anchor_generator
        assert (approx_anchor_generator['octave_base_scale'] ==
                square_anchor_generator['scales'][0])
        assert (approx_anchor_generator['strides'] ==
                square_anchor_generator['strides'])
        self.approx_anchor_generator = TASK_UTILS.build(
            approx_anchor_generator)
        self.square_anchor_generator = TASK_UTILS.build(
            square_anchor_generator)
        self.approxs_per_octave = self.approx_anchor_generator \
            .num_base_priors[0]

        self.reg_decoded_bbox = reg_decoded_bbox

        # one anchor per location
        self.num_base_priors = self.square_anchor_generator.num_base_priors[0]

        self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False)
        self.loc_focal_loss = loss_loc['type'] in ['FocalLoss']
        if self.use_sigmoid_cls:
            self.cls_out_channels = self.num_classes
        else:
            self.cls_out_channels = self.num_classes + 1

        # build bbox_coder
        self.anchor_coder = TASK_UTILS.build(anchor_coder)
        self.bbox_coder = TASK_UTILS.build(bbox_coder)

        # build losses
        self.loss_loc = MODELS.build(loss_loc)
        self.loss_shape = MODELS.build(loss_shape)
        self.loss_cls = MODELS.build(loss_cls)
        self.loss_bbox = MODELS.build(loss_bbox)

        self.train_cfg = train_cfg
        self.test_cfg = test_cfg

        if self.train_cfg:
            self.assigner = TASK_UTILS.build(self.train_cfg['assigner'])
            # use PseudoSampler when no sampler in train_cfg
            if train_cfg.get('sampler', None) is not None:
                self.sampler = TASK_UTILS.build(
                    self.train_cfg['sampler'],
                    default_args=dict(context=self))
            else:
                self.sampler = PseudoSampler()

            self.ga_assigner = TASK_UTILS.build(self.train_cfg['ga_assigner'])
            if train_cfg.get('ga_sampler', None) is not None:
                self.ga_sampler = TASK_UTILS.build(
                    self.train_cfg['ga_sampler'],
                    default_args=dict(context=self))
            else:
                self.ga_sampler = PseudoSampler()

        self._init_layers()

    def _init_layers(self) -> None:
        """Initialize layers of the head."""
        self.relu = nn.ReLU(inplace=True)
        self.conv_loc = nn.Conv2d(self.in_channels, 1, 1)
        self.conv_shape = nn.Conv2d(self.in_channels,
                                    self.num_base_priors * 2, 1)
        self.feature_adaption = FeatureAdaption(
            self.in_channels,
            self.feat_channels,
            kernel_size=3,
            deform_groups=self.deform_groups)
        self.conv_cls = MaskedConv2d(
            self.feat_channels, self.num_base_priors * self.cls_out_channels,
            1)
        self.conv_reg = MaskedConv2d(self.feat_channels,
                                     self.num_base_priors * 4, 1)

    def forward_single(self, x: Tensor) -> Tuple[Tensor]:
        """Forward feature of a single scale level."""
        loc_pred = self.conv_loc(x)
        shape_pred = self.conv_shape(x)
        x = self.feature_adaption(x, shape_pred)
        # masked conv is only used during inference for speed-up
        if not self.training:
            mask = loc_pred.sigmoid()[0] >= self.loc_filter_thr
        else:
            mask = None
        cls_score = self.conv_cls(x, mask)
        bbox_pred = self.conv_reg(x, mask)
        return cls_score, bbox_pred, shape_pred, loc_pred

    def forward(self, x: List[Tensor]) -> Tuple[List[Tensor]]:
        """Forward features from the upstream network."""
        return multi_apply(self.forward_single, x)

    def get_sampled_approxs(self,
                            featmap_sizes: List[Tuple[int, int]],
                            batch_img_metas: List[dict],
                            device: str = 'cuda') -> tuple:
        """Get sampled approxs and inside flags according to feature map
        sizes.

        Args:
            featmap_sizes (list[tuple]): Multi-level feature map sizes.
            batch_img_metas (list[dict]): Image meta info.
            device (str): device for returned tensors

        Returns:
            tuple: approxes of each image, inside flags of each image
        """
        num_imgs = len(batch_img_metas)

        # since feature map sizes of all images are the same, we only compute
        # approxes for one time
        multi_level_approxs = self.approx_anchor_generator.grid_priors(
            featmap_sizes, device=device)
        approxs_list = [multi_level_approxs for _ in range(num_imgs)]

        # for each image, we compute inside flags of multi level approxes
        inside_flag_list = []
        for img_id, img_meta in enumerate(batch_img_metas):
            multi_level_flags = []
            multi_level_approxs = approxs_list[img_id]

            # obtain valid flags for each approx first
            multi_level_approx_flags = self.approx_anchor_generator \
                .valid_flags(featmap_sizes,
                             img_meta['pad_shape'],
                             device=device)

            for i, flags in enumerate(multi_level_approx_flags):
                approxs = multi_level_approxs[i]
                inside_flags_list = []
                for j in range(self.approxs_per_octave):
                    split_valid_flags = flags[j::self.approxs_per_octave]
                    split_approxs = approxs[j::self.approxs_per_octave, :]
                    inside_flags = anchor_inside_flags(
                        split_approxs, split_valid_flags,
                        img_meta['img_shape'][:2],
                        self.train_cfg['allowed_border'])
                    inside_flags_list.append(inside_flags)
                # inside_flag for a position is true if any anchor in this
                # position is true
                inside_flags = (
                    torch.stack(inside_flags_list, 0).sum(dim=0) > 0)
                multi_level_flags.append(inside_flags)
            inside_flag_list.append(multi_level_flags)
        return approxs_list, inside_flag_list

    def get_anchors(self,
                    featmap_sizes: List[Tuple[int, int]],
                    shape_preds: List[Tensor],
                    loc_preds: List[Tensor],
                    batch_img_metas: List[dict],
                    use_loc_filter: bool = False,
                    device: str = 'cuda') -> tuple:
        """Get squares according to feature map sizes and guided anchors.

        Args:
            featmap_sizes (list[tuple]): Multi-level feature map sizes.
            shape_preds (list[tensor]): Multi-level shape predictions.
            loc_preds (list[tensor]): Multi-level location predictions.
            batch_img_metas (list[dict]): Image meta info.
            use_loc_filter (bool): Use loc filter or not.
                Defaults to False.
            device (str): device for returned tensors. Defaults to `cuda`.

        Returns:
            tuple: square approxs of each image, guided anchors of each
            image, loc masks of each image.
        """
        num_imgs = len(batch_img_metas)
        num_levels = len(featmap_sizes)

        # since feature map sizes of all images are the same, we only compute
        # squares for one time
        multi_level_squares = self.square_anchor_generator.grid_priors(
            featmap_sizes, device=device)
        squares_list = [multi_level_squares for _ in range(num_imgs)]

        # for each image, we compute multi level guided anchors
        guided_anchors_list = []
        loc_mask_list = []
        for img_id, img_meta in enumerate(batch_img_metas):
            multi_level_guided_anchors = []
            multi_level_loc_mask = []
            for i in range(num_levels):
                squares = squares_list[img_id][i]
                shape_pred = shape_preds[i][img_id]
                loc_pred = loc_preds[i][img_id]
                guided_anchors, loc_mask = self._get_guided_anchors_single(
                    squares,
                    shape_pred,
                    loc_pred,
                    use_loc_filter=use_loc_filter)
                multi_level_guided_anchors.append(guided_anchors)
                multi_level_loc_mask.append(loc_mask)
            guided_anchors_list.append(multi_level_guided_anchors)
            loc_mask_list.append(multi_level_loc_mask)
        return squares_list, guided_anchors_list, loc_mask_list

    def _get_guided_anchors_single(
            self,
            squares: Tensor,
            shape_pred: Tensor,
            loc_pred: Tensor,
            use_loc_filter: bool = False) -> Tuple[Tensor]:
        """Get guided anchors and loc masks for a single level.

        Args:
            squares (tensor): Squares of a single level.
            shape_pred (tensor): Shape predictions of a single level.
            loc_pred (tensor): Loc predictions of a single level.
            use_loc_filter (bool): Use loc filter or not. Defaults to False.

        Returns:
            tuple: guided anchors, location masks
        """
        # calculate location filtering mask
        loc_pred = loc_pred.sigmoid().detach()
        if use_loc_filter:
            loc_mask = loc_pred >= self.loc_filter_thr
        else:
            loc_mask = loc_pred >= 0.0
        mask = loc_mask.permute(1, 2, 0).expand(-1, -1, self.num_base_priors)
        mask = mask.contiguous().view(-1)
        # calculate guided anchors
        squares = squares[mask]
        anchor_deltas = shape_pred.permute(1, 2, 0).contiguous().view(
            -1, 2).detach()[mask]
        bbox_deltas = anchor_deltas.new_full(squares.size(), 0)
        bbox_deltas[:, 2:] = anchor_deltas
        guided_anchors = self.anchor_coder.decode(
            squares, bbox_deltas, wh_ratio_clip=1e-6)
        return guided_anchors, mask

    def ga_loc_targets(self, batch_gt_instances: InstanceList,
                       featmap_sizes: List[Tuple[int, int]]) -> tuple:
        """Compute location targets for guided anchoring.

        Each feature map is divided into positive, negative and ignore
        regions.

        - positive regions: target 1, weight 1
        - ignore regions: target 0, weight 0
        - negative regions: target 0, weight 0.1

        Args:
            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
                gt_instance. It usually includes ``bboxes`` and ``labels``
                attributes.
            featmap_sizes (list[tuple]): Multi level sizes of each feature
                map.

        Returns:
            tuple: Returns a tuple containing location targets.
        """
        anchor_scale = self.approx_anchor_generator.octave_base_scale
        anchor_strides = self.approx_anchor_generator.strides
        # Currently only supports same stride in x and y direction.
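        # e.g. FPN strides of (4, 4), (8, 8), ...; a rectangular stride would
        # break the single-scalar scale-to-level assignment computed below.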
        for stride in anchor_strides:
            assert (stride[0] == stride[1])
        anchor_strides = [stride[0] for stride in anchor_strides]

        center_ratio = self.train_cfg['center_ratio']
        ignore_ratio = self.train_cfg['ignore_ratio']
        img_per_gpu = len(batch_gt_instances)
        num_lvls = len(featmap_sizes)
        r1 = (1 - center_ratio) / 2
        r2 = (1 - ignore_ratio) / 2
        all_loc_targets = []
        all_loc_weights = []
        all_ignore_map = []
        for lvl_id in range(num_lvls):
            h, w = featmap_sizes[lvl_id]
            loc_targets = torch.zeros(
                img_per_gpu,
                1,
                h,
                w,
                device=batch_gt_instances[0].bboxes.device,
                dtype=torch.float32)
            loc_weights = torch.full_like(loc_targets, -1)
            ignore_map = torch.zeros_like(loc_targets)
            all_loc_targets.append(loc_targets)
            all_loc_weights.append(loc_weights)
            all_ignore_map.append(ignore_map)
        for img_id in range(img_per_gpu):
            gt_bboxes = batch_gt_instances[img_id].bboxes
            scale = torch.sqrt((gt_bboxes[:, 2] - gt_bboxes[:, 0]) *
                               (gt_bboxes[:, 3] - gt_bboxes[:, 1]))
            min_anchor_size = scale.new_full(
                (1, ), float(anchor_scale * anchor_strides[0]))
            # assign gt bboxes to different feature levels w.r.t. their scales
            target_lvls = torch.floor(
                torch.log2(scale) - torch.log2(min_anchor_size) + 0.5)
            target_lvls = target_lvls.clamp(min=0, max=num_lvls - 1).long()
            for gt_id in range(gt_bboxes.size(0)):
                lvl = target_lvls[gt_id].item()
                # rescaled to corresponding feature map
                gt_ = gt_bboxes[gt_id, :4] / anchor_strides[lvl]
                # calculate ignore regions
                ignore_x1, ignore_y1, ignore_x2, ignore_y2 = calc_region(
                    gt_, r2, featmap_sizes[lvl])
                # calculate positive (center) regions
                ctr_x1, ctr_y1, ctr_x2, ctr_y2 = calc_region(
                    gt_, r1, featmap_sizes[lvl])
                all_loc_targets[lvl][img_id, 0, ctr_y1:ctr_y2 + 1,
                                     ctr_x1:ctr_x2 + 1] = 1
                all_loc_weights[lvl][img_id, 0, ignore_y1:ignore_y2 + 1,
                                     ignore_x1:ignore_x2 + 1] = 0
                all_loc_weights[lvl][img_id, 0, ctr_y1:ctr_y2 + 1,
                                     ctr_x1:ctr_x2 + 1] = 1
                # calculate ignore map on nearby low level feature
                if lvl > 0:
                    d_lvl = lvl - 1
                    # rescaled to corresponding feature map
                    gt_ = gt_bboxes[gt_id, :4] / anchor_strides[d_lvl]
                    ignore_x1, ignore_y1, ignore_x2, ignore_y2 = calc_region(
                        gt_, r2, featmap_sizes[d_lvl])
                    all_ignore_map[d_lvl][img_id, 0, ignore_y1:ignore_y2 + 1,
                                          ignore_x1:ignore_x2 + 1] = 1
                # calculate ignore map on nearby high level feature
                if lvl < num_lvls - 1:
                    u_lvl = lvl + 1
                    # rescaled to corresponding feature map
                    gt_ = gt_bboxes[gt_id, :4] / anchor_strides[u_lvl]
                    ignore_x1, ignore_y1, ignore_x2, ignore_y2 = calc_region(
                        gt_, r2, featmap_sizes[u_lvl])
                    all_ignore_map[u_lvl][img_id, 0, ignore_y1:ignore_y2 + 1,
                                          ignore_x1:ignore_x2 + 1] = 1
        for lvl_id in range(num_lvls):
            # ignore negative regions w.r.t. ignore map
            all_loc_weights[lvl_id][(all_loc_weights[lvl_id] < 0)
                                    & (all_ignore_map[lvl_id] > 0)] = 0
            # set negative regions with weight 0.1
            all_loc_weights[lvl_id][all_loc_weights[lvl_id] < 0] = 0.1
        # loc average factor to balance loss
        loc_avg_factor = sum(
            [t.size(0) * t.size(-1) * t.size(-2)
             for t in all_loc_targets]) / 200
        return all_loc_targets, all_loc_weights, loc_avg_factor

    def _ga_shape_target_single(self,
                                flat_approxs: Tensor,
                                inside_flags: Tensor,
                                flat_squares: Tensor,
                                gt_instances: InstanceData,
                                gt_instances_ignore: Optional[InstanceData],
                                img_meta: dict,
                                unmap_outputs: bool = True) -> tuple:
        """Compute guided anchoring targets.

        This function returns sampled anchors and gt bboxes directly
        rather than calculates regression targets.

        Args:
            flat_approxs (Tensor): flat approxs of a single image,
                shape (n, 4)
            inside_flags (Tensor): inside flags of a single image,
                shape (n, ).
            flat_squares (Tensor): flat squares of a single image,
                shape (approxs_per_octave * n, 4)
            gt_instances (:obj:`InstanceData`): Ground truth of instance
                annotations. It usually includes ``bboxes`` and ``labels``
                attributes.
            gt_instances_ignore (:obj:`InstanceData`, optional): Instances
                to be ignored during training. It includes ``bboxes``
                attribute data that is ignored during training and testing.
            img_meta (dict): Meta info of a single image.
            unmap_outputs (bool): unmap outputs or not.

        Returns:
            tuple: Returns a tuple containing shape targets of each image.
        """
        if not inside_flags.any():
            raise ValueError(
                'There is no valid anchor inside the image boundary. Please '
                'check the image size and anchor sizes, or set '
                '``allowed_border`` to -1 to skip the condition.')
        # assign gt and sample anchors
        num_square = flat_squares.size(0)
        approxs = flat_approxs.view(num_square, self.approxs_per_octave, 4)
        approxs = approxs[inside_flags, ...]
        squares = flat_squares[inside_flags, :]

        pred_instances = InstanceData()
        pred_instances.priors = squares
        pred_instances.approxs = approxs

        assign_result = self.ga_assigner.assign(
            pred_instances=pred_instances,
            gt_instances=gt_instances,
            gt_instances_ignore=gt_instances_ignore)
        sampling_result = self.ga_sampler.sample(
            assign_result=assign_result,
            pred_instances=pred_instances,
            gt_instances=gt_instances)

        bbox_anchors = torch.zeros_like(squares)
        bbox_gts = torch.zeros_like(squares)
        bbox_weights = torch.zeros_like(squares)

        pos_inds = sampling_result.pos_inds
        neg_inds = sampling_result.neg_inds
        if len(pos_inds) > 0:
            bbox_anchors[pos_inds, :] = sampling_result.pos_bboxes
            bbox_gts[pos_inds, :] = sampling_result.pos_gt_bboxes
            bbox_weights[pos_inds, :] = 1.0

        # map up to original set of anchors
        if unmap_outputs:
            num_total_anchors = flat_squares.size(0)
            bbox_anchors = unmap(bbox_anchors, num_total_anchors,
                                 inside_flags)
            bbox_gts = unmap(bbox_gts, num_total_anchors, inside_flags)
            bbox_weights = unmap(bbox_weights, num_total_anchors,
                                 inside_flags)

        return (bbox_anchors, bbox_gts, bbox_weights, pos_inds, neg_inds,
                sampling_result)

    def ga_shape_targets(self,
                         approx_list: List[List[Tensor]],
                         inside_flag_list: List[List[Tensor]],
                         square_list: List[List[Tensor]],
                         batch_gt_instances: InstanceList,
                         batch_img_metas: List[dict],
                         batch_gt_instances_ignore: OptInstanceList = None,
                         unmap_outputs: bool = True) -> tuple:
        """Compute guided anchoring targets.

        Args:
            approx_list (list[list[Tensor]]): Multi level approxs of each
                image.
            inside_flag_list (list[list[Tensor]]): Multi level inside flags
                of each image.
            square_list (list[list[Tensor]]): Multi level squares of each
                image.
            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
                gt_instance. It usually includes ``bboxes`` and ``labels``
                attributes.
            batch_img_metas (list[dict]): Meta information of each image,
                e.g., image size, scaling factor, etc.
            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
                Batch of gt_instances_ignore. It includes ``bboxes``
                attribute data that is ignored during training and testing.
                Defaults to None.
            unmap_outputs (bool): unmap outputs or not. Defaults to True.

        Returns:
            tuple: Returns a tuple containing shape targets.
""" num_imgs = len(batch_img_metas) assert len(approx_list) == len(inside_flag_list) == len( square_list) == num_imgs # anchor number of multi levels num_level_squares = [squares.size(0) for squares in square_list[0]] # concat all level anchors and flags to a single tensor inside_flag_flat_list = [] approx_flat_list = [] square_flat_list = [] for i in range(num_imgs): assert len(square_list[i]) == len(inside_flag_list[i]) inside_flag_flat_list.append(torch.cat(inside_flag_list[i])) approx_flat_list.append(torch.cat(approx_list[i])) square_flat_list.append(torch.cat(square_list[i])) # compute targets for each image if batch_gt_instances_ignore is None: batch_gt_instances_ignore = [None for _ in range(num_imgs)] (all_bbox_anchors, all_bbox_gts, all_bbox_weights, pos_inds_list, neg_inds_list, sampling_results_list) = multi_apply( self._ga_shape_target_single, approx_flat_list, inside_flag_flat_list, square_flat_list, batch_gt_instances, batch_gt_instances_ignore, batch_img_metas, unmap_outputs=unmap_outputs) # sampled anchors of all images avg_factor = sum( [results.avg_factor for results in sampling_results_list]) # split targets to a list w.r.t. multiple levels bbox_anchors_list = images_to_levels(all_bbox_anchors, num_level_squares) bbox_gts_list = images_to_levels(all_bbox_gts, num_level_squares) bbox_weights_list = images_to_levels(all_bbox_weights, num_level_squares) return (bbox_anchors_list, bbox_gts_list, bbox_weights_list, avg_factor) def loss_shape_single(self, shape_pred: Tensor, bbox_anchors: Tensor, bbox_gts: Tensor, anchor_weights: Tensor, avg_factor: int) -> Tensor: """Compute shape loss in single level.""" shape_pred = shape_pred.permute(0, 2, 3, 1).contiguous().view(-1, 2) bbox_anchors = bbox_anchors.contiguous().view(-1, 4) bbox_gts = bbox_gts.contiguous().view(-1, 4) anchor_weights = anchor_weights.contiguous().view(-1, 4) bbox_deltas = bbox_anchors.new_full(bbox_anchors.size(), 0) bbox_deltas[:, 2:] += shape_pred # filter out negative samples to speed-up weighted_bounded_iou_loss inds = torch.nonzero( anchor_weights[:, 0] > 0, as_tuple=False).squeeze(1) bbox_deltas_ = bbox_deltas[inds] bbox_anchors_ = bbox_anchors[inds] bbox_gts_ = bbox_gts[inds] anchor_weights_ = anchor_weights[inds] pred_anchors_ = self.anchor_coder.decode( bbox_anchors_, bbox_deltas_, wh_ratio_clip=1e-6) loss_shape = self.loss_shape( pred_anchors_, bbox_gts_, anchor_weights_, avg_factor=avg_factor) return loss_shape def loss_loc_single(self, loc_pred: Tensor, loc_target: Tensor, loc_weight: Tensor, avg_factor: float) -> Tensor: """Compute location loss in single level.""" loss_loc = self.loss_loc( loc_pred.reshape(-1, 1), loc_target.reshape(-1).long(), loc_weight.reshape(-1), avg_factor=avg_factor) return loss_loc def loss_by_feat( self, cls_scores: List[Tensor], bbox_preds: List[Tensor], shape_preds: List[Tensor], loc_preds: List[Tensor], batch_gt_instances: InstanceList, batch_img_metas: List[dict], batch_gt_instances_ignore: OptInstanceList = None) -> dict: """Calculate the loss based on the features extracted by the detection head. Args: cls_scores (list[Tensor]): Box scores for each scale level has shape (N, num_anchors * num_classes, H, W). bbox_preds (list[Tensor]): Box energies / deltas for each scale level with shape (N, num_anchors * 4, H, W). shape_preds (list[Tensor]): shape predictions for each scale level with shape (N, 1, H, W). loc_preds (list[Tensor]): location predictions for each scale level with shape (N, num_anchors * 2, H, W). 
            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
                gt_instance. It usually includes ``bboxes`` and ``labels``
                attributes.
            batch_img_metas (list[dict]): Meta information of each image,
                e.g., image size, scaling factor, etc.
            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
                Batch of gt_instances_ignore. It includes ``bboxes``
                attribute data that is ignored during training and testing.
                Defaults to None.

        Returns:
            dict: A dictionary of loss components.
        """
        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
        assert len(featmap_sizes) == self.approx_anchor_generator.num_levels

        device = cls_scores[0].device

        # get loc targets
        loc_targets, loc_weights, loc_avg_factor = self.ga_loc_targets(
            batch_gt_instances, featmap_sizes)

        # get sampled approxes
        approxs_list, inside_flag_list = self.get_sampled_approxs(
            featmap_sizes, batch_img_metas, device=device)
        # get squares and guided anchors
        squares_list, guided_anchors_list, _ = self.get_anchors(
            featmap_sizes,
            shape_preds,
            loc_preds,
            batch_img_metas,
            device=device)

        # get shape targets
        shape_targets = self.ga_shape_targets(approxs_list, inside_flag_list,
                                              squares_list,
                                              batch_gt_instances,
                                              batch_img_metas)
        (bbox_anchors_list, bbox_gts_list, anchor_weights_list,
         ga_avg_factor) = shape_targets

        # get anchor targets
        cls_reg_targets = self.get_targets(
            guided_anchors_list,
            inside_flag_list,
            batch_gt_instances,
            batch_img_metas,
            batch_gt_instances_ignore=batch_gt_instances_ignore)
        (labels_list, label_weights_list, bbox_targets_list,
         bbox_weights_list, avg_factor) = cls_reg_targets

        # anchor number of multi levels
        num_level_anchors = [
            anchors.size(0) for anchors in guided_anchors_list[0]
        ]
        # concat all level anchors to a single tensor
        concat_anchor_list = []
        for i in range(len(guided_anchors_list)):
            concat_anchor_list.append(torch.cat(guided_anchors_list[i]))
        all_anchor_list = images_to_levels(concat_anchor_list,
                                           num_level_anchors)

        # get classification and bbox regression losses
        losses_cls, losses_bbox = multi_apply(
            self.loss_by_feat_single,
            cls_scores,
            bbox_preds,
            all_anchor_list,
            labels_list,
            label_weights_list,
            bbox_targets_list,
            bbox_weights_list,
            avg_factor=avg_factor)

        # get anchor location loss
        losses_loc = []
        for i in range(len(loc_preds)):
            loss_loc = self.loss_loc_single(
                loc_preds[i],
                loc_targets[i],
                loc_weights[i],
                avg_factor=loc_avg_factor)
            losses_loc.append(loss_loc)
        # get anchor shape loss
        losses_shape = []
        for i in range(len(shape_preds)):
            loss_shape = self.loss_shape_single(
                shape_preds[i],
                bbox_anchors_list[i],
                bbox_gts_list[i],
                anchor_weights_list[i],
                avg_factor=ga_avg_factor)
            losses_shape.append(loss_shape)
        return dict(
            loss_cls=losses_cls,
            loss_bbox=losses_bbox,
            loss_shape=losses_shape,
            loss_loc=losses_loc)

    def predict_by_feat(self,
                        cls_scores: List[Tensor],
                        bbox_preds: List[Tensor],
                        shape_preds: List[Tensor],
                        loc_preds: List[Tensor],
                        batch_img_metas: List[dict],
                        cfg: OptConfigType = None,
                        rescale: bool = False) -> InstanceList:
        """Transform a batch of output features extracted from the head into
        bbox results.

        Args:
            cls_scores (list[Tensor]): Classification scores for all
                scale levels, each is a 4D-tensor, has shape
                (batch_size, num_priors * num_classes, H, W).
            bbox_preds (list[Tensor]): Box energies / deltas for all
                scale levels, each is a 4D-tensor, has shape
                (batch_size, num_priors * 4, H, W).
            shape_preds (list[Tensor]): shape predictions for each scale
                level with shape (N, num_anchors * 2, H, W).
            loc_preds (list[Tensor]): location predictions for each scale
                level with shape (N, 1, H, W).
            batch_img_metas (list[dict], Optional): Batch image meta info.
                Defaults to None.
            cfg (ConfigDict, optional): Test / postprocessing
                configuration, if None, test_cfg would be used.
                Defaults to None.
            rescale (bool): If True, return boxes in original image space.
                Defaults to False.

        Returns:
            list[:obj:`InstanceData`]: Object detection results of each image
            after the post process. Each item usually contains following keys.

            - scores (Tensor): Classification scores, has a shape
              (num_instance, )
            - labels (Tensor): Labels of bboxes, has a shape
              (num_instances, ).
            - bboxes (Tensor): Has a shape (num_instances, 4), the last
              dimension 4 arranged as (x1, y1, x2, y2).
        """
        assert len(cls_scores) == len(bbox_preds) == len(shape_preds) == len(
            loc_preds)
        num_levels = len(cls_scores)
        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
        device = cls_scores[0].device
        # get guided anchors
        _, guided_anchors, loc_masks = self.get_anchors(
            featmap_sizes,
            shape_preds,
            loc_preds,
            batch_img_metas,
            use_loc_filter=not self.training,
            device=device)
        result_list = []
        for img_id in range(len(batch_img_metas)):
            cls_score_list = [
                cls_scores[i][img_id].detach() for i in range(num_levels)
            ]
            bbox_pred_list = [
                bbox_preds[i][img_id].detach() for i in range(num_levels)
            ]
            guided_anchor_list = [
                guided_anchors[img_id][i].detach() for i in range(num_levels)
            ]
            loc_mask_list = [
                loc_masks[img_id][i].detach() for i in range(num_levels)
            ]
            proposals = self._predict_by_feat_single(
                cls_scores=cls_score_list,
                bbox_preds=bbox_pred_list,
                mlvl_anchors=guided_anchor_list,
                mlvl_masks=loc_mask_list,
                img_meta=batch_img_metas[img_id],
                cfg=cfg,
                rescale=rescale)
            result_list.append(proposals)
        return result_list

    def _predict_by_feat_single(self,
                                cls_scores: List[Tensor],
                                bbox_preds: List[Tensor],
                                mlvl_anchors: List[Tensor],
                                mlvl_masks: List[Tensor],
                                img_meta: dict,
                                cfg: ConfigType,
                                rescale: bool = False) -> InstanceData:
        """Transform a single image's features extracted from the head into
        bbox results.

        Args:
            cls_scores (list[Tensor]): Box scores from all scale
                levels of a single image, each item has shape
                (num_priors * num_classes, H, W).
            bbox_preds (list[Tensor]): Box energies / deltas from
                all scale levels of a single image, each item has shape
                (num_priors * 4, H, W).
            mlvl_anchors (list[Tensor]): Each element in the list is
                the anchors of a single level in feature pyramid. It has
                shape (num_priors, 4).
            mlvl_masks (list[Tensor]): Each element in the list is location
                masks of a single level.
            img_meta (dict): Image meta info.
            cfg (:obj:`ConfigDict` or dict): Test / postprocessing
                configuration, if None, test_cfg would be used.
            rescale (bool): If True, return boxes in original image space.
                Defaults to False.

        Returns:
            :obj:`InstanceData`: Detection results of each image after the
            post process. Each item usually contains following keys.

            - scores (Tensor): Classification scores, has a shape
              (num_instance, )
            - labels (Tensor): Labels of bboxes, has a shape
              (num_instances, ).
            - bboxes (Tensor): Has a shape (num_instances, 4), the last
              dimension 4 arranged as (x1, y1, x2, y2).
        """
        cfg = self.test_cfg if cfg is None else cfg
        assert len(cls_scores) == len(bbox_preds) == len(mlvl_anchors)
        mlvl_bbox_preds = []
        mlvl_valid_anchors = []
        mlvl_scores = []
        for cls_score, bbox_pred, anchors, mask in zip(cls_scores, bbox_preds,
                                                       mlvl_anchors,
                                                       mlvl_masks):
            assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
            # if no location is kept, end.
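            # (i.e. the location filter removed every position at this level)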
            if mask.sum() == 0:
                continue
            # reshape scores and bbox_pred
            cls_score = cls_score.permute(1, 2,
                                          0).reshape(-1, self.cls_out_channels)
            if self.use_sigmoid_cls:
                scores = cls_score.sigmoid()
            else:
                scores = cls_score.softmax(-1)
            bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4)
            # filter scores, bbox_pred w.r.t. mask.
            # anchors are filtered in get_anchors() beforehand.
            scores = scores[mask, :]
            bbox_pred = bbox_pred[mask, :]
            if scores.dim() == 0:
                anchors = anchors.unsqueeze(0)
                scores = scores.unsqueeze(0)
                bbox_pred = bbox_pred.unsqueeze(0)
            # filter anchors, bbox_pred, scores w.r.t. scores
            nms_pre = cfg.get('nms_pre', -1)
            if nms_pre > 0 and scores.shape[0] > nms_pre:
                if self.use_sigmoid_cls:
                    max_scores, _ = scores.max(dim=1)
                else:
                    # remind that we set FG labels to [0, num_class-1]
                    # since mmdet v2.0
                    # BG cat_id: num_class
                    max_scores, _ = scores[:, :-1].max(dim=1)
                _, topk_inds = max_scores.topk(nms_pre)
                anchors = anchors[topk_inds, :]
                bbox_pred = bbox_pred[topk_inds, :]
                scores = scores[topk_inds, :]
            mlvl_bbox_preds.append(bbox_pred)
            mlvl_valid_anchors.append(anchors)
            mlvl_scores.append(scores)
        mlvl_bbox_preds = torch.cat(mlvl_bbox_preds)
        mlvl_anchors = torch.cat(mlvl_valid_anchors)
        mlvl_scores = torch.cat(mlvl_scores)
        mlvl_bboxes = self.bbox_coder.decode(
            mlvl_anchors, mlvl_bbox_preds, max_shape=img_meta['img_shape'])

        if rescale:
            assert img_meta.get('scale_factor') is not None
            mlvl_bboxes /= mlvl_bboxes.new_tensor(
                img_meta['scale_factor']).repeat((1, 2))

        if self.use_sigmoid_cls:
            # Add a dummy background class to the backend when using sigmoid
            # remind that we set FG labels to [0, num_class-1] since mmdet v2.0
            # BG cat_id: num_class
            padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1)
            mlvl_scores = torch.cat([mlvl_scores, padding], dim=1)
        # multi class NMS
        det_bboxes, det_labels = multiclass_nms(mlvl_bboxes, mlvl_scores,
                                                cfg.score_thr, cfg.nms,
                                                cfg.max_per_img)
        results = InstanceData()
        results.bboxes = det_bboxes[:, :-1]
        results.scores = det_bboxes[:, -1]
        results.labels = det_labels
        return results
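

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the upstream module). It shows
# that the head's forward pass returns four multi-level outputs: class scores,
# bbox deltas, anchor-shape predictions, and location predictions. The batch
# size, class count, and feature-map sizes below are arbitrary assumptions for
# a 5-level feature pyramid matching the default strides (4, 8, 16, 32, 64).
#
#     import torch
#     from mmdet.models.dense_heads import GuidedAnchorHead
#
#     head = GuidedAnchorHead(num_classes=80, in_channels=256)
#     feats = [torch.rand(2, 256, s, s) for s in (100, 50, 25, 13, 7)]
#     cls_scores, bbox_preds, shape_preds, loc_preds = head(feats)
#     # each output is a list with one tensor per pyramid level
# ---------------------------------------------------------------------------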