- # Copyright (c) OpenMMLab. All rights reserved.
- from typing import List, Optional, Tuple
- import torch
- import torch.nn as nn
- from mmcv.ops import DeformConv2d, MaskedConv2d
- from mmengine.model import BaseModule
- from mmengine.structures import InstanceData
- from torch import Tensor
- from mmdet.registry import MODELS, TASK_UTILS
- from mmdet.utils import (ConfigType, InstanceList, MultiConfig, OptConfigType,
- OptInstanceList)
- from ..layers import multiclass_nms
- from ..task_modules.prior_generators import anchor_inside_flags, calc_region
- from ..task_modules.samplers import PseudoSampler
- from ..utils import images_to_levels, multi_apply, unmap
- from .anchor_head import AnchorHead
- class FeatureAdaption(BaseModule):
- """Feature Adaption Module.
- Feature Adaption Module is implemented based on DCN v1.
- It uses the predicted anchor shape, rather than the feature map itself,
- to predict the offsets of the deformable conv layer.
- Args:
- in_channels (int): Number of channels in the input feature map.
- out_channels (int): Number of channels in the output feature map.
- kernel_size (int): Deformable conv kernel size. Defaults to 3.
- deform_groups (int): Deformable conv group size. Defaults to 4.
- init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or \
- list[dict], optional): Initialization config dict.
- """
- def __init__(
- self,
- in_channels: int,
- out_channels: int,
- kernel_size: int = 3,
- deform_groups: int = 4,
- init_cfg: MultiConfig = dict(
- type='Normal',
- layer='Conv2d',
- std=0.1,
- override=dict(type='Normal', name='conv_adaption', std=0.01))
- ) -> None:
- super().__init__(init_cfg=init_cfg)
- offset_channels = kernel_size * kernel_size * 2
- self.conv_offset = nn.Conv2d(
- 2, deform_groups * offset_channels, 1, bias=False)
- self.conv_adaption = DeformConv2d(
- in_channels,
- out_channels,
- kernel_size=kernel_size,
- padding=(kernel_size - 1) // 2,
- deform_groups=deform_groups)
- self.relu = nn.ReLU(inplace=True)
- def forward(self, x: Tensor, shape: Tensor) -> Tensor:
- offset = self.conv_offset(shape.detach())
- x = self.relu(self.conv_adaption(x, offset))
- return x
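- # A minimal usage sketch (shapes are illustrative, not mandated by this
- # module): the 2-channel shape prediction from the head's conv_shape branch
- # is turned into DCN offsets, so the adapted feature depends on the
- # predicted anchor shape rather than on the feature map itself.
- #   >>> adaption = FeatureAdaption(in_channels=256, out_channels=256,
- #   ...                            kernel_size=3, deform_groups=4)
- #   >>> feat = torch.rand(1, 256, 32, 32)
- #   >>> shape_pred = torch.rand(1, 2, 32, 32)  # per-location (dw, dh)
- #   >>> adaption(feat, shape_pred).shape
- #   torch.Size([1, 256, 32, 32])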
- @MODELS.register_module()
- class GuidedAnchorHead(AnchorHead):
- """Guided-Anchor-based head (GA-RPN, GA-RetinaNet, etc.).
- This GuidedAnchorHead predicts high-quality, feature-guided anchors and
- the locations where anchors are kept at inference time.
- There are mainly 3 categories of bounding boxes:
- - approxes: the sampled anchors (9 per location by default) used for
- target assignment.
- - squares: the square boxes that the guided anchors are regressed from.
- - guided anchors: the final anchors decoded from the squares and the
- predicted shapes.
- Please refer to https://arxiv.org/abs/1901.03278 for more details.
- Args:
- num_classes (int): Number of classes.
- in_channels (int): Number of channels in the input feature map.
- feat_channels (int): Number of hidden channels. Defaults to 256.
- approx_anchor_generator (:obj:`ConfigDict` or dict): Config dict
- for approx generator
- square_anchor_generator (:obj:`ConfigDict` or dict): Config dict
- for square generator
- anchor_coder (:obj:`ConfigDict` or dict): Config dict for anchor coder
- bbox_coder (:obj:`ConfigDict` or dict): Config dict for bbox coder
- reg_decoded_bbox (bool): If true, the regression loss would be
- applied directly on decoded bounding boxes, converting both
- the predicted boxes and regression targets to absolute
- coordinates format. Defaults to False. It should be `True` when
- using `IoULoss`, `GIoULoss`, or `DIoULoss` in the bbox head.
- deform_groups (int): Group number of DCN in the FeatureAdaption module.
- Defaults to 4.
- loc_filter_thr (float): Threshold to filter out unconcerned regions.
- Defaults to 0.01.
- loss_loc (:obj:`ConfigDict` or dict): Config of location loss.
- loss_shape (:obj:`ConfigDict` or dict): Config of anchor shape loss.
- loss_cls (:obj:`ConfigDict` or dict): Config of classification loss.
- loss_bbox (:obj:`ConfigDict` or dict): Config of bbox regression loss.
- init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or \
- list[dict], optional): Initialization config dict.
- """
- def __init__(
- self,
- num_classes: int,
- in_channels: int,
- feat_channels: int = 256,
- approx_anchor_generator: ConfigType = dict(
- type='AnchorGenerator',
- octave_base_scale=8,
- scales_per_octave=3,
- ratios=[0.5, 1.0, 2.0],
- strides=[4, 8, 16, 32, 64]),
- square_anchor_generator: ConfigType = dict(
- type='AnchorGenerator',
- ratios=[1.0],
- scales=[8],
- strides=[4, 8, 16, 32, 64]),
- anchor_coder: ConfigType = dict(
- type='DeltaXYWHBBoxCoder',
- target_means=[.0, .0, .0, .0],
- target_stds=[1.0, 1.0, 1.0, 1.0]),
- bbox_coder: ConfigType = dict(
- type='DeltaXYWHBBoxCoder',
- target_means=[.0, .0, .0, .0],
- target_stds=[1.0, 1.0, 1.0, 1.0]),
- reg_decoded_bbox: bool = False,
- deform_groups: int = 4,
- loc_filter_thr: float = 0.01,
- train_cfg: OptConfigType = None,
- test_cfg: OptConfigType = None,
- loss_loc: ConfigType = dict(
- type='FocalLoss',
- use_sigmoid=True,
- gamma=2.0,
- alpha=0.25,
- loss_weight=1.0),
- loss_shape: ConfigType = dict(
- type='BoundedIoULoss', beta=0.2, loss_weight=1.0),
- loss_cls: ConfigType = dict(
- type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
- loss_bbox: ConfigType = dict(
- type='SmoothL1Loss', beta=1.0, loss_weight=1.0),
- init_cfg: MultiConfig = dict(
- type='Normal',
- layer='Conv2d',
- std=0.01,
- override=dict(
- type='Normal', name='conv_loc', std=0.01, bias_prob=0.01))
- ) -> None:
- super(AnchorHead, self).__init__(init_cfg=init_cfg)
- self.in_channels = in_channels
- self.num_classes = num_classes
- self.feat_channels = feat_channels
- self.deform_groups = deform_groups
- self.loc_filter_thr = loc_filter_thr
- # build approx_anchor_generator and square_anchor_generator
- assert (approx_anchor_generator['octave_base_scale'] ==
- square_anchor_generator['scales'][0])
- assert (approx_anchor_generator['strides'] ==
- square_anchor_generator['strides'])
- self.approx_anchor_generator = TASK_UTILS.build(
- approx_anchor_generator)
- self.square_anchor_generator = TASK_UTILS.build(
- square_anchor_generator)
- self.approxs_per_octave = self.approx_anchor_generator \
- .num_base_priors[0]
- self.reg_decoded_bbox = reg_decoded_bbox
- # one anchor per location
- self.num_base_priors = self.square_anchor_generator.num_base_priors[0]
- self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False)
- self.loc_focal_loss = loss_loc['type'] in ['FocalLoss']
- if self.use_sigmoid_cls:
- self.cls_out_channels = self.num_classes
- else:
- self.cls_out_channels = self.num_classes + 1
- # build bbox_coder
- self.anchor_coder = TASK_UTILS.build(anchor_coder)
- self.bbox_coder = TASK_UTILS.build(bbox_coder)
- # build losses
- self.loss_loc = MODELS.build(loss_loc)
- self.loss_shape = MODELS.build(loss_shape)
- self.loss_cls = MODELS.build(loss_cls)
- self.loss_bbox = MODELS.build(loss_bbox)
- self.train_cfg = train_cfg
- self.test_cfg = test_cfg
- if self.train_cfg:
- self.assigner = TASK_UTILS.build(self.train_cfg['assigner'])
- # use PseudoSampler when no sampler in train_cfg
- if train_cfg.get('sampler', None) is not None:
- self.sampler = TASK_UTILS.build(
- self.train_cfg['sampler'], default_args=dict(context=self))
- else:
- self.sampler = PseudoSampler()
- self.ga_assigner = TASK_UTILS.build(self.train_cfg['ga_assigner'])
- if train_cfg.get('ga_sampler', None) is not None:
- self.ga_sampler = TASK_UTILS.build(
- self.train_cfg['ga_sampler'],
- default_args=dict(context=self))
- else:
- self.ga_sampler = PseudoSampler()
- self._init_layers()
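- # Keys this head reads from ``train_cfg``; the values below are only an
- # illustrative sketch modeled on a GA-RPN style config, not defaults
- # enforced by this class:
- #   train_cfg = dict(
- #       assigner=dict(type='MaxIoUAssigner', pos_iou_thr=0.7,
- #                     neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1),
- #       sampler=dict(type='RandomSampler', num=256, pos_fraction=0.5,
- #                    neg_pos_ub=-1, add_gt_as_proposals=False),
- #       ga_assigner=dict(type='ApproxMaxIoUAssigner', pos_iou_thr=0.7,
- #                        neg_iou_thr=0.3, min_pos_iou=0.3,
- #                        ignore_iof_thr=-1),
- #       ga_sampler=dict(type='RandomSampler', num=256, pos_fraction=0.5,
- #                       neg_pos_ub=-1, add_gt_as_proposals=False),
- #       allowed_border=-1,  # used in get_sampled_approxs()
- #       pos_weight=-1,      # used by AnchorHead.get_targets()
- #       center_ratio=0.2,   # used in ga_loc_targets()
- #       ignore_ratio=0.5)   # used in ga_loc_targets()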
- def _init_layers(self) -> None:
- """Initialize layers of the head."""
- self.relu = nn.ReLU(inplace=True)
- self.conv_loc = nn.Conv2d(self.in_channels, 1, 1)
- self.conv_shape = nn.Conv2d(self.in_channels, self.num_base_priors * 2,
- 1)
- self.feature_adaption = FeatureAdaption(
- self.in_channels,
- self.feat_channels,
- kernel_size=3,
- deform_groups=self.deform_groups)
- self.conv_cls = MaskedConv2d(
- self.feat_channels, self.num_base_priors * self.cls_out_channels,
- 1)
- self.conv_reg = MaskedConv2d(self.feat_channels,
- self.num_base_priors * 4, 1)
- def forward_single(self, x: Tensor) -> Tuple[Tensor]:
- """Forward feature of a single scale level."""
- loc_pred = self.conv_loc(x)
- shape_pred = self.conv_shape(x)
- x = self.feature_adaption(x, shape_pred)
- # masked conv is only used during inference for speed-up
- if not self.training:
- mask = loc_pred.sigmoid()[0] >= self.loc_filter_thr
- else:
- mask = None
- cls_score = self.conv_cls(x, mask)
- bbox_pred = self.conv_reg(x, mask)
- return cls_score, bbox_pred, shape_pred, loc_pred
- def forward(self, x: List[Tensor]) -> Tuple[List[Tensor]]:
- """Forward features from the upstream network."""
- return multi_apply(self.forward_single, x)
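- # Illustrative per-level output shapes, assuming num_classes=80, the
- # default sigmoid classification and one square prior per location
- # (hypothetical input sizes):
- #   >>> head = GuidedAnchorHead(num_classes=80, in_channels=256)
- #   >>> cls_score, bbox_pred, shape_pred, loc_pred = head.forward_single(
- #   ...     torch.rand(2, 256, 32, 32))
- #   >>> # cls_score:  (2, 80, 32, 32)  num_base_priors * cls_out_channels
- #   >>> # bbox_pred:  (2, 4, 32, 32)   num_base_priors * 4
- #   >>> # shape_pred: (2, 2, 32, 32)   num_base_priors * 2, i.e. (dw, dh)
- #   >>> # loc_pred:   (2, 1, 32, 32)   objectness of each location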
- def get_sampled_approxs(self,
- featmap_sizes: List[Tuple[int, int]],
- batch_img_metas: List[dict],
- device: str = 'cuda') -> tuple:
- """Get sampled approxs and inside flags according to feature map sizes.
- Args:
- featmap_sizes (list[tuple]): Multi-level feature map sizes.
- batch_img_metas (list[dict]): Image meta info.
- device (str): device for returned tensors. Defaults to `cuda`.
- Returns:
- tuple: approxes of each image, inside flags of each image
- """
- num_imgs = len(batch_img_metas)
- # since feature map sizes of all images are the same, we only
- # compute approxes once
- multi_level_approxs = self.approx_anchor_generator.grid_priors(
- featmap_sizes, device=device)
- approxs_list = [multi_level_approxs for _ in range(num_imgs)]
- # for each image, we compute inside flags of multi level approxes
- inside_flag_list = []
- for img_id, img_meta in enumerate(batch_img_metas):
- multi_level_flags = []
- multi_level_approxs = approxs_list[img_id]
- # obtain valid flags for each approx first
- multi_level_approx_flags = self.approx_anchor_generator \
- .valid_flags(featmap_sizes,
- img_meta['pad_shape'],
- device=device)
- for i, flags in enumerate(multi_level_approx_flags):
- approxs = multi_level_approxs[i]
- inside_flags_list = []
- for j in range(self.approxs_per_octave):
- split_valid_flags = flags[j::self.approxs_per_octave]
- split_approxs = approxs[j::self.approxs_per_octave, :]
- inside_flags = anchor_inside_flags(
- split_approxs, split_valid_flags,
- img_meta['img_shape'][:2],
- self.train_cfg['allowed_border'])
- inside_flags_list.append(inside_flags)
- # inside_flag for a position is true if any anchor in this
- # position is true
- inside_flags = (
- torch.stack(inside_flags_list, 0).sum(dim=0) > 0)
- multi_level_flags.append(inside_flags)
- inside_flag_list.append(multi_level_flags)
- return approxs_list, inside_flag_list
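- # With the default approx_anchor_generator (scales_per_octave=3 and three
- # ratios) there are 3 * 3 = 9 approxs per location, so
- # self.approxs_per_octave == 9.  The loop above marks a location as valid
- # if any of its 9 approxs lies inside the allowed border, which is what
- # the torch.stack(...).sum(dim=0) > 0 reduction computes.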
- def get_anchors(self,
- featmap_sizes: List[Tuple[int, int]],
- shape_preds: List[Tensor],
- loc_preds: List[Tensor],
- batch_img_metas: List[dict],
- use_loc_filter: bool = False,
- device: str = 'cuda') -> tuple:
- """Get squares according to feature map sizes and guided anchors.
- Args:
- featmap_sizes (list[tuple]): Multi-level feature map sizes.
- shape_preds (list[tensor]): Multi-level shape predictions.
- loc_preds (list[tensor]): Multi-level location predictions.
- batch_img_metas (list[dict]): Image meta info.
- use_loc_filter (bool): Use loc filter or not. Defaults to False
- device (str): device for returned tensors.
- Defaults to `cuda`.
- Returns:
- tuple: square approxs of each image, guided anchors of each image,
- loc masks of each image.
- """
- num_imgs = len(batch_img_metas)
- num_levels = len(featmap_sizes)
- # since feature map sizes of all images are the same, we only
- # compute squares once
- multi_level_squares = self.square_anchor_generator.grid_priors(
- featmap_sizes, device=device)
- squares_list = [multi_level_squares for _ in range(num_imgs)]
- # for each image, we compute multi level guided anchors
- guided_anchors_list = []
- loc_mask_list = []
- for img_id, img_meta in enumerate(batch_img_metas):
- multi_level_guided_anchors = []
- multi_level_loc_mask = []
- for i in range(num_levels):
- squares = squares_list[img_id][i]
- shape_pred = shape_preds[i][img_id]
- loc_pred = loc_preds[i][img_id]
- guided_anchors, loc_mask = self._get_guided_anchors_single(
- squares,
- shape_pred,
- loc_pred,
- use_loc_filter=use_loc_filter)
- multi_level_guided_anchors.append(guided_anchors)
- multi_level_loc_mask.append(loc_mask)
- guided_anchors_list.append(multi_level_guided_anchors)
- loc_mask_list.append(multi_level_loc_mask)
- return squares_list, guided_anchors_list, loc_mask_list
- def _get_guided_anchors_single(
- self,
- squares: Tensor,
- shape_pred: Tensor,
- loc_pred: Tensor,
- use_loc_filter: bool = False) -> Tuple[Tensor]:
- """Get guided anchors and loc masks for a single level.
- Args:
- squares (tensor): Squares of a single level.
- shape_pred (tensor): Shape predictions of a single level.
- loc_pred (tensor): Loc predictions of a single level.
- use_loc_filter (bool): Use loc filter or not.
- Defaults to False.
- Returns:
- tuple: guided anchors, location masks
- """
- # calculate location filtering mask
- loc_pred = loc_pred.sigmoid().detach()
- if use_loc_filter:
- loc_mask = loc_pred >= self.loc_filter_thr
- else:
- loc_mask = loc_pred >= 0.0
- mask = loc_mask.permute(1, 2, 0).expand(-1, -1, self.num_base_priors)
- mask = mask.contiguous().view(-1)
- # calculate guided anchors
- squares = squares[mask]
- anchor_deltas = shape_pred.permute(1, 2, 0).contiguous().view(
- -1, 2).detach()[mask]
- bbox_deltas = anchor_deltas.new_full(squares.size(), 0)
- bbox_deltas[:, 2:] = anchor_deltas
- guided_anchors = self.anchor_coder.decode(
- squares, bbox_deltas, wh_ratio_clip=1e-6)
- return guided_anchors, mask
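- # Worked example of the decoding above (default anchor_coder with unit
- # target_stds; the tensors are hypothetical): only (dw, dh) are predicted,
- # dx and dy stay 0, so a guided anchor keeps its square's center and only
- # adapts its width and height.
- #   >>> coder = TASK_UTILS.build(dict(
- #   ...     type='DeltaXYWHBBoxCoder',
- #   ...     target_means=[.0, .0, .0, .0],
- #   ...     target_stds=[1.0, 1.0, 1.0, 1.0]))
- #   >>> square = torch.tensor([[0., 0., 8., 8.]])
- #   >>> deltas = torch.tensor([[0., 0., math.log(2.), 0.]])  # needs math
- #   >>> coder.decode(square, deltas, wh_ratio_clip=1e-6)
- #   tensor([[-4.,  0., 12.,  8.]])   # width 8 -> 16, center unchanged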
- def ga_loc_targets(self, batch_gt_instances: InstanceList,
- featmap_sizes: List[Tuple[int, int]]) -> tuple:
- """Compute location targets for guided anchoring.
- Each feature map is divided into positive, negative and ignore regions.
- - positive regions: target 1, weight 1
- - ignore regions: target 0, weight 0
- - negative regions: target 0, weight 0.1
- Args:
- batch_gt_instances (list[:obj:`InstanceData`]): Batch of
- gt_instance. It usually includes ``bboxes`` and ``labels``
- attributes.
- featmap_sizes (list[tuple]): Multi level sizes of each feature
- maps.
- Returns:
- tuple: Returns a tuple containing location targets.
- """
- anchor_scale = self.approx_anchor_generator.octave_base_scale
- anchor_strides = self.approx_anchor_generator.strides
- # Currently only supports same stride in x and y direction.
- for stride in anchor_strides:
- assert (stride[0] == stride[1])
- anchor_strides = [stride[0] for stride in anchor_strides]
- center_ratio = self.train_cfg['center_ratio']
- ignore_ratio = self.train_cfg['ignore_ratio']
- img_per_gpu = len(batch_gt_instances)
- num_lvls = len(featmap_sizes)
- r1 = (1 - center_ratio) / 2
- r2 = (1 - ignore_ratio) / 2
- all_loc_targets = []
- all_loc_weights = []
- all_ignore_map = []
- for lvl_id in range(num_lvls):
- h, w = featmap_sizes[lvl_id]
- loc_targets = torch.zeros(
- img_per_gpu,
- 1,
- h,
- w,
- device=batch_gt_instances[0].bboxes.device,
- dtype=torch.float32)
- loc_weights = torch.full_like(loc_targets, -1)
- ignore_map = torch.zeros_like(loc_targets)
- all_loc_targets.append(loc_targets)
- all_loc_weights.append(loc_weights)
- all_ignore_map.append(ignore_map)
- for img_id in range(img_per_gpu):
- gt_bboxes = batch_gt_instances[img_id].bboxes
- scale = torch.sqrt((gt_bboxes[:, 2] - gt_bboxes[:, 0]) *
- (gt_bboxes[:, 3] - gt_bboxes[:, 1]))
- min_anchor_size = scale.new_full(
- (1, ), float(anchor_scale * anchor_strides[0]))
- # assign gt bboxes to different feature levels w.r.t. their scales
- target_lvls = torch.floor(
- torch.log2(scale) - torch.log2(min_anchor_size) + 0.5)
- target_lvls = target_lvls.clamp(min=0, max=num_lvls - 1).long()
- for gt_id in range(gt_bboxes.size(0)):
- lvl = target_lvls[gt_id].item()
- # rescaled to corresponding feature map
- gt_ = gt_bboxes[gt_id, :4] / anchor_strides[lvl]
- # calculate ignore regions
- ignore_x1, ignore_y1, ignore_x2, ignore_y2 = calc_region(
- gt_, r2, featmap_sizes[lvl])
- # calculate positive (center) regions
- ctr_x1, ctr_y1, ctr_x2, ctr_y2 = calc_region(
- gt_, r1, featmap_sizes[lvl])
- all_loc_targets[lvl][img_id, 0, ctr_y1:ctr_y2 + 1,
- ctr_x1:ctr_x2 + 1] = 1
- all_loc_weights[lvl][img_id, 0, ignore_y1:ignore_y2 + 1,
- ignore_x1:ignore_x2 + 1] = 0
- all_loc_weights[lvl][img_id, 0, ctr_y1:ctr_y2 + 1,
- ctr_x1:ctr_x2 + 1] = 1
- # calculate ignore map on nearby low level feature
- if lvl > 0:
- d_lvl = lvl - 1
- # rescaled to corresponding feature map
- gt_ = gt_bboxes[gt_id, :4] / anchor_strides[d_lvl]
- ignore_x1, ignore_y1, ignore_x2, ignore_y2 = calc_region(
- gt_, r2, featmap_sizes[d_lvl])
- all_ignore_map[d_lvl][img_id, 0, ignore_y1:ignore_y2 + 1,
- ignore_x1:ignore_x2 + 1] = 1
- # calculate ignore map on nearby high level feature
- if lvl < num_lvls - 1:
- u_lvl = lvl + 1
- # rescaled to corresponding feature map
- gt_ = gt_bboxes[gt_id, :4] / anchor_strides[u_lvl]
- ignore_x1, ignore_y1, ignore_x2, ignore_y2 = calc_region(
- gt_, r2, featmap_sizes[u_lvl])
- all_ignore_map[u_lvl][img_id, 0, ignore_y1:ignore_y2 + 1,
- ignore_x1:ignore_x2 + 1] = 1
- for lvl_id in range(num_lvls):
- # ignore negative regions w.r.t. ignore map
- all_loc_weights[lvl_id][(all_loc_weights[lvl_id] < 0)
- & (all_ignore_map[lvl_id] > 0)] = 0
- # set negative regions with weight 0.1
- all_loc_weights[lvl_id][all_loc_weights[lvl_id] < 0] = 0.1
- # loc average factor to balance loss
- loc_avg_factor = sum(
- [t.size(0) * t.size(-1) * t.size(-2)
- for t in all_loc_targets]) / 200
- return all_loc_targets, all_loc_weights, loc_avg_factor
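- # Worked example of the region split. With typical config values
- # center_ratio=0.2 and ignore_ratio=0.5 (both read from train_cfg),
- # r1 = (1 - 0.2) / 2 = 0.4 and r2 = (1 - 0.5) / 2 = 0.25, so calc_region
- # keeps the central 20% of the stride-rescaled gt box as the positive
- # region and the central 50% as the ignore region.  E.g. a 64x64 gt box
- # assigned to a stride-16 level covers 4x4 cells; roughly its central cell
- # becomes positive (target 1, weight 1), its central 2x2 cells are ignored
- # (weight 0), and everything else stays negative with weight 0.1.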
- def _ga_shape_target_single(self,
- flat_approxs: Tensor,
- inside_flags: Tensor,
- flat_squares: Tensor,
- gt_instances: InstanceData,
- gt_instances_ignore: Optional[InstanceData],
- img_meta: dict,
- unmap_outputs: bool = True) -> tuple:
- """Compute guided anchoring targets.
- This function returns sampled anchors and gt bboxes directly
- rather than computing regression targets.
- Args:
- flat_approxs (Tensor): flat approxs of a single image,
- shape (approxs_per_octave * n, 4).
- inside_flags (Tensor): inside flags of a single image,
- shape (n, ).
- flat_squares (Tensor): flat squares of a single image,
- shape (n, 4).
- gt_instances (:obj:`InstanceData`): Ground truth of instance
- annotations. It usually includes ``bboxes`` and ``labels``
- attributes.
- gt_instances_ignore (:obj:`InstanceData`, optional): Instances
- to be ignored during training. It includes ``bboxes`` attribute
- data that is ignored during training and testing.
- img_meta (dict): Meta info of a single image.
- unmap_outputs (bool): unmap outputs or not.
- Returns:
- tuple: Returns a tuple containing shape targets of each image.
- """
- if not inside_flags.any():
- raise ValueError(
- 'There is no valid anchor inside the image boundary. Please '
- 'check the image size and anchor sizes, or set '
- '``allowed_border`` to -1 to skip the condition.')
- # assign gt and sample anchors
- num_square = flat_squares.size(0)
- approxs = flat_approxs.view(num_square, self.approxs_per_octave, 4)
- approxs = approxs[inside_flags, ...]
- squares = flat_squares[inside_flags, :]
- pred_instances = InstanceData()
- pred_instances.priors = squares
- pred_instances.approxs = approxs
- assign_result = self.ga_assigner.assign(
- pred_instances=pred_instances,
- gt_instances=gt_instances,
- gt_instances_ignore=gt_instances_ignore)
- sampling_result = self.ga_sampler.sample(
- assign_result=assign_result,
- pred_instances=pred_instances,
- gt_instances=gt_instances)
- bbox_anchors = torch.zeros_like(squares)
- bbox_gts = torch.zeros_like(squares)
- bbox_weights = torch.zeros_like(squares)
- pos_inds = sampling_result.pos_inds
- neg_inds = sampling_result.neg_inds
- if len(pos_inds) > 0:
- bbox_anchors[pos_inds, :] = sampling_result.pos_bboxes
- bbox_gts[pos_inds, :] = sampling_result.pos_gt_bboxes
- bbox_weights[pos_inds, :] = 1.0
- # map up to original set of anchors
- if unmap_outputs:
- num_total_anchors = flat_squares.size(0)
- bbox_anchors = unmap(bbox_anchors, num_total_anchors, inside_flags)
- bbox_gts = unmap(bbox_gts, num_total_anchors, inside_flags)
- bbox_weights = unmap(bbox_weights, num_total_anchors, inside_flags)
- return (bbox_anchors, bbox_gts, bbox_weights, pos_inds, neg_inds,
- sampling_result)
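- # Sketch of the unmap step above (hypothetical sizes): values computed for
- # the anchors inside the image are scattered back to the full anchor set
- # so that the later per-level split in images_to_levels stays aligned.
- #   >>> data = torch.tensor([[1., 1., 1., 1.]])     # one inside anchor
- #   >>> flags = torch.tensor([False, True, False])  # three anchors total
- #   >>> unmap(data, 3, flags)
- #   tensor([[0., 0., 0., 0.],
- #           [1., 1., 1., 1.],
- #           [0., 0., 0., 0.]])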
- def ga_shape_targets(self,
- approx_list: List[List[Tensor]],
- inside_flag_list: List[List[Tensor]],
- square_list: List[List[Tensor]],
- batch_gt_instances: InstanceList,
- batch_img_metas: List[dict],
- batch_gt_instances_ignore: OptInstanceList = None,
- unmap_outputs: bool = True) -> tuple:
- """Compute guided anchoring targets.
- Args:
- approx_list (list[list[Tensor]]): Multi level approxs of each
- image.
- inside_flag_list (list[list[Tensor]]): Multi level inside flags
- of each image.
- square_list (list[list[Tensor]]): Multi level squares of each
- image.
- batch_gt_instances (list[:obj:`InstanceData`]): Batch of
- gt_instance. It usually includes ``bboxes`` and ``labels``
- attributes.
- batch_img_metas (list[dict]): Meta information of each image, e.g.,
- image size, scaling factor, etc.
- batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
- Batch of gt_instances_ignore. It includes ``bboxes`` attribute
- data that is ignored during training and testing.
- Defaults to None.
- unmap_outputs (bool): unmap outputs or not. Defaults to True.
- Returns:
- tuple: Returns a tuple containing shape targets.
- """
- num_imgs = len(batch_img_metas)
- assert len(approx_list) == len(inside_flag_list) == len(
- square_list) == num_imgs
- # anchor number of multi levels
- num_level_squares = [squares.size(0) for squares in square_list[0]]
- # concat all level anchors and flags to a single tensor
- inside_flag_flat_list = []
- approx_flat_list = []
- square_flat_list = []
- for i in range(num_imgs):
- assert len(square_list[i]) == len(inside_flag_list[i])
- inside_flag_flat_list.append(torch.cat(inside_flag_list[i]))
- approx_flat_list.append(torch.cat(approx_list[i]))
- square_flat_list.append(torch.cat(square_list[i]))
- # compute targets for each image
- if batch_gt_instances_ignore is None:
- batch_gt_instances_ignore = [None for _ in range(num_imgs)]
- (all_bbox_anchors, all_bbox_gts, all_bbox_weights, pos_inds_list,
- neg_inds_list, sampling_results_list) = multi_apply(
- self._ga_shape_target_single,
- approx_flat_list,
- inside_flag_flat_list,
- square_flat_list,
- batch_gt_instances,
- batch_gt_instances_ignore,
- batch_img_metas,
- unmap_outputs=unmap_outputs)
- # sampled anchors of all images
- avg_factor = sum(
- [results.avg_factor for results in sampling_results_list])
- # split targets to a list w.r.t. multiple levels
- bbox_anchors_list = images_to_levels(all_bbox_anchors,
- num_level_squares)
- bbox_gts_list = images_to_levels(all_bbox_gts, num_level_squares)
- bbox_weights_list = images_to_levels(all_bbox_weights,
- num_level_squares)
- return (bbox_anchors_list, bbox_gts_list, bbox_weights_list,
- avg_factor)
- def loss_shape_single(self, shape_pred: Tensor, bbox_anchors: Tensor,
- bbox_gts: Tensor, anchor_weights: Tensor,
- avg_factor: int) -> Tensor:
- """Compute shape loss in single level."""
- shape_pred = shape_pred.permute(0, 2, 3, 1).contiguous().view(-1, 2)
- bbox_anchors = bbox_anchors.contiguous().view(-1, 4)
- bbox_gts = bbox_gts.contiguous().view(-1, 4)
- anchor_weights = anchor_weights.contiguous().view(-1, 4)
- bbox_deltas = bbox_anchors.new_full(bbox_anchors.size(), 0)
- bbox_deltas[:, 2:] += shape_pred
- # filter out negative samples to speed-up weighted_bounded_iou_loss
- inds = torch.nonzero(
- anchor_weights[:, 0] > 0, as_tuple=False).squeeze(1)
- bbox_deltas_ = bbox_deltas[inds]
- bbox_anchors_ = bbox_anchors[inds]
- bbox_gts_ = bbox_gts[inds]
- anchor_weights_ = anchor_weights[inds]
- pred_anchors_ = self.anchor_coder.decode(
- bbox_anchors_, bbox_deltas_, wh_ratio_clip=1e-6)
- loss_shape = self.loss_shape(
- pred_anchors_, bbox_gts_, anchor_weights_, avg_factor=avg_factor)
- return loss_shape
- def loss_loc_single(self, loc_pred: Tensor, loc_target: Tensor,
- loc_weight: Tensor, avg_factor: float) -> Tensor:
- """Compute location loss in single level."""
- loss_loc = self.loss_loc(
- loc_pred.reshape(-1, 1),
- loc_target.reshape(-1).long(),
- loc_weight.reshape(-1),
- avg_factor=avg_factor)
- return loss_loc
- def loss_by_feat(
- self,
- cls_scores: List[Tensor],
- bbox_preds: List[Tensor],
- shape_preds: List[Tensor],
- loc_preds: List[Tensor],
- batch_gt_instances: InstanceList,
- batch_img_metas: List[dict],
- batch_gt_instances_ignore: OptInstanceList = None) -> dict:
- """Calculate the loss based on the features extracted by the detection
- head.
- Args:
- cls_scores (list[Tensor]): Box scores for each scale level
- has shape (N, num_anchors * num_classes, H, W).
- bbox_preds (list[Tensor]): Box energies / deltas for each scale
- level with shape (N, num_anchors * 4, H, W).
- shape_preds (list[Tensor]): shape predictions for each scale
- level with shape (N, num_anchors * 2, H, W).
- loc_preds (list[Tensor]): location predictions for each scale
- level with shape (N, 1, H, W).
- batch_gt_instances (list[:obj:`InstanceData`]): Batch of
- gt_instance. It usually includes ``bboxes`` and ``labels``
- attributes.
- batch_img_metas (list[dict]): Meta information of each image, e.g.,
- image size, scaling factor, etc.
- batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
- Batch of gt_instances_ignore. It includes ``bboxes`` attribute
- data that is ignored during training and testing.
- Defaults to None.
- Returns:
- dict: A dictionary of loss components.
- """
- featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
- assert len(featmap_sizes) == self.approx_anchor_generator.num_levels
- device = cls_scores[0].device
- # get loc targets
- loc_targets, loc_weights, loc_avg_factor = self.ga_loc_targets(
- batch_gt_instances, featmap_sizes)
- # get sampled approxes
- approxs_list, inside_flag_list = self.get_sampled_approxs(
- featmap_sizes, batch_img_metas, device=device)
- # get squares and guided anchors
- squares_list, guided_anchors_list, _ = self.get_anchors(
- featmap_sizes,
- shape_preds,
- loc_preds,
- batch_img_metas,
- device=device)
- # get shape targets
- shape_targets = self.ga_shape_targets(approxs_list, inside_flag_list,
- squares_list, batch_gt_instances,
- batch_img_metas)
- (bbox_anchors_list, bbox_gts_list, anchor_weights_list,
- ga_avg_factor) = shape_targets
- # get anchor targets
- cls_reg_targets = self.get_targets(
- guided_anchors_list,
- inside_flag_list,
- batch_gt_instances,
- batch_img_metas,
- batch_gt_instances_ignore=batch_gt_instances_ignore)
- (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
- avg_factor) = cls_reg_targets
- # anchor number of multi levels
- num_level_anchors = [
- anchors.size(0) for anchors in guided_anchors_list[0]
- ]
- # concat all level anchors to a single tensor
- concat_anchor_list = []
- for i in range(len(guided_anchors_list)):
- concat_anchor_list.append(torch.cat(guided_anchors_list[i]))
- all_anchor_list = images_to_levels(concat_anchor_list,
- num_level_anchors)
- # get classification and bbox regression losses
- losses_cls, losses_bbox = multi_apply(
- self.loss_by_feat_single,
- cls_scores,
- bbox_preds,
- all_anchor_list,
- labels_list,
- label_weights_list,
- bbox_targets_list,
- bbox_weights_list,
- avg_factor=avg_factor)
- # get anchor location loss
- losses_loc = []
- for i in range(len(loc_preds)):
- loss_loc = self.loss_loc_single(
- loc_preds[i],
- loc_targets[i],
- loc_weights[i],
- avg_factor=loc_avg_factor)
- losses_loc.append(loss_loc)
- # get anchor shape loss
- losses_shape = []
- for i in range(len(shape_preds)):
- loss_shape = self.loss_shape_single(
- shape_preds[i],
- bbox_anchors_list[i],
- bbox_gts_list[i],
- anchor_weights_list[i],
- avg_factor=ga_avg_factor)
- losses_shape.append(loss_shape)
- return dict(
- loss_cls=losses_cls,
- loss_bbox=losses_bbox,
- loss_shape=losses_shape,
- loss_loc=losses_loc)
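- # Layout of the returned dict (illustrative, for a 5-level FPN): every
- # value is a list with one scalar loss per level, e.g.
- #   {'loss_cls': [l0, ..., l4], 'loss_bbox': [l0, ..., l4],
- #    'loss_shape': [l0, ..., l4], 'loss_loc': [l0, ..., l4]}
- # and the detector typically sums all entries into the final training loss.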
- def predict_by_feat(self,
- cls_scores: List[Tensor],
- bbox_preds: List[Tensor],
- shape_preds: List[Tensor],
- loc_preds: List[Tensor],
- batch_img_metas: List[dict],
- cfg: OptConfigType = None,
- rescale: bool = False) -> InstanceList:
- """Transform a batch of output features extracted from the head into
- bbox results.
- Args:
- cls_scores (list[Tensor]): Classification scores for all
- scale levels, each is a 4D-tensor, has shape
- (batch_size, num_priors * num_classes, H, W).
- bbox_preds (list[Tensor]): Box energies / deltas for all
- scale levels, each is a 4D-tensor, has shape
- (batch_size, num_priors * 4, H, W).
- shape_preds (list[Tensor]): shape predictions for each scale
- level with shape (N, num_anchors * 2, H, W).
- loc_preds (list[Tensor]): location predictions for each scale
- level with shape (N, 1, H, W).
- batch_img_metas (list[dict]): Batch image meta info.
- cfg (ConfigDict, optional): Test / postprocessing
- configuration, if None, test_cfg would be used.
- Defaults to None.
- rescale (bool): If True, return boxes in original image space.
- Defaults to False.
- Returns:
- list[:obj:`InstanceData`]: Object detection results of each image
- after the post process. Each item usually contains following keys.
- - scores (Tensor): Classification scores, has a shape
- (num_instance, )
- - labels (Tensor): Labels of bboxes, has a shape (num_instances, ).
- - bboxes (Tensor): Has a shape (num_instances, 4), the last
- dimension 4 arrange as (x1, y1, x2, y2).
- """
- assert len(cls_scores) == len(bbox_preds) == len(shape_preds) == len(
- loc_preds)
- num_levels = len(cls_scores)
- featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
- device = cls_scores[0].device
- # get guided anchors
- _, guided_anchors, loc_masks = self.get_anchors(
- featmap_sizes,
- shape_preds,
- loc_preds,
- batch_img_metas,
- use_loc_filter=not self.training,
- device=device)
- result_list = []
- for img_id in range(len(batch_img_metas)):
- cls_score_list = [
- cls_scores[i][img_id].detach() for i in range(num_levels)
- ]
- bbox_pred_list = [
- bbox_preds[i][img_id].detach() for i in range(num_levels)
- ]
- guided_anchor_list = [
- guided_anchors[img_id][i].detach() for i in range(num_levels)
- ]
- loc_mask_list = [
- loc_masks[img_id][i].detach() for i in range(num_levels)
- ]
- proposals = self._predict_by_feat_single(
- cls_scores=cls_score_list,
- bbox_preds=bbox_pred_list,
- mlvl_anchors=guided_anchor_list,
- mlvl_masks=loc_mask_list,
- img_meta=batch_img_metas[img_id],
- cfg=cfg,
- rescale=rescale)
- result_list.append(proposals)
- return result_list
- def _predict_by_feat_single(self,
- cls_scores: List[Tensor],
- bbox_preds: List[Tensor],
- mlvl_anchors: List[Tensor],
- mlvl_masks: List[Tensor],
- img_meta: dict,
- cfg: ConfigType,
- rescale: bool = False) -> InstanceData:
- """Transform a single image's features extracted from the head into
- bbox results.
- Args:
- cls_scores (list[Tensor]): Box scores from all scale
- levels of a single image, each item has shape
- (num_priors * num_classes, H, W).
- bbox_preds (list[Tensor]): Box energies / deltas from
- all scale levels of a single image, each item has shape
- (num_priors * 4, H, W).
- mlvl_anchors (list[Tensor]): Each element in the list is
- the anchors of a single level in feature pyramid. it has
- shape (num_priors, 4).
- mlvl_masks (list[Tensor]): Each element in the list is location
- masks of a single level.
- img_meta (dict): Image meta info.
- cfg (:obj:`ConfigDict` or dict): Test / postprocessing
- configuration, if None, test_cfg would be used.
- rescale (bool): If True, return boxes in original image space.
- Defaults to False.
- Returns:
- :obj:`InstanceData`: Detection results of each image
- after the post process.
- Each item usually contains following keys.
- - scores (Tensor): Classification scores, has a shape
- (num_instance, )
- - labels (Tensor): Labels of bboxes, has a shape (num_instances, ).
- - bboxes (Tensor): Has a shape (num_instances, 4), the last
- dimension 4 arrange as (x1, y1, x2, y2).
- """
- cfg = self.test_cfg if cfg is None else cfg
- assert len(cls_scores) == len(bbox_preds) == len(mlvl_anchors)
- mlvl_bbox_preds = []
- mlvl_valid_anchors = []
- mlvl_scores = []
- for cls_score, bbox_pred, anchors, mask in zip(cls_scores, bbox_preds,
- mlvl_anchors,
- mlvl_masks):
- assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
- # if no location is kept, end.
- if mask.sum() == 0:
- continue
- # reshape scores and bbox_pred
- cls_score = cls_score.permute(1, 2,
- 0).reshape(-1, self.cls_out_channels)
- if self.use_sigmoid_cls:
- scores = cls_score.sigmoid()
- else:
- scores = cls_score.softmax(-1)
- bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4)
- # filter scores, bbox_pred w.r.t. mask.
- # anchors are filtered in get_anchors() beforehand.
- scores = scores[mask, :]
- bbox_pred = bbox_pred[mask, :]
- if scores.dim() == 0:
- anchors = anchors.unsqueeze(0)
- scores = scores.unsqueeze(0)
- bbox_pred = bbox_pred.unsqueeze(0)
- # filter anchors, bbox_pred, scores w.r.t. scores
- nms_pre = cfg.get('nms_pre', -1)
- if nms_pre > 0 and scores.shape[0] > nms_pre:
- if self.use_sigmoid_cls:
- max_scores, _ = scores.max(dim=1)
- else:
- # remind that we set FG labels to [0, num_class-1]
- # since mmdet v2.0
- # BG cat_id: num_class
- max_scores, _ = scores[:, :-1].max(dim=1)
- _, topk_inds = max_scores.topk(nms_pre)
- anchors = anchors[topk_inds, :]
- bbox_pred = bbox_pred[topk_inds, :]
- scores = scores[topk_inds, :]
- mlvl_bbox_preds.append(bbox_pred)
- mlvl_valid_anchors.append(anchors)
- mlvl_scores.append(scores)
- mlvl_bbox_preds = torch.cat(mlvl_bbox_preds)
- mlvl_anchors = torch.cat(mlvl_valid_anchors)
- mlvl_scores = torch.cat(mlvl_scores)
- mlvl_bboxes = self.bbox_coder.decode(
- mlvl_anchors, mlvl_bbox_preds, max_shape=img_meta['img_shape'])
- if rescale:
- assert img_meta.get('scale_factor') is not None
- mlvl_bboxes /= mlvl_bboxes.new_tensor(
- img_meta['scale_factor']).repeat((1, 2))
- if self.use_sigmoid_cls:
- # Add a dummy background class to the backend when using sigmoid
- # remind that we set FG labels to [0, num_class-1] since mmdet v2.0
- # BG cat_id: num_class
- padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1)
- mlvl_scores = torch.cat([mlvl_scores, padding], dim=1)
- # multi class NMS
- det_bboxes, det_labels = multiclass_nms(mlvl_bboxes, mlvl_scores,
- cfg.score_thr, cfg.nms,
- cfg.max_per_img)
- results = InstanceData()
- results.bboxes = det_bboxes[:, :-1]
- results.scores = det_bboxes[:, -1]
- results.labels = det_labels
- return results
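- # End-to-end inference sketch (all config values and sizes below are
- # illustrative; a single stride-16 level and one image are assumed):
- #   >>> head = GuidedAnchorHead(
- #   ...     num_classes=80, in_channels=256,
- #   ...     approx_anchor_generator=dict(
- #   ...         type='AnchorGenerator', octave_base_scale=8,
- #   ...         scales_per_octave=3, ratios=[0.5, 1.0, 2.0], strides=[16]),
- #   ...     square_anchor_generator=dict(
- #   ...         type='AnchorGenerator', ratios=[1.0], scales=[8],
- #   ...         strides=[16]),
- #   ...     test_cfg=dict(nms_pre=1000, score_thr=0.05,
- #   ...                   nms=dict(type='nms', iou_threshold=0.5),
- #   ...                   max_per_img=100))
- #   >>> feats = [torch.rand(1, 256, 32, 32)]
- #   >>> cls_scores, bbox_preds, shape_preds, loc_preds = head(feats)
- #   >>> results = head.predict_by_feat(
- #   ...     cls_scores, bbox_preds, shape_preds, loc_preds,
- #   ...     batch_img_metas=[dict(img_shape=(512, 512),
- #   ...                           pad_shape=(512, 512),
- #   ...                           scale_factor=(1.0, 1.0))])
- #   >>> results[0].bboxes.shape   # -> torch.Size([num_dets, 4])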
|