12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084 |
- # Copyright (c) OpenMMLab. All rights reserved.
- from logging import warning
- from math import ceil, log
- from typing import List, Optional, Sequence, Tuple
- import torch
- import torch.nn as nn
- from mmcv.cnn import ConvModule
- from mmcv.ops import CornerPool, batched_nms
- from mmengine.config import ConfigDict
- from mmengine.model import BaseModule, bias_init_with_prob
- from mmengine.structures import InstanceData
- from torch import Tensor
- from mmdet.registry import MODELS
- from mmdet.utils import (ConfigType, InstanceList, OptConfigType,
- OptInstanceList, OptMultiConfig)
- from ..utils import (gather_feat, gaussian_radius, gen_gaussian_target,
- get_local_maximum, get_topk_from_heatmap, multi_apply,
- transpose_and_gather_feat)
- from .base_dense_head import BaseDenseHead
class BiCornerPool(BaseModule):
    """Bidirectional Corner Pooling Module (TopLeft, BottomRight, etc.)

    The input goes through two parallel 3x3 conv branches, each followed by
    a ``CornerPool`` in one direction. The two pooled features are summed,
    refined by a 3x3 conv (no activation) and added to a 1x1-conv shortcut
    of the input. A ReLU and a final 3x3 conv produce the output.

    Args:
        in_channels (int): Input channels of module.
        directions (list[str]): Directions of the two CornerPools, e.g.
            ``['top', 'left']``.
        feat_channels (int): Feature channels of the two pooling branches.
            Defaults to 128.
        out_channels (int): Output channels of module. Defaults to 128.
        norm_cfg (:obj:`ConfigDict` or dict): Dictionary to construct
            and config norm layer.
        init_cfg (:obj:`ConfigDict` or dict, optional): the config to
            control the initialization.
    """

    def __init__(self,
                 in_channels: int,
                 directions: List[str],
                 feat_channels: int = 128,
                 out_channels: int = 128,
                 norm_cfg: ConfigType = dict(type='BN', requires_grad=True),
                 init_cfg: OptMultiConfig = None) -> None:
        super().__init__(init_cfg)
        self.direction1_conv = ConvModule(
            in_channels, feat_channels, 3, padding=1, norm_cfg=norm_cfg)
        self.direction2_conv = ConvModule(
            in_channels, feat_channels, 3, padding=1, norm_cfg=norm_cfg)

        self.aftpool_conv = ConvModule(
            feat_channels,
            out_channels,
            3,
            padding=1,
            norm_cfg=norm_cfg,
            act_cfg=None)

        self.conv1 = ConvModule(
            in_channels, out_channels, 1, norm_cfg=norm_cfg, act_cfg=None)
        # conv2 consumes ``relu(aftpool_conv + conv1)``, which has
        # ``out_channels`` channels, so its input width must be
        # ``out_channels``. Identical to the previous layer whenever
        # in_channels == out_channels.
        self.conv2 = ConvModule(
            out_channels, out_channels, 3, padding=1, norm_cfg=norm_cfg)

        self.direction1_pool = CornerPool(directions[0])
        self.direction2_pool = CornerPool(directions[1])
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x: Tensor) -> Tensor:
        """Forward features from the upstream network.

        Args:
            x (tensor): Input feature of BiCornerPool.

        Returns:
            conv2 (tensor): Output feature of BiCornerPool.
        """
        # Two directional branches: conv followed by corner pooling.
        direction1_feat = self.direction1_pool(self.direction1_conv(x))
        direction2_feat = self.direction2_pool(self.direction2_conv(x))
        aftpool_conv = self.aftpool_conv(direction1_feat + direction2_feat)
        # Shortcut from the input, fused with the pooled features.
        conv1 = self.conv1(x)
        relu = self.relu(aftpool_conv + conv1)
        conv2 = self.conv2(relu)
        return conv2
- @MODELS.register_module()
- class CornerHead(BaseDenseHead):
- """Head of CornerNet: Detecting Objects as Paired Keypoints.
- Code is modified from the `official github repo
- <https://github.com/princeton-vl/CornerNet/blob/master/models/py_utils/
- kp.py#L73>`_ .
- More details can be found in the `paper
- <https://arxiv.org/abs/1808.01244>`_ .
- Args:
- num_classes (int): Number of categories excluding the background
- category.
- in_channels (int): Number of channels in the input feature map.
- num_feat_levels (int): Levels of feature from the previous module.
- 2 for HourglassNet-104 and 1 for HourglassNet-52. Because
- HourglassNet-104 outputs the final feature and intermediate
- supervision feature and HourglassNet-52 only outputs the final
- feature. Defaults to 2.
- corner_emb_channels (int): Channel of embedding vector. Defaults to 1.
- train_cfg (:obj:`ConfigDict` or dict, optional): Training config.
- Useless in CornerHead, but we keep this variable for
- SingleStageDetector.
- test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of
- CornerHead.
- loss_heatmap (:obj:`ConfigDict` or dict): Config of corner heatmap
- loss. Defaults to GaussianFocalLoss.
- loss_embedding (:obj:`ConfigDict` or dict): Config of corner embedding
- loss. Defaults to AssociativeEmbeddingLoss.
- loss_offset (:obj:`ConfigDict` or dict): Config of corner offset loss.
- Defaults to SmoothL1Loss.
- init_cfg (:obj:`ConfigDict` or dict, optional): the config to control
- the initialization.
- """
def __init__(self,
             num_classes: int,
             in_channels: int,
             num_feat_levels: int = 2,
             corner_emb_channels: int = 1,
             train_cfg: OptConfigType = None,
             test_cfg: OptConfigType = None,
             loss_heatmap: ConfigType = dict(
                 type='GaussianFocalLoss',
                 alpha=2.0,
                 gamma=4.0,
                 loss_weight=1),
             loss_embedding: ConfigType = dict(
                 type='AssociativeEmbeddingLoss',
                 pull_weight=0.25,
                 push_weight=0.25),
             loss_offset: ConfigType = dict(
                 type='SmoothL1Loss', beta=1.0, loss_weight=1),
             init_cfg: OptMultiConfig = None) -> None:
    """Store the head configuration, build the losses and the layers.

    Each loss config is built through ``MODELS.build`` unless it is None,
    in which case the corresponding attribute is set to None. The layers
    are created last by ``self._init_layers()``.
    """
    assert init_cfg is None, ('To prevent abnormal initialization '
                              'behavior, init_cfg is not allowed to be set')
    super().__init__(init_cfg=init_cfg)

    # Plain configuration values.
    self.num_classes, self.in_channels = num_classes, in_channels
    self.corner_emb_channels = corner_emb_channels
    # The embedding branch exists only for a positive channel count.
    self.with_corner_emb = self.corner_emb_channels > 0
    self.corner_offset_channels = 2
    self.num_feat_levels = num_feat_levels

    # Build the losses in a fixed order: heatmap, embedding, offset.
    loss_cfgs = (('loss_heatmap', loss_heatmap),
                 ('loss_embedding', loss_embedding),
                 ('loss_offset', loss_offset))
    for attr_name, loss_cfg in loss_cfgs:
        built = None if loss_cfg is None else MODELS.build(loss_cfg)
        setattr(self, attr_name, built)

    self.train_cfg, self.test_cfg = train_cfg, test_cfg

    self._init_layers()
def _make_layers(self,
                 out_channels: int,
                 in_channels: int = 256,
                 feat_channels: int = 256) -> nn.Sequential:
    """Build a two-layer conv branch for CornerHead.

    The branch is a 3x3 ``ConvModule`` (padding 1) mapping ``in_channels``
    to ``feat_channels``, followed by a 1x1 ``ConvModule`` without norm or
    activation mapping ``feat_channels`` to ``out_channels``.
    """
    hidden_conv = ConvModule(in_channels, feat_channels, 3, padding=1)
    output_conv = ConvModule(
        feat_channels, out_channels, 1, norm_cfg=None, act_cfg=None)
    return nn.Sequential(hidden_conv, output_conv)
def _init_corner_kpt_layers(self) -> None:
    """Create the corner keypoint layers for every feature level.

    Per level this builds a corner pooling module, a heatmap branch and an
    offset branch, each in a top-left (``tl_``) and a bottom-right
    (``br_``) variant.
    """
    self.tl_pool, self.br_pool = nn.ModuleList(), nn.ModuleList()
    self.tl_heat, self.br_heat = nn.ModuleList(), nn.ModuleList()
    self.tl_off, self.br_off = nn.ModuleList(), nn.ModuleList()

    def _corner_pool(directions):
        # Corner pooling keeps the channel count of the input feature.
        return BiCornerPool(
            self.in_channels, directions, out_channels=self.in_channels)

    def _branch(channels):
        return self._make_layers(
            out_channels=channels, in_channels=self.in_channels)

    # Modules are created in the order tl/br pool, tl/br heat, tl/br off
    # within each level.
    for _ in range(self.num_feat_levels):
        self.tl_pool.append(_corner_pool(['top', 'left']))
        self.br_pool.append(_corner_pool(['bottom', 'right']))

        self.tl_heat.append(_branch(self.num_classes))
        self.br_heat.append(_branch(self.num_classes))

        self.tl_off.append(_branch(self.corner_offset_channels))
        self.br_off.append(_branch(self.corner_offset_channels))
- def _init_corner_emb_layers(self) -> None:
- """Initialize corner embedding layers.
- Only include corner embedding branch with two parts: prefix `tl_` for
- top-left and `br_` for bottom-right.
- """
- self.tl_emb, self.br_emb = nn.ModuleList(), nn.ModuleList()
- for _ in range(self.num_feat_levels):
- self.tl_emb.append(
- self._make_layers(
- out_channels=self.corner_emb_channels,
- in_channels=self.in_channels))
- self.br_emb.append(
- self._make_layers(
- out_channels=self.corner_emb_channels,
- in_channels=self.in_channels))
- def _init_layers(self) -> None:
- """Initialize layers for CornerHead.
- Including two parts: corner keypoint layers and corner embedding layers
- """
- self._init_corner_kpt_layers()
- if self.with_corner_emb:
- self._init_corner_emb_layers()
def init_weights(self) -> None:
    """Initialize the weights of the final conv of every branch.

    After the parent initialization, the last conv of each branch is reset
    with ``reset_parameters()``. The heatmap branches additionally get
    their bias filled with ``bias_init_with_prob(0.1)``.
    """
    super().init_weights()
    bias_init = bias_init_with_prob(0.1)
    for lvl in range(self.num_feat_levels):
        # The initialization of parameters are different between
        # nn.Conv2d and ConvModule. Our experiments show that
        # using the original initialization of nn.Conv2d increases
        # the final mAP by about 0.2%
        for heat_branches in (self.tl_heat, self.br_heat):
            heat_conv = heat_branches[lvl][-1].conv
            heat_conv.reset_parameters()
            heat_conv.bias.data.fill_(bias_init)

        # Offset (and optional embedding) branches keep the default bias.
        plain_branches = [self.tl_off, self.br_off]
        if self.with_corner_emb:
            plain_branches.extend([self.tl_emb, self.br_emb])
        for branches in plain_branches:
            branches[lvl][-1].conv.reset_parameters()
def forward(self, feats: Tuple[Tensor]) -> tuple:
    """Forward features from the upstream network.

    ``forward_single`` is applied to each feature together with its level
    index via ``multi_apply``.

    Args:
        feats (tuple[Tensor]): Features from the upstream network, each is
            a 4D-tensor.

    Returns:
        tuple: Usually a tuple of corner heatmaps, offset heatmaps and
        embedding heatmaps.

        - tl_heats (list[Tensor]): Top-left corner heatmaps for all
          levels, each is a 4D-tensor, the channels number is
          num_classes.
        - br_heats (list[Tensor]): Bottom-right corner heatmaps for all
          levels, each is a 4D-tensor, the channels number is
          num_classes.
        - tl_embs (list[Tensor] | list[None]): Top-left embedding
          heatmaps for all levels, each is a 4D-tensor or None.
          If not None, the channels number is corner_emb_channels.
        - br_embs (list[Tensor] | list[None]): Bottom-right embedding
          heatmaps for all levels, each is a 4D-tensor or None.
          If not None, the channels number is corner_emb_channels.
        - tl_offs (list[Tensor]): Top-left offset heatmaps for all
          levels, each is a 4D-tensor. The channels number is
          corner_offset_channels.
        - br_offs (list[Tensor]): Bottom-right offset heatmaps for all
          levels, each is a 4D-tensor. The channels number is
          corner_offset_channels.
    """
    level_indices = [lvl for lvl in range(self.num_feat_levels)]
    return multi_apply(self.forward_single, feats, level_indices)
- def forward_single(self,
- x: Tensor,
- lvl_ind: int,
- return_pool: bool = False) -> List[Tensor]:
- """Forward feature of a single level.
- Args:
- x (Tensor): Feature of a single level.
- lvl_ind (int): Level index of current feature.
- return_pool (bool): Return corner pool feature or not.
- Defaults to False.
- Returns:
- tuple[Tensor]: A tuple of CornerHead's output for current feature
- level. Containing the following Tensors:
- - tl_heat (Tensor): Predicted top-left corner heatmap.
- - br_heat (Tensor): Predicted bottom-right corner heatmap.
- - tl_emb (Tensor | None): Predicted top-left embedding heatmap.
- None for `self.with_corner_emb == False`.
- - br_emb (Tensor | None): Predicted bottom-right embedding
- heatmap. None for `self.with_corner_emb == False`.
- - tl_off (Tensor): Predicted top-left offset heatmap.
- - br_off (Tensor): Predicted bottom-right offset heatmap.
- - tl_pool (Tensor): Top-left corner pool feature. Not must
- have.
- - br_pool (Tensor): Bottom-right corner pool feature. Not must
- have.
- """
- tl_pool = self.tl_pool[lvl_ind](x)
- tl_heat = self.tl_heat[lvl_ind](tl_pool)
- br_pool = self.br_pool[lvl_ind](x)
- br_heat = self.br_heat[lvl_ind](br_pool)
- tl_emb, br_emb = None, None
- if self.with_corner_emb:
- tl_emb = self.tl_emb[lvl_ind](tl_pool)
- br_emb = self.br_emb[lvl_ind](br_pool)
- tl_off = self.tl_off[lvl_ind](tl_pool)
- br_off = self.br_off[lvl_ind](br_pool)
- result_list = [tl_heat, br_heat, tl_emb, br_emb, tl_off, br_off]
- if return_pool:
- result_list.append(tl_pool)
- result_list.append(br_pool)
- return result_list
def get_targets(self,
                gt_bboxes: List[Tensor],
                gt_labels: List[Tensor],
                feat_shape: Sequence[int],
                img_shape: Sequence[int],
                with_corner_emb: bool = False,
                with_guiding_shift: bool = False,
                with_centripetal_shift: bool = False) -> dict:
    """Generate corner targets.

    Corner heatmap and corner offset targets are always generated.
    Corner embedding, guiding shift and centripetal shift targets are
    generated on request. CornerNet uses heatmap, offset and embedding;
    CentripetalNet uses heatmap, offset, guiding shift and centripetal
    shift.

    Args:
        gt_bboxes (list[Tensor]): Ground truth bboxes of each image, each
            has shape (num_gt, 4).
        gt_labels (list[Tensor]): Ground truth labels of each box, each has
            shape (num_gt, ).
        feat_shape (Sequence[int]): Shape of output feature,
            [batch, channel, height, width].
        img_shape (Sequence[int]): Shape of input image,
            [height, width, channel].
        with_corner_emb (bool): Generate corner embedding target or not.
            Defaults to False.
        with_guiding_shift (bool): Generate guiding shift target or not.
            Defaults to False.
        with_centripetal_shift (bool): Generate centripetal shift target or
            not. Defaults to False.

    Returns:
        dict: Ground truth of corner heatmap, corner offset, corner
        embedding, guiding shift and centripetal shift. Containing the
        following keys:

        - topleft_heatmap (Tensor): Ground truth top-left corner
          heatmap.
        - bottomright_heatmap (Tensor): Ground truth bottom-right
          corner heatmap.
        - topleft_offset (Tensor): Ground truth top-left corner offset.
        - bottomright_offset (Tensor): Ground truth bottom-right corner
          offset.
        - corner_embedding (list[list[list[int]]]): Ground truth corner
          embedding. Only with ``with_corner_emb``.
        - topleft_guiding_shift (Tensor): Ground truth top-left corner
          guiding shift. Only with ``with_guiding_shift``.
        - bottomright_guiding_shift (Tensor): Ground truth bottom-right
          corner guiding shift. Only with ``with_guiding_shift``.
        - topleft_centripetal_shift (Tensor): Ground truth top-left
          corner centripetal shift. Only with
          ``with_centripetal_shift``.
        - bottomright_centripetal_shift (Tensor): Ground truth
          bottom-right corner centripetal shift. Only with
          ``with_centripetal_shift``.
    """
    batch_size, _, height, width = feat_shape
    img_h, img_w = img_shape[:2]

    # Scale factors from image coordinates to feature-map coordinates.
    width_ratio = float(width / img_w)
    height_ratio = float(height / img_h)

    # All target tensors inherit dtype/device from the last gt box tensor.
    template = gt_bboxes[-1]

    def _zeros(channels: int) -> Tensor:
        return template.new_zeros([batch_size, channels, height, width])

    gt_tl_heatmap = _zeros(self.num_classes)
    gt_br_heatmap = _zeros(self.num_classes)
    gt_tl_offset = _zeros(2)
    gt_br_offset = _zeros(2)

    if with_corner_emb:
        match = []

    # Guiding shift is a kind of offset, from center to corner
    if with_guiding_shift:
        gt_tl_guiding_shift = _zeros(2)
        gt_br_guiding_shift = _zeros(2)
    # Centripetal shift is also a kind of offset, from center to corner
    # and normalized by log.
    if with_centripetal_shift:
        gt_tl_centripetal_shift = _zeros(2)
        gt_br_centripetal_shift = _zeros(2)

    for img_idx in range(batch_size):
        # Ground truth of corner embedding per image is a list of coord set
        corner_pairs = []
        for obj_idx in range(len(gt_labels[img_idx])):
            left, top, right, bottom = gt_bboxes[img_idx][obj_idx]
            center_x = (left + right) / 2.0
            center_y = (top + bottom) / 2.0
            cls_idx = gt_labels[img_idx][obj_idx]

            # Use coords in the feature level to generate ground truth
            feat_left = left * width_ratio
            feat_right = right * width_ratio
            feat_top = top * height_ratio
            feat_bottom = bottom * height_ratio
            feat_cx = center_x * width_ratio
            feat_cy = center_y * height_ratio

            # Int coords on feature map/ground truth tensor
            tl_col = int(min(feat_left, width - 1))
            br_col = int(min(feat_right, width - 1))
            tl_row = int(min(feat_top, height - 1))
            br_row = int(min(feat_bottom, height - 1))

            # Generate gaussian heatmap
            box_w = ceil(feat_right - feat_left)
            box_h = ceil(feat_bottom - feat_top)
            radius = gaussian_radius((box_h, box_w), min_overlap=0.3)
            radius = max(0, int(radius))
            gt_tl_heatmap[img_idx, cls_idx] = gen_gaussian_target(
                gt_tl_heatmap[img_idx, cls_idx], [tl_col, tl_row], radius)
            gt_br_heatmap[img_idx, cls_idx] = gen_gaussian_target(
                gt_br_heatmap[img_idx, cls_idx], [br_col, br_row], radius)

            # Generate corner offset: sub-pixel remainder lost by the
            # integer conversion above (channel 0 = x, channel 1 = y).
            gt_tl_offset[img_idx, 0, tl_row, tl_col] = feat_left - tl_col
            gt_tl_offset[img_idx, 1, tl_row, tl_col] = feat_top - tl_row
            gt_br_offset[img_idx, 0, br_row, br_col] = feat_right - br_col
            gt_br_offset[img_idx, 1, br_row, br_col] = feat_bottom - br_row

            # Generate corner embedding
            if with_corner_emb:
                corner_pairs.append([[tl_row, tl_col], [br_row, br_col]])

            # Generate guiding shift
            if with_guiding_shift:
                gt_tl_guiding_shift[img_idx, 0, tl_row,
                                    tl_col] = feat_cx - tl_col
                gt_tl_guiding_shift[img_idx, 1, tl_row,
                                    tl_col] = feat_cy - tl_row
                gt_br_guiding_shift[img_idx, 0, br_row,
                                    br_col] = br_col - feat_cx
                gt_br_guiding_shift[img_idx, 1, br_row,
                                    br_col] = br_row - feat_cy

            # Generate centripetal shift
            if with_centripetal_shift:
                gt_tl_centripetal_shift[img_idx, 0, tl_row,
                                        tl_col] = log(feat_cx - feat_left)
                gt_tl_centripetal_shift[img_idx, 1, tl_row,
                                        tl_col] = log(feat_cy - feat_top)
                gt_br_centripetal_shift[img_idx, 0, br_row,
                                        br_col] = log(feat_right - feat_cx)
                gt_br_centripetal_shift[img_idx, 1, br_row,
                                        br_col] = log(feat_bottom - feat_cy)

        if with_corner_emb:
            match.append(corner_pairs)

    target_result = dict(
        topleft_heatmap=gt_tl_heatmap,
        topleft_offset=gt_tl_offset,
        bottomright_heatmap=gt_br_heatmap,
        bottomright_offset=gt_br_offset)

    if with_corner_emb:
        target_result['corner_embedding'] = match
    if with_guiding_shift:
        target_result['topleft_guiding_shift'] = gt_tl_guiding_shift
        target_result['bottomright_guiding_shift'] = gt_br_guiding_shift
    if with_centripetal_shift:
        target_result['topleft_centripetal_shift'] = \
            gt_tl_centripetal_shift
        target_result['bottomright_centripetal_shift'] = \
            gt_br_centripetal_shift

    return target_result
def loss_by_feat(
        self,
        tl_heats: List[Tensor],
        br_heats: List[Tensor],
        tl_embs: List[Tensor],
        br_embs: List[Tensor],
        tl_offs: List[Tensor],
        br_offs: List[Tensor],
        batch_gt_instances: InstanceList,
        batch_img_metas: List[dict],
        batch_gt_instances_ignore: OptInstanceList = None) -> dict:
    """Calculate the loss based on the features extracted by the detection
    head.

    One set of targets is generated from the shape of the last-level
    heatmap and shared by all feature levels.

    Args:
        tl_heats (list[Tensor]): Top-left corner heatmaps for each level
            with shape (N, num_classes, H, W).
        br_heats (list[Tensor]): Bottom-right corner heatmaps for each
            level with shape (N, num_classes, H, W).
        tl_embs (list[Tensor]): Top-left corner embeddings for each level
            with shape (N, corner_emb_channels, H, W).
        br_embs (list[Tensor]): Bottom-right corner embeddings for each
            level with shape (N, corner_emb_channels, H, W).
        tl_offs (list[Tensor]): Top-left corner offsets for each level
            with shape (N, corner_offset_channels, H, W).
        br_offs (list[Tensor]): Bottom-right corner offsets for each level
            with shape (N, corner_offset_channels, H, W).
        batch_gt_instances (list[:obj:`InstanceData`]): Batch of
            gt_instance. It usually includes ``bboxes`` and ``labels``
            attributes.
        batch_img_metas (list[dict]): Meta information of each image, e.g.,
            image size, scaling factor, etc.
        batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
            Specify which bounding boxes can be ignored when computing
            the loss. Not used by this method.

    Returns:
        dict[str, Tensor]: A dictionary of loss components. Containing the
        following losses:

        - det_loss (list[Tensor]): Corner keypoint losses of all
          feature levels.
        - pull_loss (list[Tensor]): Part one of AssociativeEmbedding
          losses of all feature levels. Only with corner embedding.
        - push_loss (list[Tensor]): Part two of AssociativeEmbedding
          losses of all feature levels. Only with corner embedding.
        - off_loss (list[Tensor]): Corner offset losses of all feature
          levels.
    """
    gt_bboxes = [instances.bboxes for instances in batch_gt_instances]
    gt_labels = [instances.labels for instances in batch_gt_instances]

    shared_targets = self.get_targets(
        gt_bboxes,
        gt_labels,
        tl_heats[-1].shape,
        batch_img_metas[0]['batch_input_shape'],
        with_corner_emb=self.with_corner_emb)
    # Every level is supervised by the very same target dict.
    mlvl_targets = [shared_targets] * self.num_feat_levels

    det_losses, pull_losses, push_losses, off_losses = multi_apply(
        self.loss_by_feat_single, tl_heats, br_heats, tl_embs, br_embs,
        tl_offs, br_offs, mlvl_targets)

    loss_dict = {'det_loss': det_losses, 'off_loss': off_losses}
    if self.with_corner_emb:
        loss_dict['pull_loss'] = pull_losses
        loss_dict['push_loss'] = push_losses
    return loss_dict
- def loss_by_feat_single(self, tl_hmp: Tensor, br_hmp: Tensor,
- tl_emb: Optional[Tensor], br_emb: Optional[Tensor],
- tl_off: Tensor, br_off: Tensor,
- targets: dict) -> Tuple[Tensor, ...]:
- """Calculate the loss of a single scale level based on the features
- extracted by the detection head.
- Args:
- tl_hmp (Tensor): Top-left corner heatmap for current level with
- shape (N, num_classes, H, W).
- br_hmp (Tensor): Bottom-right corner heatmap for current level with
- shape (N, num_classes, H, W).
- tl_emb (Tensor, optional): Top-left corner embedding for current
- level with shape (N, corner_emb_channels, H, W).
- br_emb (Tensor, optional): Bottom-right corner embedding for
- current level with shape (N, corner_emb_channels, H, W).
- tl_off (Tensor): Top-left corner offset for current level with
- shape (N, corner_offset_channels, H, W).
- br_off (Tensor): Bottom-right corner offset for current level with
- shape (N, corner_offset_channels, H, W).
- targets (dict): Corner target generated by `get_targets`.
- Returns:
- tuple[torch.Tensor]: Losses of the head's different branches
- containing the following losses:
- - det_loss (Tensor): Corner keypoint loss.
- - pull_loss (Tensor): Part one of AssociativeEmbedding loss.
- - push_loss (Tensor): Part two of AssociativeEmbedding loss.
- - off_loss (Tensor): Corner offset loss.
- """
- gt_tl_hmp = targets['topleft_heatmap']
- gt_br_hmp = targets['bottomright_heatmap']
- gt_tl_off = targets['topleft_offset']
- gt_br_off = targets['bottomright_offset']
- gt_embedding = targets['corner_embedding']
- # Detection loss
- tl_det_loss = self.loss_heatmap(
- tl_hmp.sigmoid(),
- gt_tl_hmp,
- avg_factor=max(1,
- gt_tl_hmp.eq(1).sum()))
- br_det_loss = self.loss_heatmap(
- br_hmp.sigmoid(),
- gt_br_hmp,
- avg_factor=max(1,
- gt_br_hmp.eq(1).sum()))
- det_loss = (tl_det_loss + br_det_loss) / 2.0
- # AssociativeEmbedding loss
- if self.with_corner_emb and self.loss_embedding is not None:
- pull_loss, push_loss = self.loss_embedding(tl_emb, br_emb,
- gt_embedding)
- else:
- pull_loss, push_loss = None, None
- # Offset loss
- # We only compute the offset loss at the real corner position.
- # The value of real corner would be 1 in heatmap ground truth.
- # The mask is computed in class agnostic mode and its shape is
- # batch * 1 * width * height.
- tl_off_mask = gt_tl_hmp.eq(1).sum(1).gt(0).unsqueeze(1).type_as(
- gt_tl_hmp)
- br_off_mask = gt_br_hmp.eq(1).sum(1).gt(0).unsqueeze(1).type_as(
- gt_br_hmp)
- tl_off_loss = self.loss_offset(
- tl_off,
- gt_tl_off,
- tl_off_mask,
- avg_factor=max(1, tl_off_mask.sum()))
- br_off_loss = self.loss_offset(
- br_off,
- gt_br_off,
- br_off_mask,
- avg_factor=max(1, br_off_mask.sum()))
- off_loss = (tl_off_loss + br_off_loss) / 2.0
- return det_loss, pull_loss, push_loss, off_loss
- def predict_by_feat(self,
- tl_heats: List[Tensor],
- br_heats: List[Tensor],
- tl_embs: List[Tensor],
- br_embs: List[Tensor],
- tl_offs: List[Tensor],
- br_offs: List[Tensor],
- batch_img_metas: Optional[List[dict]] = None,
- rescale: bool = False,
- with_nms: bool = True) -> InstanceList:
- """Transform a batch of output features extracted from the head into
- bbox results.
- Args:
- tl_heats (list[Tensor]): Top-left corner heatmaps for each level
- with shape (N, num_classes, H, W).
- br_heats (list[Tensor]): Bottom-right corner heatmaps for each
- level with shape (N, num_classes, H, W).
- tl_embs (list[Tensor]): Top-left corner embeddings for each level
- with shape (N, corner_emb_channels, H, W).
- br_embs (list[Tensor]): Bottom-right corner embeddings for each
- level with shape (N, corner_emb_channels, H, W).
- tl_offs (list[Tensor]): Top-left corner offsets for each level
- with shape (N, corner_offset_channels, H, W).
- br_offs (list[Tensor]): Bottom-right corner offsets for each level
- with shape (N, corner_offset_channels, H, W).
- batch_img_metas (list[dict], optional): Batch image meta info.
- Defaults to None.
- rescale (bool): If True, return boxes in original image space.
- Defaults to False.
- with_nms (bool): If True, do nms before return boxes.
- Defaults to True.
- Returns:
- list[:obj:`InstanceData`]: Object detection results of each image
- after the post process. Each item usually contains following keys.
- - scores (Tensor): Classification scores, has a shape
- (num_instance, )
- - labels (Tensor): Labels of bboxes, has a shape
- (num_instances, ).
- - bboxes (Tensor): Has a shape (num_instances, 4),
- the last dimension 4 arrange as (x1, y1, x2, y2).
- """
- assert tl_heats[-1].shape[0] == br_heats[-1].shape[0] == len(
- batch_img_metas)
- result_list = []
- for img_id in range(len(batch_img_metas)):
- result_list.append(
- self._predict_by_feat_single(
- tl_heats[-1][img_id:img_id + 1, :],
- br_heats[-1][img_id:img_id + 1, :],
- tl_offs[-1][img_id:img_id + 1, :],
- br_offs[-1][img_id:img_id + 1, :],
- batch_img_metas[img_id],
- tl_emb=tl_embs[-1][img_id:img_id + 1, :],
- br_emb=br_embs[-1][img_id:img_id + 1, :],
- rescale=rescale,
- with_nms=with_nms))
- return result_list
def _predict_by_feat_single(self,
                            tl_heat: Tensor,
                            br_heat: Tensor,
                            tl_off: Tensor,
                            br_off: Tensor,
                            img_meta: dict,
                            tl_emb: Optional[Tensor] = None,
                            br_emb: Optional[Tensor] = None,
                            tl_centripetal_shift: Optional[Tensor] = None,
                            br_centripetal_shift: Optional[Tensor] = None,
                            rescale: bool = False,
                            with_nms: bool = True) -> InstanceData:
    """Transform a single image's features extracted from the head into
    bbox results.

    The heatmaps are passed through a sigmoid and decoded by
    ``self._decode_heatmap`` using ``corner_topk``,
    ``local_maximum_kernel`` and ``distance_threshold`` from
    ``self.test_cfg``. Boxes with a score not above -0.1 are dropped
    before the optional NMS.

    Args:
        tl_heat (Tensor): Top-left corner heatmap for current level with
            shape (N, num_classes, H, W).
        br_heat (Tensor): Bottom-right corner heatmap for current level
            with shape (N, num_classes, H, W).
        tl_off (Tensor): Top-left corner offset for current level with
            shape (N, corner_offset_channels, H, W).
        br_off (Tensor): Bottom-right corner offset for current level with
            shape (N, corner_offset_channels, H, W).
        img_meta (dict): Meta information of current image, e.g.,
            image size, scaling factor, etc. If a list or tuple is given,
            its first element is used.
        tl_emb (Tensor): Top-left corner embedding for current level with
            shape (N, corner_emb_channels, H, W).
        br_emb (Tensor): Bottom-right corner embedding for current level
            with shape (N, corner_emb_channels, H, W).
        tl_centripetal_shift: Top-left corner's centripetal shift for
            current level with shape (N, 2, H, W).
        br_centripetal_shift: Bottom-right corner's centripetal shift for
            current level with shape (N, 2, H, W).
        rescale (bool): If True, return boxes in original image space.
            Defaults to False.
        with_nms (bool): If True, do nms before return boxes.
            Defaults to True.

    Returns:
        :obj:`InstanceData`: Detection results of each image
        after the post process.
        Each item usually contains following keys.

        - scores (Tensor): Classification scores, has a shape
          (num_instance, )
        - labels (Tensor): Labels of bboxes, has a shape
          (num_instances, ).
        - bboxes (Tensor): Has a shape (num_instances, 4),
          the last dimension 4 arrange as (x1, y1, x2, y2).
    """
    if isinstance(img_meta, (list, tuple)):
        img_meta = img_meta[0]

    decoded = self._decode_heatmap(
        tl_heat=tl_heat.sigmoid(),
        br_heat=br_heat.sigmoid(),
        tl_off=tl_off,
        br_off=br_off,
        tl_emb=tl_emb,
        br_emb=br_emb,
        tl_centripetal_shift=tl_centripetal_shift,
        br_centripetal_shift=br_centripetal_shift,
        img_meta=img_meta,
        k=self.test_cfg.corner_topk,
        kernel=self.test_cfg.local_maximum_kernel,
        distance_threshold=self.test_cfg.distance_threshold)
    batch_bboxes, batch_scores, batch_clses = decoded

    if rescale and 'scale_factor' in img_meta:
        # The scale factor is tiled twice to cover (x1, y1, x2, y2); the
        # division is done in place on the decoded boxes.
        box_scale = batch_bboxes.new_tensor(
            img_meta['scale_factor']).repeat((1, 2))
        batch_bboxes /= box_scale

    flat_bboxes = batch_bboxes.view([-1, 4])
    flat_scores = batch_scores.view(-1)
    flat_clses = batch_clses.view(-1)

    # Boxes carry their score as a fifth column from here on.
    det_bboxes = torch.cat([flat_bboxes, flat_scores.unsqueeze(-1)], -1)
    valid = det_bboxes[:, -1] > -0.1
    det_bboxes = det_bboxes[valid]
    det_labels = flat_clses[valid]

    if with_nms:
        det_bboxes, det_labels = self._bboxes_nms(det_bboxes, det_labels,
                                                  self.test_cfg)

    results = InstanceData()
    results.bboxes = det_bboxes[..., :4]
    results.scores = det_bboxes[..., 4]
    results.labels = det_labels
    return results
def _bboxes_nms(self, bboxes: Tensor, labels: Tensor,
                cfg: 'ConfigDict') -> Tuple[Tensor, Tensor]:
    """Apply batched NMS to scored boxes and cap the number of results.

    Args:
        bboxes (Tensor): Scored boxes. Columns ``[:4]`` are passed to NMS
            as box coordinates and the last column as scores.
        labels (Tensor): Label of each box, passed to NMS as the batching
            index.
        cfg (ConfigDict): Test config. Reads ``nms`` (falling back to the
            deprecated ``nms_cfg``) and ``max_per_img``. When only
            ``nms_cfg`` is present it is copied into ``cfg.nms``, i.e. the
            config object is mutated.

    Returns:
        tuple[Tensor, Tensor]: The boxes returned by ``batched_nms`` and
        the labels of the kept boxes, both truncated to ``max_per_img``
        entries when that value is positive. When ``labels`` is empty the
        inputs are returned unchanged.
    """
    # Function-scope import: keeps the deprecation notice working without
    # relying on what the module imports at file level.
    import warnings

    if 'nms_cfg' in cfg:
        warnings.warn('nms_cfg in test_cfg will be deprecated. '
                      'Please rename it as nms')
    if 'nms' not in cfg:
        cfg.nms = cfg.nms_cfg

    if labels.numel() > 0:
        max_num = cfg.max_per_img
        bboxes, keep = batched_nms(bboxes[:, :4],
                                   bboxes[:, -1].contiguous(), labels,
                                   cfg.nms)
        # Always re-index labels with `keep` so boxes and labels stay
        # aligned even when no per-image cap is configured.
        labels = labels[keep]
        if max_num > 0:
            bboxes = bboxes[:max_num]
            labels = labels[:max_num]

    return bboxes, labels
def _decode_heatmap(self,
                    tl_heat: Tensor,
                    br_heat: Tensor,
                    tl_off: Tensor,
                    br_off: Tensor,
                    tl_emb: Optional[Tensor] = None,
                    br_emb: Optional[Tensor] = None,
                    tl_centripetal_shift: Optional[Tensor] = None,
                    br_centripetal_shift: Optional[Tensor] = None,
                    img_meta: Optional[dict] = None,
                    k: int = 100,
                    kernel: int = 3,
                    distance_threshold: float = 0.5,
                    num_dets: int = 1000) -> Tuple[Tensor, Tensor, Tensor]:
    """Transform outputs into detections raw bbox prediction.

    Every one of the top-k top-left corners is paired with every one of
    the top-k bottom-right corners (k * k candidates). Candidates whose
    corners disagree on class, are geometrically inverted, or fail the
    grouping test (embedding distance or centripetal-shift test) get a
    score of -1 so that callers can filter them out.

    Exactly one grouping mode must be supplied: either both embeddings or
    both centripetal shifts.

    Args:
        tl_heat (Tensor): Top-left corner heatmap for current level with
            shape (N, num_classes, H, W).
        br_heat (Tensor): Bottom-right corner heatmap for current level
            with shape (N, num_classes, H, W).
        tl_off (Tensor): Top-left corner offset for current level with
            shape (N, corner_offset_channels, H, W).
        br_off (Tensor): Bottom-right corner offset for current level with
            shape (N, corner_offset_channels, H, W).
        tl_emb (Tensor, Optional): Top-left corner embedding for current
            level with shape (N, corner_emb_channels, H, W).
        br_emb (Tensor, Optional): Bottom-right corner embedding for
            current level with shape (N, corner_emb_channels, H, W).
        tl_centripetal_shift (Tensor, Optional): Top-left centripetal shift
            for current level with shape (N, 2, H, W).
        br_centripetal_shift (Tensor, Optional): Bottom-right centripetal
            shift for current level with shape (N, 2, H, W).
        img_meta (dict): Meta information of current image. Although the
            default is None, it is indexed unconditionally: the key
            ``batch_input_shape`` (or ``pad_shape_for_onnx`` during ONNX
            export) is required and ``border`` is optional.
        k (int): Get top k corner keypoints from heatmap.
        kernel (int): Max pooling kernel for extract local maximum pixels.
        distance_threshold (float): Distance threshold. Top-left and
            bottom-right corner keypoints with feature distance less than
            the threshold will be regarded as keypoints from same object.
        num_dets (int): Num of raw boxes before doing nms. Clamped to
            ``k * k``, the number of candidate corner pairs.

    Returns:
        tuple[torch.Tensor]: Decoded output of CornerHead, containing the
        following Tensors:

        - bboxes (Tensor): Coords of each box.
        - scores (Tensor): Scores of each box, with shape
          (N, min(num_dets, k * k), 1).
        - clses (Tensor): Categories of each box.
    """
    with_embedding = tl_emb is not None and br_emb is not None
    with_centripetal_shift = (
        tl_centripetal_shift is not None
        and br_centripetal_shift is not None)
    assert with_embedding + with_centripetal_shift == 1, (
        'exactly one of corner embeddings or centripetal shifts '
        'must be provided')

    batch, _, height, width = tl_heat.size()
    if torch.onnx.is_in_onnx_export():
        inp_h, inp_w = img_meta['pad_shape_for_onnx'][:2]
    else:
        inp_h, inp_w = img_meta['batch_input_shape'][:2]

    # perform nms on heatmaps
    tl_heat = get_local_maximum(tl_heat, kernel=kernel)
    br_heat = get_local_maximum(br_heat, kernel=kernel)

    tl_scores, tl_inds, tl_clses, tl_ys, tl_xs = get_topk_from_heatmap(
        tl_heat, k=k)
    br_scores, br_inds, br_clses, br_ys, br_xs = get_topk_from_heatmap(
        br_heat, k=k)

    # We use repeat instead of expand here because expand is a
    # shallow-copy function. Thus it could cause unexpected testing result
    # sometimes. Using expand will decrease about 10% mAP during testing
    # compared to repeat.
    tl_ys = tl_ys.view(batch, k, 1).repeat(1, 1, k)
    tl_xs = tl_xs.view(batch, k, 1).repeat(1, 1, k)
    br_ys = br_ys.view(batch, 1, k).repeat(1, k, 1)
    br_xs = br_xs.view(batch, 1, k).repeat(1, k, 1)

    # Refine the integer corner locations with the predicted sub-pixel
    # offsets (channel 0 is added to x, channel 1 to y).
    tl_off = transpose_and_gather_feat(tl_off, tl_inds)
    tl_off = tl_off.view(batch, k, 1, 2)
    br_off = transpose_and_gather_feat(br_off, br_inds)
    br_off = br_off.view(batch, 1, k, 2)

    tl_xs = tl_xs + tl_off[..., 0]
    tl_ys = tl_ys + tl_off[..., 1]
    br_xs = br_xs + br_off[..., 0]
    br_ys = br_ys + br_off[..., 1]

    if with_centripetal_shift:
        tl_centripetal_shift = transpose_and_gather_feat(
            tl_centripetal_shift, tl_inds).view(batch, k, 1, 2).exp()
        br_centripetal_shift = transpose_and_gather_feat(
            br_centripetal_shift, br_inds).view(batch, 1, k, 2).exp()

        # Shifted corners: top-left moves by +shift, bottom-right by
        # -shift.
        tl_ctxs = tl_xs + tl_centripetal_shift[..., 0]
        tl_ctys = tl_ys + tl_centripetal_shift[..., 1]
        br_ctxs = br_xs - br_centripetal_shift[..., 0]
        br_ctys = br_ys - br_centripetal_shift[..., 1]

    # all possible boxes based on top k corners (ignoring class)
    # Rescale from feature-map coordinates to input-image coordinates.
    tl_xs *= (inp_w / width)
    tl_ys *= (inp_h / height)
    br_xs *= (inp_w / width)
    br_ys *= (inp_h / height)

    if with_centripetal_shift:
        tl_ctxs *= (inp_w / width)
        tl_ctys *= (inp_h / height)
        br_ctxs *= (inp_w / width)
        br_ctys *= (inp_h / height)

    x_off, y_off = 0, 0  # no crop
    if not torch.onnx.is_in_onnx_export():
        # since `RandomCenterCropPad` is done on CPU with numpy and it's
        # not dynamic traceable when exporting to ONNX, thus 'border'
        # does not appears as key in 'img_meta'. As a tmp solution,
        # we move this 'border' handle part to the postprocess after
        # finished exporting to ONNX, which is handle in
        # `mmdet/core/export/model_wrappers.py`. Though difference between
        # pytorch and exported onnx model, it might be ignored since
        # comparable performance is achieved between them (e.g. 40.4 vs
        # 40.6 on COCO val2017, for CornerNet without test-time flip)
        if 'border' in img_meta:
            x_off = img_meta['border'][2]
            y_off = img_meta['border'][0]

    tl_xs -= x_off
    tl_ys -= y_off
    br_xs -= x_off
    br_ys -= y_off

    # Clamp coordinates that fall before the image origin to zero.
    zeros = tl_xs.new_zeros(*tl_xs.size())
    tl_xs = torch.where(tl_xs > 0.0, tl_xs, zeros)
    tl_ys = torch.where(tl_ys > 0.0, tl_ys, zeros)
    br_xs = torch.where(br_xs > 0.0, br_xs, zeros)
    br_ys = torch.where(br_ys > 0.0, br_ys, zeros)

    bboxes = torch.stack((tl_xs, tl_ys, br_xs, br_ys), dim=3)
    area_bboxes = ((br_xs - tl_xs) * (br_ys - tl_ys)).abs()

    if with_centripetal_shift:
        tl_ctxs -= x_off
        tl_ctys -= y_off
        br_ctxs -= x_off
        br_ctys -= y_off

        # Zero out negative shifted coordinates.
        tl_ctxs *= tl_ctxs.gt(0.0).type_as(tl_ctxs)
        tl_ctys *= tl_ctys.gt(0.0).type_as(tl_ctys)
        br_ctxs *= br_ctxs.gt(0.0).type_as(br_ctxs)
        br_ctys *= br_ctys.gt(0.0).type_as(br_ctys)

        ct_bboxes = torch.stack((tl_ctxs, tl_ctys, br_ctxs, br_ctys),
                                dim=3)
        area_ct_bboxes = ((br_ctxs - tl_ctxs) * (br_ctys - tl_ctys)).abs()

        rcentral = torch.zeros_like(ct_bboxes)
        # magic nums from paper section 4.1
        mu = torch.ones_like(area_bboxes) / 2.4
        # large bbox have smaller mu; torch.where keeps this free of
        # multi-dimensional bool-mask assignment (see the ONNX note
        # further down).
        mu = torch.where(area_bboxes > 3500,
                         torch.full_like(mu, 1 / 2.1), mu)

        # Central region: the candidate box shrunk by factor `mu` around
        # its own center.
        bboxes_center_x = (bboxes[..., 0] + bboxes[..., 2]) / 2
        bboxes_center_y = (bboxes[..., 1] + bboxes[..., 3]) / 2
        half_w = mu * (bboxes[..., 2] - bboxes[..., 0]) / 2
        half_h = mu * (bboxes[..., 3] - bboxes[..., 1]) / 2
        rcentral[..., 0] = bboxes_center_x - half_w
        rcentral[..., 1] = bboxes_center_y - half_h
        rcentral[..., 2] = bboxes_center_x + half_w
        rcentral[..., 3] = bboxes_center_y + half_h
        area_rcentral = ((rcentral[..., 2] - rcentral[..., 0]) *
                         (rcentral[..., 3] - rcentral[..., 1])).abs()
        dists = area_ct_bboxes / area_rcentral

        # A pair is rejected when any shifted corner coordinate lies on
        # or outside the central region.
        tl_ctx_inds = (ct_bboxes[..., 0] <= rcentral[..., 0]) | (
            ct_bboxes[..., 0] >= rcentral[..., 2])
        tl_cty_inds = (ct_bboxes[..., 1] <= rcentral[..., 1]) | (
            ct_bboxes[..., 1] >= rcentral[..., 3])
        br_ctx_inds = (ct_bboxes[..., 2] <= rcentral[..., 0]) | (
            ct_bboxes[..., 2] >= rcentral[..., 2])
        br_cty_inds = (ct_bboxes[..., 3] <= rcentral[..., 1]) | (
            ct_bboxes[..., 3] >= rcentral[..., 3])

    if with_embedding:
        tl_emb = transpose_and_gather_feat(tl_emb, tl_inds)
        tl_emb = tl_emb.view(batch, k, 1)
        br_emb = transpose_and_gather_feat(br_emb, br_inds)
        br_emb = br_emb.view(batch, 1, k)
        dists = torch.abs(tl_emb - br_emb)

    tl_scores = tl_scores.view(batch, k, 1).repeat(1, 1, k)
    br_scores = br_scores.view(batch, 1, k).repeat(1, k, 1)

    scores = (tl_scores + br_scores) / 2  # scores for all possible boxes

    # tl and br should have same class
    tl_clses = tl_clses.view(batch, k, 1).repeat(1, 1, k)
    br_clses = br_clses.view(batch, 1, k).repeat(1, k, 1)
    cls_inds = (tl_clses != br_clses)

    # reject boxes based on distances
    dist_inds = dists > distance_threshold

    # reject boxes based on widths and heights
    width_inds = (br_xs <= tl_xs)
    height_inds = (br_ys <= tl_ys)

    # No use `scores[cls_inds]`, instead we use `torch.where` here.
    # Since only 1-D indices with type 'tensor(bool)' are supported
    # when exporting to ONNX, any other bool indices with more dimensions
    # (e.g. 2-D bool tensor) as input parameter in node is invalid
    invalid = cls_inds | width_inds | height_inds | dist_inds
    if with_centripetal_shift:
        invalid = (
            invalid | tl_ctx_inds | tl_cty_inds | br_ctx_inds
            | br_cty_inds)
    negative_scores = -1 * torch.ones_like(scores)
    scores = torch.where(invalid, negative_scores, scores)

    scores = scores.view(batch, -1)
    # Only k * k candidate pairs exist; asking topk for more would raise.
    scores, inds = torch.topk(scores, min(num_dets, k * k))
    scores = scores.unsqueeze(2)

    bboxes = bboxes.view(batch, -1, 4)
    bboxes = gather_feat(bboxes, inds)

    clses = tl_clses.contiguous().view(batch, -1, 1)
    clses = gather_feat(clses, inds)

    return bboxes, scores, clses
|