# Copyright (c) OpenMMLab. All rights reserved.
from typing import Dict, List, Optional, Tuple

import torch
import torch.nn as nn
from mmcv.cnn import ConvModule
from mmcv.ops import DeformConv2d
from mmengine.config import ConfigDict
from mmengine.model import BaseModule
from mmengine.structures import InstanceData
from torch import Tensor

from mmdet.registry import MODELS
from mmdet.utils import InstanceList, OptInstanceList, OptMultiConfig
from ..utils import filter_scores_and_topk, multi_apply
from .anchor_free_head import AnchorFreeHead

INF = 1e8
class FeatureAlign(BaseModule):
    """Feature Align Module.

    Feature Align Module is implemented based on DCN v1.
    It uses the box shape prediction rather than the feature map to
    predict the offsets of the deform conv layer.

    Args:
        in_channels (int): Number of channels in the input feature map.
        out_channels (int): Number of channels in the output feature map.
        kernel_size (int): Size of the convolution kernel.
        deform_groups (int): Group number of DCN in the
            FeatureAdaption module.
        init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \
            dict], optional): Initialization config dict.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int = 3,
        deform_groups: int = 4,
        init_cfg: OptMultiConfig = dict(
            type='Normal',
            layer='Conv2d',
            std=0.1,
            override=dict(type='Normal', name='conv_adaption', std=0.01))
    ) -> None:
        super().__init__(init_cfg=init_cfg)
        offset_channels = kernel_size * kernel_size * 2
        self.conv_offset = nn.Conv2d(
            4, deform_groups * offset_channels, 1, bias=False)
        self.conv_adaption = DeformConv2d(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            padding=(kernel_size - 1) // 2,
            deform_groups=deform_groups)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x: Tensor, shape: Tensor) -> Tensor:
        """Forward function of feature align module.

        Args:
            x (Tensor): Features from the upstream network.
            shape (Tensor): Exponential of bbox predictions.

        Returns:
            x (Tensor): The aligned features.
        """
        offset = self.conv_offset(shape)
        x = self.relu(self.conv_adaption(x, offset))
        return x
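# Illustrative usage of FeatureAlign (a minimal sketch; the tensor shapes
# below are assumptions, not values from this file): FoveaHead feeds the
# exponentiated 4-channel bbox prediction of an FPN level into this module
# so that the deform-conv offsets follow the predicted box shape.
#
#     feat = torch.rand(2, 256, 32, 32)           # one FPN level
#     bbox_pred = torch.rand(2, 4, 32, 32)         # raw regression output
#     align = FeatureAlign(256, 256, kernel_size=3, deform_groups=4)
#     aligned = align(feat, bbox_pred.exp())       # -> (2, 256, 32, 32)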
@MODELS.register_module()
class FoveaHead(AnchorFreeHead):
    """Detection Head of `FoveaBox: Beyond Anchor-based Object Detector.
    <https://arxiv.org/abs/1904.03797>`_.

    Args:
        num_classes (int): Number of categories excluding the background
            category.
        in_channels (int): Number of channels in the input feature map.
        base_edge_list (list[int]): List of base edge lengths, one for each
            FPN level.
        scale_ranges (list[tuple]): Scale range of objects assigned to each
            FPN level.
        sigma (float): Shrink factor that controls the size of the positive
            (fovea) area inside each ground-truth box.
        with_deform (bool): Whether to use deform conv in the classification
            branch.
        deform_groups (int): Deformable conv group size.
        init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \
            dict], optional): Initialization config dict.
    """

    def __init__(self,
                 num_classes: int,
                 in_channels: int,
                 base_edge_list: List[int] = (16, 32, 64, 128, 256),
                 scale_ranges: List[tuple] = ((8, 32), (16, 64), (32, 128),
                                              (64, 256), (128, 512)),
                 sigma: float = 0.4,
                 with_deform: bool = False,
                 deform_groups: int = 4,
                 init_cfg: OptMultiConfig = dict(
                     type='Normal',
                     layer='Conv2d',
                     std=0.01,
                     override=dict(
                         type='Normal',
                         name='conv_cls',
                         std=0.01,
                         bias_prob=0.01)),
                 **kwargs) -> None:
        self.base_edge_list = base_edge_list
        self.scale_ranges = scale_ranges
        self.sigma = sigma
        self.with_deform = with_deform
        self.deform_groups = deform_groups
        super().__init__(
            num_classes=num_classes,
            in_channels=in_channels,
            init_cfg=init_cfg,
            **kwargs)
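    # Construction sketch (illustrative; the `num_classes`/`in_channels`
    # values are assumptions): the head is normally built from a config
    # dict via the registry imported above, e.g.
    #
    #     head = MODELS.build(
    #         dict(type='FoveaHead', num_classes=80, in_channels=256))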
    def _init_layers(self) -> None:
        """Initialize layers of the head."""
        # box branch
        super()._init_reg_convs()
        self.conv_reg = nn.Conv2d(self.feat_channels, 4, 3, padding=1)

        # cls branch
        if not self.with_deform:
            super()._init_cls_convs()
            self.conv_cls = nn.Conv2d(
                self.feat_channels, self.cls_out_channels, 3, padding=1)
        else:
            self.cls_convs = nn.ModuleList()
            self.cls_convs.append(
                ConvModule(
                    self.feat_channels, (self.feat_channels * 4),
                    3,
                    stride=1,
                    padding=1,
                    conv_cfg=self.conv_cfg,
                    norm_cfg=self.norm_cfg,
                    bias=self.norm_cfg is None))
            self.cls_convs.append(
                ConvModule((self.feat_channels * 4), (self.feat_channels * 4),
                           1,
                           stride=1,
                           padding=0,
                           conv_cfg=self.conv_cfg,
                           norm_cfg=self.norm_cfg,
                           bias=self.norm_cfg is None))
            self.feature_adaption = FeatureAlign(
                self.feat_channels,
                self.feat_channels,
                kernel_size=3,
                deform_groups=self.deform_groups)
            self.conv_cls = nn.Conv2d(
                int(self.feat_channels * 4),
                self.cls_out_channels,
                3,
                padding=1)
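    # Note on the deform branch above: the classification tower widens the
    # features from `feat_channels` to `4 * feat_channels` (a 3x3 then a 1x1
    # ConvModule), aligns them with FeatureAlign using the exponentiated bbox
    # prediction, and finally maps them to `cls_out_channels`.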
    def forward_single(self, x: Tensor) -> Tuple[Tensor, Tensor]:
        """Forward features of a single scale level.

        Args:
            x (Tensor): FPN feature maps of the specified stride.

        Returns:
            tuple: scores for each class and bbox predictions of the input
            feature maps.
        """
        cls_feat = x
        reg_feat = x
        for reg_layer in self.reg_convs:
            reg_feat = reg_layer(reg_feat)
        bbox_pred = self.conv_reg(reg_feat)
        if self.with_deform:
            cls_feat = self.feature_adaption(cls_feat, bbox_pred.exp())
        for cls_layer in self.cls_convs:
            cls_feat = cls_layer(cls_feat)
        cls_score = self.conv_cls(cls_feat)
        return cls_score, bbox_pred
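    # For a level feature of shape (N, feat_channels, H, W), forward_single
    # returns `cls_score` of shape (N, cls_out_channels, H, W) and
    # `bbox_pred` of shape (N, 4, H, W); the regression output stays in
    # log-space and is only exponentiated when aligning features or decoding.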
    def loss_by_feat(
        self,
        cls_scores: List[Tensor],
        bbox_preds: List[Tensor],
        batch_gt_instances: InstanceList,
        batch_img_metas: List[dict],
        batch_gt_instances_ignore: OptInstanceList = None
    ) -> Dict[str, Tensor]:
        """Calculate the loss based on the features extracted by the detection
        head.

        Args:
            cls_scores (list[Tensor]): Box scores for each scale level,
                each is a 4D-tensor, the channel number is
                num_priors * num_classes.
            bbox_preds (list[Tensor]): Box energies / deltas for each scale
                level, each is a 4D-tensor, the channel number is
                num_priors * 4.
            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
                gt_instance. It usually includes ``bboxes`` and ``labels``
                attributes.
            batch_img_metas (list[dict]): Meta information of each image,
                e.g., image size, scaling factor, etc.
            batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional):
                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
                data that is ignored during training and testing.
                Defaults to None.

        Returns:
            dict[str, Tensor]: A dictionary of loss components.
        """
        assert len(cls_scores) == len(bbox_preds)

        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
        priors = self.prior_generator.grid_priors(
            featmap_sizes,
            dtype=bbox_preds[0].dtype,
            device=bbox_preds[0].device)
        num_imgs = cls_scores[0].size(0)
        flatten_cls_scores = [
            cls_score.permute(0, 2, 3, 1).reshape(-1, self.cls_out_channels)
            for cls_score in cls_scores
        ]
        flatten_bbox_preds = [
            bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4)
            for bbox_pred in bbox_preds
        ]
        flatten_cls_scores = torch.cat(flatten_cls_scores)
        flatten_bbox_preds = torch.cat(flatten_bbox_preds)
        flatten_labels, flatten_bbox_targets = self.get_targets(
            batch_gt_instances, featmap_sizes, priors)

        # FG cat_id: [0, num_classes - 1], BG cat_id: num_classes
        pos_inds = ((flatten_labels >= 0)
                    & (flatten_labels < self.num_classes)).nonzero().view(-1)
        num_pos = len(pos_inds)

        loss_cls = self.loss_cls(
            flatten_cls_scores, flatten_labels, avg_factor=num_pos + num_imgs)
        if num_pos > 0:
            pos_bbox_preds = flatten_bbox_preds[pos_inds]
            pos_bbox_targets = flatten_bbox_targets[pos_inds]
            pos_weights = pos_bbox_targets.new_ones(pos_bbox_targets.size())
            loss_bbox = self.loss_bbox(
                pos_bbox_preds,
                pos_bbox_targets,
                pos_weights,
                avg_factor=num_pos)
        else:
            loss_bbox = torch.tensor(
                0,
                dtype=flatten_bbox_preds.dtype,
                device=flatten_bbox_preds.device)
        return dict(loss_cls=loss_cls, loss_bbox=loss_bbox)
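    # The classification loss above is averaged over `num_pos + num_imgs`
    # rather than `num_pos` alone, which keeps the averaging factor non-zero
    # even when a batch contains no positive locations.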
    def get_targets(
        self, batch_gt_instances: InstanceList, featmap_sizes: List[tuple],
        priors_list: List[Tensor]
    ) -> Tuple[Tensor, Tensor]:
        """Compute regression and classification targets for priors in
        multiple images.

        Args:
            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
                gt_instance. It usually includes ``bboxes`` and ``labels``
                attributes.
            featmap_sizes (list[tuple]): Size tuple of feature maps.
            priors_list (list[Tensor]): Priors of each fpn level, each has
                shape (num_priors, 2).

        Returns:
            tuple: Flattened targets of all levels and images.

            - flatten_labels (Tensor): Concatenated labels of all levels.
            - flatten_bbox_targets (Tensor): Concatenated bbox targets of
              all levels.
        """
        label_list, bbox_target_list = multi_apply(
            self._get_targets_single,
            batch_gt_instances,
            featmap_size_list=featmap_sizes,
            priors_list=priors_list)
        flatten_labels = [
            torch.cat([
                labels_level_img.flatten() for labels_level_img in labels_level
            ]) for labels_level in zip(*label_list)
        ]
        flatten_bbox_targets = [
            torch.cat([
                bbox_targets_level_img.reshape(-1, 4)
                for bbox_targets_level_img in bbox_targets_level
            ]) for bbox_targets_level in zip(*bbox_target_list)
        ]
        flatten_labels = torch.cat(flatten_labels)
        flatten_bbox_targets = torch.cat(flatten_bbox_targets)
        return flatten_labels, flatten_bbox_targets
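    # The flattened targets returned above follow the same ordering as the
    # flattened predictions in `loss_by_feat`: levels are concatenated in
    # FPN order and, within each level, the per-image targets of the whole
    # batch are stacked, so positive indices can be shared between them.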
    def _get_targets_single(self,
                            gt_instances: InstanceData,
                            featmap_size_list: List[tuple] = None,
                            priors_list: List[Tensor] = None) -> tuple:
        """Compute regression and classification targets for a single image.

        Args:
            gt_instances (:obj:`InstanceData`): Ground truth of instance
                annotations. It usually includes ``bboxes`` and ``labels``
                attributes.
            featmap_size_list (list[tuple]): Size tuple of feature maps.
            priors_list (list[Tensor]): Priors of each fpn level, each has
                shape (num_priors, 2).

        Returns:
            tuple:

            - label_list (list[Tensor]): Labels of all anchors in the image.
            - box_target_list (list[Tensor]): BBox targets of all anchors in
              the image.
        """
        gt_bboxes_raw = gt_instances.bboxes
        gt_labels_raw = gt_instances.labels
        gt_areas = torch.sqrt((gt_bboxes_raw[:, 2] - gt_bboxes_raw[:, 0]) *
                              (gt_bboxes_raw[:, 3] - gt_bboxes_raw[:, 1]))
        label_list = []
        bbox_target_list = []
        # for each pyramid, find the cls and box target
        for base_len, (lower_bound, upper_bound), stride, featmap_size, \
                priors in zip(self.base_edge_list, self.scale_ranges,
                              self.strides, featmap_size_list, priors_list):
            # FG cat_id: [0, num_classes - 1], BG cat_id: num_classes
            priors = priors.view(*featmap_size, 2)
            x, y = priors[..., 0], priors[..., 1]
            labels = gt_labels_raw.new_full(featmap_size, self.num_classes)
            bbox_targets = gt_bboxes_raw.new_ones(featmap_size[0],
                                                  featmap_size[1], 4)
            # scale assignment
            hit_indices = ((gt_areas >= lower_bound) &
                           (gt_areas <= upper_bound)).nonzero().flatten()
            if len(hit_indices) == 0:
                label_list.append(labels)
                bbox_target_list.append(torch.log(bbox_targets))
                continue
            _, hit_index_order = torch.sort(-gt_areas[hit_indices])
            hit_indices = hit_indices[hit_index_order]
            gt_bboxes = gt_bboxes_raw[hit_indices, :] / stride
            gt_labels = gt_labels_raw[hit_indices]
            half_w = 0.5 * (gt_bboxes[:, 2] - gt_bboxes[:, 0])
            half_h = 0.5 * (gt_bboxes[:, 3] - gt_bboxes[:, 1])
            # valid fovea area: left, right, top, down
            pos_left = torch.ceil(
                gt_bboxes[:, 0] + (1 - self.sigma) * half_w - 0.5).long(). \
                clamp(0, featmap_size[1] - 1)
            pos_right = torch.floor(
                gt_bboxes[:, 0] + (1 + self.sigma) * half_w - 0.5).long(). \
                clamp(0, featmap_size[1] - 1)
            pos_top = torch.ceil(
                gt_bboxes[:, 1] + (1 - self.sigma) * half_h - 0.5).long(). \
                clamp(0, featmap_size[0] - 1)
            pos_down = torch.floor(
                gt_bboxes[:, 1] + (1 + self.sigma) * half_h - 0.5).long(). \
                clamp(0, featmap_size[0] - 1)
            for px1, py1, px2, py2, label, (gt_x1, gt_y1, gt_x2, gt_y2) in \
                    zip(pos_left, pos_top, pos_right, pos_down, gt_labels,
                        gt_bboxes_raw[hit_indices, :]):
                labels[py1:py2 + 1, px1:px2 + 1] = label
                bbox_targets[py1:py2 + 1, px1:px2 + 1, 0] = \
                    (x[py1:py2 + 1, px1:px2 + 1] - gt_x1) / base_len
                bbox_targets[py1:py2 + 1, px1:px2 + 1, 1] = \
                    (y[py1:py2 + 1, px1:px2 + 1] - gt_y1) / base_len
                bbox_targets[py1:py2 + 1, px1:px2 + 1, 2] = \
                    (gt_x2 - x[py1:py2 + 1, px1:px2 + 1]) / base_len
                bbox_targets[py1:py2 + 1, px1:px2 + 1, 3] = \
                    (gt_y2 - y[py1:py2 + 1, px1:px2 + 1]) / base_len
            bbox_targets = bbox_targets.clamp(min=1. / 16, max=16.)
            label_list.append(labels)
            bbox_target_list.append(torch.log(bbox_targets))
        return label_list, bbox_target_list
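    # Worked example for the fovea area above (illustrative numbers): with
    # sigma = 0.4 and a gt box spanning x in [4, 14] on the level grid
    # (half_w = 5), the positive columns are
    #     pos_left  = ceil(4 + 0.6 * 5 - 0.5)  = 7
    #     pos_right = floor(4 + 1.4 * 5 - 0.5) = 10
    # i.e. only the central ~sigma fraction of the box is labelled positive,
    # while the regression targets are normalised by `base_len` and stored
    # in log-space.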
    # Same as base_dense_head/_predict_by_feat_single except self._bbox_decode
    def _predict_by_feat_single(self,
                                cls_score_list: List[Tensor],
                                bbox_pred_list: List[Tensor],
                                score_factor_list: List[Tensor],
                                mlvl_priors: List[Tensor],
                                img_meta: dict,
                                cfg: Optional[ConfigDict] = None,
                                rescale: bool = False,
                                with_nms: bool = True) -> InstanceData:
        """Transform a single image's features extracted from the head into
        bbox results.

        Args:
            cls_score_list (list[Tensor]): Box scores from all scale
                levels of a single image, each item has shape
                (num_priors * num_classes, H, W).
            bbox_pred_list (list[Tensor]): Box energies / deltas from
                all scale levels of a single image, each item has shape
                (num_priors * 4, H, W).
            score_factor_list (list[Tensor]): Score factor from all scale
                levels of a single image, each item has shape
                (num_priors * 1, H, W).
            mlvl_priors (list[Tensor]): Each element in the list is
                the priors of a single level in feature pyramid, has shape
                (num_priors, 2).
            img_meta (dict): Image meta info.
            cfg (ConfigDict, optional): Test / postprocessing
                configuration, if None, test_cfg would be used.
                Defaults to None.
            rescale (bool): If True, return boxes in original image space.
                Defaults to False.
            with_nms (bool): If True, do nms before returning boxes.
                Defaults to True.

        Returns:
            :obj:`InstanceData`: Detection results of each image
            after the post process. Each item usually contains the
            following keys.

            - scores (Tensor): Classification scores, has a shape
              (num_instances, ).
            - labels (Tensor): Labels of bboxes, has a shape
              (num_instances, ).
            - bboxes (Tensor): Has a shape (num_instances, 4),
              the last dimension 4 arranged as (x1, y1, x2, y2).
        """
        cfg = self.test_cfg if cfg is None else cfg
        assert len(cls_score_list) == len(bbox_pred_list)
        img_shape = img_meta['img_shape']
        nms_pre = cfg.get('nms_pre', -1)

        mlvl_bboxes = []
        mlvl_scores = []
        mlvl_labels = []
        for level_idx, (cls_score, bbox_pred, stride, base_len, priors) in \
                enumerate(zip(cls_score_list, bbox_pred_list, self.strides,
                              self.base_edge_list, mlvl_priors)):
            assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
            bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4)
            scores = cls_score.permute(1, 2, 0).reshape(
                -1, self.cls_out_channels).sigmoid()

            # After https://github.com/open-mmlab/mmdetection/pull/6268/,
            # this operation keeps fewer bboxes under the same `nms_pre`.
            # There is no difference in performance for most models. If you
            # find a slight drop in performance, you can set a larger
            # `nms_pre` than before.
            results = filter_scores_and_topk(
                scores, cfg.score_thr, nms_pre,
                dict(bbox_pred=bbox_pred, priors=priors))
            scores, labels, _, filtered_results = results

            bbox_pred = filtered_results['bbox_pred']
            priors = filtered_results['priors']

            bboxes = self._bbox_decode(priors, bbox_pred, base_len, img_shape)

            mlvl_bboxes.append(bboxes)
            mlvl_scores.append(scores)
            mlvl_labels.append(labels)

        results = InstanceData()
        results.bboxes = torch.cat(mlvl_bboxes)
        results.scores = torch.cat(mlvl_scores)
        results.labels = torch.cat(mlvl_labels)

        return self._bbox_post_process(
            results=results,
            cfg=cfg,
            rescale=rescale,
            with_nms=with_nms,
            img_meta=img_meta)
    def _bbox_decode(self, priors: Tensor, bbox_pred: Tensor, base_len: int,
                     max_shape: tuple) -> Tensor:
        """Decode bbox predictions into bboxes.

        Args:
            priors (Tensor): Center priors of an image, has shape
                (num_instances, 2).
            bbox_pred (Tensor): Box energies / deltas for all instances,
                has shape (num_instances, 4).
            base_len (int): The base edge length of this level.
            max_shape (tuple): The maximum (height, width) used to clamp
                the decoded bboxes, usually the image shape.

        Returns:
            Tensor: Decoded bboxes in (tl_x, tl_y, br_x, br_y) format. Has
            shape (num_instances, 4).
        """
        bbox_pred = bbox_pred.exp()

        y = priors[:, 1]
        x = priors[:, 0]
        x1 = (x - base_len * bbox_pred[:, 0]). \
            clamp(min=0, max=max_shape[1] - 1)
        y1 = (y - base_len * bbox_pred[:, 1]). \
            clamp(min=0, max=max_shape[0] - 1)
        x2 = (x + base_len * bbox_pred[:, 2]). \
            clamp(min=0, max=max_shape[1] - 1)
        y2 = (y + base_len * bbox_pred[:, 3]). \
            clamp(min=0, max=max_shape[0] - 1)
        decoded_bboxes = torch.stack([x1, y1, x2, y2], -1)
        return decoded_bboxes
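    # Decoding sketch (illustrative numbers): for a prior centred at
    # (x, y) = (100, 80) on a level with base_len = 64 and a prediction
    # bbox_pred = log([0.5, 0.5, 1.0, 1.0]), the decoded box is
    #     x1 = 100 - 64 * 0.5 = 68,   y1 = 80 - 64 * 0.5 = 48
    #     x2 = 100 + 64 * 1.0 = 164,  y2 = 80 + 64 * 1.0 = 144
    # clamped to the image shape, which inverts the normalisation applied in
    # `_get_targets_single`.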