fsaf_head.py 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458
  1. # Copyright (c) OpenMMLab. All rights reserved.
  2. from typing import Dict, List, Optional, Tuple
  3. import numpy as np
  4. import torch
  5. from mmengine.structures import InstanceData
  6. from torch import Tensor
  7. from mmdet.registry import MODELS
  8. from mmdet.utils import InstanceList, OptInstanceList, OptMultiConfig
  9. from ..losses.accuracy import accuracy
  10. from ..losses.utils import weight_reduce_loss
  11. from ..task_modules.prior_generators import anchor_inside_flags
  12. from ..utils import images_to_levels, multi_apply, unmap
  13. from .retina_head import RetinaHead
  14. @MODELS.register_module()
  15. class FSAFHead(RetinaHead):
  16. """Anchor-free head used in `FSAF <https://arxiv.org/abs/1903.00621>`_.
  17. The head contains two subnetworks. The first classifies anchor boxes and
  18. the second regresses deltas for the anchors (num_anchors is 1 for anchor-
  19. free methods)
  20. Args:
  21. *args: Same as its base class in :class:`RetinaHead`
  22. score_threshold (float, optional): The score_threshold to calculate
  23. positive recall. If given, prediction scores lower than this value
  24. is counted as incorrect prediction. Defaults to None.
  25. init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \
  26. dict]): Initialization config dict.
  27. **kwargs: Same as its base class in :class:`RetinaHead`
  28. Example:
  29. >>> import torch
  30. >>> self = FSAFHead(11, 7)
  31. >>> x = torch.rand(1, 7, 32, 32)
  32. >>> cls_score, bbox_pred = self.forward_single(x)
  33. >>> # Each anchor predicts a score for each class except background
  34. >>> cls_per_anchor = cls_score.shape[1] / self.num_anchors
  35. >>> box_per_anchor = bbox_pred.shape[1] / self.num_anchors
  36. >>> assert cls_per_anchor == self.num_classes
  37. >>> assert box_per_anchor == 4
  38. """
  39. def __init__(self,
  40. *args,
  41. score_threshold: Optional[float] = None,
  42. init_cfg: OptMultiConfig = None,
  43. **kwargs) -> None:
  44. # The positive bias in self.retina_reg conv is to prevent predicted \
  45. # bbox with 0 area
  46. if init_cfg is None:
  47. init_cfg = dict(
  48. type='Normal',
  49. layer='Conv2d',
  50. std=0.01,
  51. override=[
  52. dict(
  53. type='Normal',
  54. name='retina_cls',
  55. std=0.01,
  56. bias_prob=0.01),
  57. dict(
  58. type='Normal', name='retina_reg', std=0.01, bias=0.25)
  59. ])
  60. super().__init__(*args, init_cfg=init_cfg, **kwargs)
  61. self.score_threshold = score_threshold
  62. def forward_single(self, x: Tensor) -> Tuple[Tensor, Tensor]:
  63. """Forward feature map of a single scale level.
  64. Args:
  65. x (Tensor): Feature map of a single scale level.
  66. Returns:
  67. tuple[Tensor, Tensor]:
  68. - cls_score (Tensor): Box scores for each scale level Has \
  69. shape (N, num_points * num_classes, H, W).
  70. - bbox_pred (Tensor): Box energies / deltas for each scale \
  71. level with shape (N, num_points * 4, H, W).
  72. """
  73. cls_score, bbox_pred = super().forward_single(x)
  74. # relu: TBLR encoder only accepts positive bbox_pred
  75. return cls_score, self.relu(bbox_pred)
    def _get_targets_single(self,
                            flat_anchors: Tensor,
                            valid_flags: Tensor,
                            gt_instances: InstanceData,
                            img_meta: dict,
                            gt_instances_ignore: Optional[InstanceData] = None,
                            unmap_outputs: bool = True) -> tuple:
        """Compute regression and classification targets for anchors in a
        single image.

        Most of the codes are the same with the base class :obj:`AnchorHead`,
        except that it also collects and returns the matched gt index in the
        image (from 0 to num_gt-1). If the anchor bbox is not matched to any
        gt, the corresponding value in pos_gt_inds is -1.

        Args:
            flat_anchors (Tensor): Multi-level anchors of the image, which are
                concatenated into a single tensor of shape (num_anchors, 4)
            valid_flags (Tensor): Multi level valid flags of the image,
                which are concatenated into a single tensor of
                shape (num_anchors, ).
            gt_instances (:obj:`InstanceData`): Ground truth of instance
                annotations. It should includes ``bboxes`` and ``labels``
                attributes.
            img_meta (dict): Meta information for current image.
            gt_instances_ignore (:obj:`InstanceData`, optional): Instances
                to be ignored during training. It includes ``bboxes`` attribute
                data that is ignored during training and testing.
                Defaults to None.
            unmap_outputs (bool): Whether to map outputs back to the original
                set of anchors. Defaults to True.

        Returns:
            tuple: (labels, label_weights, bbox_targets, bbox_weights,
            pos_inds, neg_inds, sampling_result, pos_gt_inds), where
            ``pos_gt_inds`` is the extra element w.r.t. the base class.

        Raises:
            ValueError: If no anchor lies inside the image boundary.
        """
        inside_flags = anchor_inside_flags(flat_anchors, valid_flags,
                                           img_meta['img_shape'][:2],
                                           self.train_cfg['allowed_border'])
        if not inside_flags.any():
            raise ValueError(
                'There is no valid anchor inside the image boundary. Please '
                'check the image size and anchor sizes, or set '
                '``allowed_border`` to -1 to skip the condition.')
        # Assign gt and sample anchors (only those inside the image).
        anchors = flat_anchors[inside_flags.type(torch.bool), :]
        pred_instances = InstanceData(priors=anchors)
        assign_result = self.assigner.assign(pred_instances, gt_instances,
                                             gt_instances_ignore)
        sampling_result = self.sampler.sample(assign_result, pred_instances,
                                              gt_instances)
        num_valid_anchors = anchors.shape[0]
        bbox_targets = torch.zeros_like(anchors)
        bbox_weights = torch.zeros_like(anchors)
        # Background label (== num_classes) is the default for all anchors.
        labels = anchors.new_full((num_valid_anchors, ),
                                  self.num_classes,
                                  dtype=torch.long)
        # NOTE: unlike the base class, label weights here are per
        # (anchor, class) so individual class entries can be ignored below
        # (see the shadowed-labels handling).
        label_weights = anchors.new_zeros(
            (num_valid_anchors, self.cls_out_channels), dtype=torch.float)
        # -1 marks anchors that are not matched to any gt.
        pos_gt_inds = anchors.new_full((num_valid_anchors, ),
                                       -1,
                                       dtype=torch.long)
        pos_inds = sampling_result.pos_inds
        neg_inds = sampling_result.neg_inds
        if len(pos_inds) > 0:
            if not self.reg_decoded_bbox:
                pos_bbox_targets = self.bbox_coder.encode(
                    sampling_result.pos_bboxes, sampling_result.pos_gt_bboxes)
            else:
                # When the regression loss (e.g. `IouLoss`, `GIouLoss`)
                # is applied directly on the decoded bounding boxes, both
                # the predicted boxes and regression targets should be with
                # absolute coordinate format.
                pos_bbox_targets = sampling_result.pos_gt_bboxes
            bbox_targets[pos_inds, :] = pos_bbox_targets
            bbox_weights[pos_inds, :] = 1.0
            # The assigned gt_index for each anchor. (0-based)
            pos_gt_inds[pos_inds] = sampling_result.pos_assigned_gt_inds
            labels[pos_inds] = sampling_result.pos_gt_labels
            if self.train_cfg['pos_weight'] <= 0:
                label_weights[pos_inds] = 1.0
            else:
                label_weights[pos_inds] = self.train_cfg['pos_weight']
        if len(neg_inds) > 0:
            label_weights[neg_inds] = 1.0
        # shadowed_labels is a tensor composed of tuples
        # (anchor_inds, class_label) that indicate those anchors lying in the
        # outer region of a gt or overlapped by another gt with a smaller
        # area.
        #
        # Therefore, only the shadowed labels are ignored for loss calculation.
        # the key `shadowed_labels` is defined in :obj:`CenterRegionAssigner`
        shadowed_labels = assign_result.get_extra_property('shadowed_labels')
        if shadowed_labels is not None and shadowed_labels.numel():
            if len(shadowed_labels.shape) == 2:
                idx_, label_ = shadowed_labels[:, 0], shadowed_labels[:, 1]
                assert (labels[idx_] != label_).all(), \
                    'One label cannot be both positive and ignored'
                # Zero only the shadowed class entry of the anchor, keeping
                # its other class entries trainable.
                label_weights[idx_, label_] = 0
            else:
                label_weights[shadowed_labels] = 0
        # map up to original set of anchors
        if unmap_outputs:
            num_total_anchors = flat_anchors.size(0)
            labels = unmap(
                labels, num_total_anchors, inside_flags,
                fill=self.num_classes)  # fill bg label
            label_weights = unmap(label_weights, num_total_anchors,
                                  inside_flags)
            bbox_targets = unmap(bbox_targets, num_total_anchors, inside_flags)
            bbox_weights = unmap(bbox_weights, num_total_anchors, inside_flags)
            pos_gt_inds = unmap(
                pos_gt_inds, num_total_anchors, inside_flags, fill=-1)
        return (labels, label_weights, bbox_targets, bbox_weights, pos_inds,
                neg_inds, sampling_result, pos_gt_inds)
    def loss_by_feat(
        self,
        cls_scores: List[Tensor],
        bbox_preds: List[Tensor],
        batch_gt_instances: InstanceList,
        batch_img_metas: List[dict],
        batch_gt_instances_ignore: OptInstanceList = None
    ) -> Dict[str, Tensor]:
        """Compute loss of the head.

        Implements FSAF's online feature selection: after computing the
        per-anchor losses on every FPN level, the level with the smallest
        loss is located for each gt, and only the losses at that level are
        kept for back-propagation (see ``reweight_loss_single``).

        Args:
            cls_scores (list[Tensor]): Box scores for each scale level
                Has shape (N, num_points * num_classes, H, W).
            bbox_preds (list[Tensor]): Box energies / deltas for each scale
                level with shape (N, num_points * 4, H, W).
            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
                gt_instance. It usually includes ``bboxes`` and ``labels``
                attributes.
            batch_img_metas (list[dict]): Meta information of each image, e.g.,
                image size, scaling factor, etc.
            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
                data that is ignored during training and testing.
                Defaults to None.

        Returns:
            dict[str, Tensor]: A dictionary of loss components
            (``loss_cls``, ``loss_bbox``, ``num_pos``, ``pos_recall``).
        """
        for i in range(len(bbox_preds)):  # loop over fpn level
            # avoid 0 area of the predicted bbox; the clamp replaces the
            # entries of the input list in place.
            bbox_preds[i] = bbox_preds[i].clamp(min=1e-4)
        # TODO: It may directly use the base-class loss function.
        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
        assert len(featmap_sizes) == self.prior_generator.num_levels
        batch_size = len(batch_img_metas)
        device = cls_scores[0].device
        anchor_list, valid_flag_list = self.get_anchors(
            featmap_sizes, batch_img_metas, device=device)
        cls_reg_targets = self.get_targets(
            anchor_list,
            valid_flag_list,
            batch_gt_instances,
            batch_img_metas,
            batch_gt_instances_ignore=batch_gt_instances_ignore,
            return_sampling_results=True)
        (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
         avg_factor, sampling_results_list,
         pos_assigned_gt_inds_list) = cls_reg_targets
        # number of gts per image; summed into a batch total further below
        num_gts = np.array(list(map(len, batch_gt_instances)))
        # anchor number of multi levels
        num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]]
        # concat all level anchors and flags to a single tensor
        concat_anchor_list = []
        for i in range(len(anchor_list)):
            concat_anchor_list.append(torch.cat(anchor_list[i]))
        all_anchor_list = images_to_levels(concat_anchor_list,
                                           num_level_anchors)
        # element-wise (unreduced) per-level cls/reg losses
        losses_cls, losses_bbox = multi_apply(
            self.loss_by_feat_single,
            cls_scores,
            bbox_preds,
            all_anchor_list,
            labels_list,
            label_weights_list,
            bbox_targets_list,
            bbox_weights_list,
            avg_factor=avg_factor)
        # `pos_assigned_gt_inds_list` (length: fpn_levels) stores the assigned
        # gt index of each anchor bbox in each fpn level.
        cum_num_gts = list(np.cumsum(num_gts))  # length of batch_size
        for i, assign in enumerate(pos_assigned_gt_inds_list):
            # loop over fpn levels
            for j in range(1, batch_size):
                # loop over batch size
                # Convert gt indices in each img to those in the batch
                assign[j][assign[j] >= 0] += int(cum_num_gts[j - 1])
            pos_assigned_gt_inds_list[i] = assign.flatten()
            labels_list[i] = labels_list[i].flatten()
        num_gts = num_gts.sum()  # total number of gt in the batch
        # The unique label index of each gt in the batch
        label_sequence = torch.arange(num_gts, device=device)
        # Collect the average loss of each gt in each level
        with torch.no_grad():
            loss_levels, = multi_apply(
                self.collect_loss_level_single,
                losses_cls,
                losses_bbox,
                pos_assigned_gt_inds_list,
                labels_seq=label_sequence)
            # Shape: (fpn_levels, num_gts). Loss of each gt at each fpn level
            loss_levels = torch.stack(loss_levels, dim=0)
            # Locate the best fpn level for loss back-propagation
            if loss_levels.numel() == 0:  # zero gt
                argmin = loss_levels.new_empty((num_gts, ), dtype=torch.long)
            else:
                _, argmin = loss_levels.min(dim=0)
        # Reweight the loss of each (anchor, label) pair, so that only those
        # at the best gt level are back-propagated.
        losses_cls, losses_bbox, pos_inds = multi_apply(
            self.reweight_loss_single,
            losses_cls,
            losses_bbox,
            pos_assigned_gt_inds_list,
            labels_list,
            list(range(len(losses_cls))),
            min_levels=argmin)
        num_pos = torch.cat(pos_inds, 0).sum().float()
        pos_recall = self.calculate_pos_recall(cls_scores, labels_list,
                                               pos_inds)
        if num_pos == 0:  # No gt
            # Fall back to the number of negatives to avoid division by 0.
            num_total_neg = sum(
                [results.num_neg for results in sampling_results_list])
            avg_factor = num_pos + num_total_neg
        else:
            avg_factor = num_pos
        for i in range(len(losses_cls)):
            losses_cls[i] /= avg_factor
            losses_bbox[i] /= avg_factor
        return dict(
            loss_cls=losses_cls,
            loss_bbox=losses_bbox,
            num_pos=num_pos / batch_size,
            pos_recall=pos_recall)
  306. def calculate_pos_recall(self, cls_scores: List[Tensor],
  307. labels_list: List[Tensor],
  308. pos_inds: List[Tensor]) -> Tensor:
  309. """Calculate positive recall with score threshold.
  310. Args:
  311. cls_scores (list[Tensor]): Classification scores at all fpn levels.
  312. Each tensor is in shape (N, num_classes * num_anchors, H, W)
  313. labels_list (list[Tensor]): The label that each anchor is assigned
  314. to. Shape (N * H * W * num_anchors, )
  315. pos_inds (list[Tensor]): List of bool tensors indicating whether
  316. the anchor is assigned to a positive label.
  317. Shape (N * H * W * num_anchors, )
  318. Returns:
  319. Tensor: A single float number indicating the positive recall.
  320. """
  321. with torch.no_grad():
  322. num_class = self.num_classes
  323. scores = [
  324. cls.permute(0, 2, 3, 1).reshape(-1, num_class)[pos]
  325. for cls, pos in zip(cls_scores, pos_inds)
  326. ]
  327. labels = [
  328. label.reshape(-1)[pos]
  329. for label, pos in zip(labels_list, pos_inds)
  330. ]
  331. scores = torch.cat(scores, dim=0)
  332. labels = torch.cat(labels, dim=0)
  333. if self.use_sigmoid_cls:
  334. scores = scores.sigmoid()
  335. else:
  336. scores = scores.softmax(dim=1)
  337. return accuracy(scores, labels, thresh=self.score_threshold)
  338. def collect_loss_level_single(self, cls_loss: Tensor, reg_loss: Tensor,
  339. assigned_gt_inds: Tensor,
  340. labels_seq: Tensor) -> Tensor:
  341. """Get the average loss in each FPN level w.r.t. each gt label.
  342. Args:
  343. cls_loss (Tensor): Classification loss of each feature map pixel,
  344. shape (num_anchor, num_class)
  345. reg_loss (Tensor): Regression loss of each feature map pixel,
  346. shape (num_anchor, 4)
  347. assigned_gt_inds (Tensor): It indicates which gt the prior is
  348. assigned to (0-based, -1: no assignment). shape (num_anchor),
  349. labels_seq: The rank of labels. shape (num_gt)
  350. Returns:
  351. Tensor: shape (num_gt), average loss of each gt in this level
  352. """
  353. if len(reg_loss.shape) == 2: # iou loss has shape (num_prior, 4)
  354. reg_loss = reg_loss.sum(dim=-1) # sum loss in tblr dims
  355. if len(cls_loss.shape) == 2:
  356. cls_loss = cls_loss.sum(dim=-1) # sum loss in class dims
  357. loss = cls_loss + reg_loss
  358. assert loss.size(0) == assigned_gt_inds.size(0)
  359. # Default loss value is 1e6 for a layer where no anchor is positive
  360. # to ensure it will not be chosen to back-propagate gradient
  361. losses_ = loss.new_full(labels_seq.shape, 1e6)
  362. for i, l in enumerate(labels_seq):
  363. match = assigned_gt_inds == l
  364. if match.any():
  365. losses_[i] = loss[match].mean()
  366. return losses_,
  367. def reweight_loss_single(self, cls_loss: Tensor, reg_loss: Tensor,
  368. assigned_gt_inds: Tensor, labels: Tensor,
  369. level: int, min_levels: Tensor) -> tuple:
  370. """Reweight loss values at each level.
  371. Reassign loss values at each level by masking those where the
  372. pre-calculated loss is too large. Then return the reduced losses.
  373. Args:
  374. cls_loss (Tensor): Element-wise classification loss.
  375. Shape: (num_anchors, num_classes)
  376. reg_loss (Tensor): Element-wise regression loss.
  377. Shape: (num_anchors, 4)
  378. assigned_gt_inds (Tensor): The gt indices that each anchor bbox
  379. is assigned to. -1 denotes a negative anchor, otherwise it is the
  380. gt index (0-based). Shape: (num_anchors, ),
  381. labels (Tensor): Label assigned to anchors. Shape: (num_anchors, ).
  382. level (int): The current level index in the pyramid
  383. (0-4 for RetinaNet)
  384. min_levels (Tensor): The best-matching level for each gt.
  385. Shape: (num_gts, ),
  386. Returns:
  387. tuple:
  388. - cls_loss: Reduced corrected classification loss. Scalar.
  389. - reg_loss: Reduced corrected regression loss. Scalar.
  390. - pos_flags (Tensor): Corrected bool tensor indicating the \
  391. final positive anchors. Shape: (num_anchors, ).
  392. """
  393. loc_weight = torch.ones_like(reg_loss)
  394. cls_weight = torch.ones_like(cls_loss)
  395. pos_flags = assigned_gt_inds >= 0 # positive pixel flag
  396. pos_indices = torch.nonzero(pos_flags, as_tuple=False).flatten()
  397. if pos_flags.any(): # pos pixels exist
  398. pos_assigned_gt_inds = assigned_gt_inds[pos_flags]
  399. zeroing_indices = (min_levels[pos_assigned_gt_inds] != level)
  400. neg_indices = pos_indices[zeroing_indices]
  401. if neg_indices.numel():
  402. pos_flags[neg_indices] = 0
  403. loc_weight[neg_indices] = 0
  404. # Only the weight corresponding to the label is
  405. # zeroed out if not selected
  406. zeroing_labels = labels[neg_indices]
  407. assert (zeroing_labels >= 0).all()
  408. cls_weight[neg_indices, zeroing_labels] = 0
  409. # Weighted loss for both cls and reg loss
  410. cls_loss = weight_reduce_loss(cls_loss, cls_weight, reduction='sum')
  411. reg_loss = weight_reduce_loss(reg_loss, loc_weight, reduction='sum')
  412. return cls_loss, reg_loss, pos_flags