yolof_head.py
# Copyright (c) OpenMMLab. All rights reserved.
from typing import List, Optional, Tuple

import torch
import torch.nn as nn
from mmcv.cnn import ConvModule, is_norm
from mmengine.model import bias_init_with_prob, constant_init, normal_init
from mmengine.structures import InstanceData
from torch import Tensor

from mmdet.registry import MODELS
from mmdet.utils import ConfigType, InstanceList, OptInstanceList, reduce_mean
from ..task_modules.prior_generators import anchor_inside_flags
from ..utils import levels_to_images, multi_apply, unmap
from .anchor_head import AnchorHead

INF = 1e8


@MODELS.register_module()
class YOLOFHead(AnchorHead):
    """Detection Head of `YOLOF <https://arxiv.org/abs/2103.09460>`_.

    Args:
        num_classes (int): The number of object classes (w/o background).
        in_channels (list[int]): The number of input channels per scale.
        num_cls_convs (int): The number of convolutions of the cls branch.
            Defaults to 2.
        num_reg_convs (int): The number of convolutions of the reg branch.
            Defaults to 4.
        norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization
            layer. Defaults to ``dict(type='BN', requires_grad=True)``.
    """

    def __init__(self,
                 num_classes: int,
                 in_channels: List[int],
                 num_cls_convs: int = 2,
                 num_reg_convs: int = 4,
                 norm_cfg: ConfigType = dict(type='BN', requires_grad=True),
                 **kwargs) -> None:
        self.num_cls_convs = num_cls_convs
        self.num_reg_convs = num_reg_convs
        self.norm_cfg = norm_cfg
        super().__init__(
            num_classes=num_classes, in_channels=in_channels, **kwargs)
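
    # Illustrative configuration (a hedged sketch; the values below are
    # assumptions drawn from typical YOLOF configs, not read from this file):
    #
    #   bbox_head=dict(
    #       type='YOLOFHead',
    #       num_classes=80,
    #       in_channels=512,
    #       anchor_generator=dict(
    #           type='AnchorGenerator',
    #           ratios=[1.0],
    #           scales=[1, 2, 4, 8, 16],
    #           strides=[32]),
    #       bbox_coder=dict(type='DeltaXYWHBBoxCoder', ...)),
    #
    # with the remaining ``AnchorHead`` arguments forwarded via ``**kwargs``.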

    def _init_layers(self) -> None:
        cls_subnet = []
        bbox_subnet = []
        for i in range(self.num_cls_convs):
            cls_subnet.append(
                ConvModule(
                    self.in_channels,
                    self.in_channels,
                    kernel_size=3,
                    padding=1,
                    norm_cfg=self.norm_cfg))
        for i in range(self.num_reg_convs):
            bbox_subnet.append(
                ConvModule(
                    self.in_channels,
                    self.in_channels,
                    kernel_size=3,
                    padding=1,
                    norm_cfg=self.norm_cfg))
        self.cls_subnet = nn.Sequential(*cls_subnet)
        self.bbox_subnet = nn.Sequential(*bbox_subnet)
        self.cls_score = nn.Conv2d(
            self.in_channels,
            self.num_base_priors * self.num_classes,
            kernel_size=3,
            stride=1,
            padding=1)
        self.bbox_pred = nn.Conv2d(
            self.in_channels,
            self.num_base_priors * 4,
            kernel_size=3,
            stride=1,
            padding=1)
        self.object_pred = nn.Conv2d(
            self.in_channels,
            self.num_base_priors,
            kernel_size=3,
            stride=1,
            padding=1)

    def init_weights(self) -> None:
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                normal_init(m, mean=0, std=0.01)
            if is_norm(m):
                constant_init(m, 1)

        # Use prior in model initialization to improve stability
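        # ``bias_init_with_prob(0.01)`` computes ``-log((1 - 0.01) / 0.01)``,
        # so each class sigmoid starts near 0.01 (the focal-loss prior from
        # RetinaNet), which avoids a flood of false positives early in
        # training.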
        bias_cls = bias_init_with_prob(0.01)
        torch.nn.init.constant_(self.cls_score.bias, bias_cls)

    def forward_single(self, x: Tensor) -> Tuple[Tensor, Tensor]:
        """Forward feature of a single scale level.

        Args:
            x (Tensor): Features of a single scale level.

        Returns:
            tuple:
                normalized_cls_score (Tensor): Normalized cls scores for a \
                    single scale level, whose channel number is \
                    num_base_priors * num_classes.
                bbox_reg (Tensor): Box energies / deltas for a single scale \
                    level, whose channel number is num_base_priors * 4.
        """
        cls_score = self.cls_score(self.cls_subnet(x))
        N, _, H, W = cls_score.shape
        cls_score = cls_score.view(N, -1, self.num_classes, H, W)

        reg_feat = self.bbox_subnet(x)
        bbox_reg = self.bbox_pred(reg_feat)
        objectness = self.object_pred(reg_feat)

        # implicit objectness
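        # For per-anchor logits ``a`` (class) and ``b`` (objectness), the
        # fusion below satisfies
        #   sigmoid(a + b - log(1 + e^a + e^b)) == sigmoid(a) * sigmoid(b),
        # i.e. the normalized logit decodes to the product of the class and
        # objectness probabilities. Clamping the exponentials at INF guards
        # against overflow for large logits.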
        objectness = objectness.view(N, -1, 1, H, W)
        normalized_cls_score = cls_score + objectness - torch.log(
            1. + torch.clamp(cls_score.exp(), max=INF) +
            torch.clamp(objectness.exp(), max=INF))
        normalized_cls_score = normalized_cls_score.view(N, -1, H, W)
        return normalized_cls_score, bbox_reg

    def loss_by_feat(
            self,
            cls_scores: List[Tensor],
            bbox_preds: List[Tensor],
            batch_gt_instances: InstanceList,
            batch_img_metas: List[dict],
            batch_gt_instances_ignore: OptInstanceList = None) -> dict:
        """Calculate the loss based on the features extracted by the
        detection head.

        Args:
            cls_scores (list[Tensor]): Box scores for each scale level,
                each with shape (N, num_anchors * num_classes, H, W).
            bbox_preds (list[Tensor]): Box energies / deltas for each scale
                level, each with shape (N, num_anchors * 4, H, W).
            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
                gt_instance. It usually includes ``bboxes`` and ``labels``
                attributes.
            batch_img_metas (list[dict]): Meta information of each image,
                e.g., image size, scaling factor, etc.
            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
                Batch of gt_instances_ignore. It includes ``bboxes``
                attribute data that is ignored during training and testing.
                Defaults to None.

        Returns:
            dict: A dictionary of loss components.
        """
        assert len(cls_scores) == 1
        assert self.prior_generator.num_levels == 1
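        # YOLOF operates on a single C5-level feature map (the output of its
        # dilated encoder), so exactly one scale level is expected here.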
        device = cls_scores[0].device
        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
        anchor_list, valid_flag_list = self.get_anchors(
            featmap_sizes, batch_img_metas, device=device)

        # The output level is always 1
        anchor_list = [anchors[0] for anchors in anchor_list]
        valid_flag_list = [valid_flags[0] for valid_flags in valid_flag_list]

        cls_scores_list = levels_to_images(cls_scores)
        bbox_preds_list = levels_to_images(bbox_preds)

        cls_reg_targets = self.get_targets(
            cls_scores_list,
            bbox_preds_list,
            anchor_list,
            valid_flag_list,
            batch_gt_instances,
            batch_img_metas,
            batch_gt_instances_ignore=batch_gt_instances_ignore)
        if cls_reg_targets is None:
            return None
        (batch_labels, batch_label_weights, avg_factor, batch_bbox_weights,
         batch_pos_predicted_boxes, batch_target_boxes) = cls_reg_targets

        flatten_labels = batch_labels.reshape(-1)
        batch_label_weights = batch_label_weights.reshape(-1)
        cls_score = cls_scores[0].permute(0, 2, 3,
                                          1).reshape(-1, self.cls_out_channels)

        avg_factor = reduce_mean(
            torch.tensor(avg_factor, dtype=torch.float, device=device)).item()

        # classification loss
        loss_cls = self.loss_cls(
            cls_score,
            flatten_labels,
            batch_label_weights,
            avg_factor=avg_factor)

        # regression loss
        if batch_pos_predicted_boxes.shape[0] == 0:
            # no positive samples: produce a zero loss that keeps the graph
            loss_bbox = batch_pos_predicted_boxes.sum() * 0
        else:
            loss_bbox = self.loss_bbox(
                batch_pos_predicted_boxes,
                batch_target_boxes,
                batch_bbox_weights.float(),
                avg_factor=avg_factor)

        return dict(loss_cls=loss_cls, loss_bbox=loss_bbox)

    def get_targets(self,
                    cls_scores_list: List[Tensor],
                    bbox_preds_list: List[Tensor],
                    anchor_list: List[Tensor],
                    valid_flag_list: List[Tensor],
                    batch_gt_instances: InstanceList,
                    batch_img_metas: List[dict],
                    batch_gt_instances_ignore: OptInstanceList = None,
                    unmap_outputs: bool = True):
        """Compute regression and classification targets for anchors in
        multiple images.

        Args:
            cls_scores_list (list[Tensor]): Classification scores of each
                image. Each is a 2D-tensor of shape
                (h * w, num_anchors * num_classes).
            bbox_preds_list (list[Tensor]): Bbox preds of each image. Each
                is a 2D-tensor of shape (h * w, num_anchors * 4).
            anchor_list (list[Tensor]): Anchors of each image. Each element
                is a tensor of shape (h * w * num_anchors, 4).
            valid_flag_list (list[Tensor]): Valid flags of each image. Each
                element is a tensor of shape (h * w * num_anchors, ).
            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
                gt_instance. It usually includes ``bboxes`` and ``labels``
                attributes.
            batch_img_metas (list[dict]): Meta information of each image,
                e.g., image size, scaling factor, etc.
            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
                Batch of gt_instances_ignore. It includes ``bboxes``
                attribute data that is ignored during training and testing.
                Defaults to None.
            unmap_outputs (bool): Whether to map outputs back to the
                original set of anchors.

        Returns:
            tuple: Usually returns a tuple containing learning targets.

                - batch_labels (Tensor): Labels of all images, a tensor of \
                    shape (batch, h * w * num_anchors).
                - batch_label_weights (Tensor): Label weights of all \
                    images, a tensor of shape (batch, h * w * num_anchors).
                - avg_factor (int): Averaging factor used to normalize the \
                    loss, accumulated over all images.

            additional_returns: This function enables user-defined returns
                from `self._get_targets_single`. These returns are currently
                refined to properties at each feature map (i.e. having HxW
                dimension). The results will be concatenated at the end.
        """
        num_imgs = len(batch_img_metas)
        assert len(anchor_list) == len(valid_flag_list) == num_imgs

        # compute targets for each image
        if batch_gt_instances_ignore is None:
            batch_gt_instances_ignore = [None] * num_imgs
        results = multi_apply(
            self._get_targets_single,
            bbox_preds_list,
            anchor_list,
            valid_flag_list,
            batch_gt_instances,
            batch_img_metas,
            batch_gt_instances_ignore,
            unmap_outputs=unmap_outputs)
        (all_labels, all_label_weights, pos_inds, neg_inds,
         sampling_results_list) = results[:5]
        # Get `avg_factor` of all images, which is calculated in
        # `SamplingResult`. When using a sampling method, `avg_factor` is
        # usually the sum of positive and negative priors. When using
        # `PseudoSampler`, `avg_factor` is usually equal to the number of
        # positive priors.
        avg_factor = sum(
            [results.avg_factor for results in sampling_results_list])
        rest_results = list(results[5:])  # user-added return values

        batch_labels = torch.stack(all_labels, 0)
        batch_label_weights = torch.stack(all_label_weights, 0)

        res = (batch_labels, batch_label_weights, avg_factor)
        for i, rests in enumerate(rest_results):  # user-added return values
            rest_results[i] = torch.cat(rests, 0)

        return res + tuple(rest_results)

    def _get_targets_single(self,
                            bbox_preds: Tensor,
                            flat_anchors: Tensor,
                            valid_flags: Tensor,
                            gt_instances: InstanceData,
                            img_meta: dict,
                            gt_instances_ignore: Optional[
                                InstanceData] = None,
                            unmap_outputs: bool = True) -> tuple:
        """Compute regression and classification targets for anchors in a
        single image.

        Args:
            bbox_preds (Tensor): Bbox predictions of the image, with shape
                (h * w, 4).
            flat_anchors (Tensor): Anchors of the image, with shape
                (h * w * num_anchors, 4).
            valid_flags (Tensor): Valid flags of the image, with shape
                (h * w * num_anchors, ).
            gt_instances (:obj:`InstanceData`): Ground truth of instance
                annotations. It should include ``bboxes`` and ``labels``
                attributes.
            img_meta (dict): Meta information for the current image.
            gt_instances_ignore (:obj:`InstanceData`, optional): Instances
                to be ignored during training. It includes ``bboxes``
                attribute data that is ignored during training and testing.
                Defaults to None.
            unmap_outputs (bool): Whether to map outputs back to the
                original set of anchors.

        Returns:
            tuple:
                labels (Tensor): Labels of the image, with shape
                    (h * w * num_anchors, ).
                label_weights (Tensor): Label weights of the image, with
                    shape (h * w * num_anchors, ).
                pos_inds (Tensor): Positive indices of the image.
                neg_inds (Tensor): Negative indices of the image.
                sampling_result (:obj:`SamplingResult`): Sampling result.
                pos_bbox_weights (Tensor): Weights used to calculate the
                    bbox branch loss, with shape (num, ).
                pos_predicted_boxes (Tensor): Predicted boxes used to
                    calculate the bbox branch loss, with shape (num, 4).
                pos_target_boxes (Tensor): Target boxes used to calculate
                    the bbox branch loss, with shape (num, 4).
        """
        inside_flags = anchor_inside_flags(flat_anchors, valid_flags,
                                           img_meta['img_shape'][:2],
                                           self.train_cfg['allowed_border'])
        if not inside_flags.any():
            raise ValueError(
                'There is no valid anchor inside the image boundary. Please '
                'check the image size and anchor sizes, or set '
                '``allowed_border`` to -1 to skip the condition.')

        # assign gt and sample anchors
        anchors = flat_anchors[inside_flags, :]
        bbox_preds = bbox_preds.reshape(-1, 4)
        bbox_preds = bbox_preds[inside_flags, :]

        # decoded bbox
        decoder_bbox_preds = self.bbox_coder.decode(anchors, bbox_preds)
        pred_instances = InstanceData(
            priors=anchors, decoder_priors=decoder_bbox_preds)
        assign_result = self.assigner.assign(pred_instances, gt_instances,
                                             gt_instances_ignore)
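        # YOLOF's UniformAssigner stashes these assigner-specific tensors on
        # the assign result via its extra-property mechanism; they feed the
        # bbox branch loss directly in ``loss_by_feat``.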
        pos_bbox_weights = assign_result.get_extra_property('pos_idx')
        pos_predicted_boxes = assign_result.get_extra_property(
            'pos_predicted_boxes')
        pos_target_boxes = assign_result.get_extra_property('target_boxes')

        sampling_result = self.sampler.sample(assign_result, pred_instances,
                                              gt_instances)
        num_valid_anchors = anchors.shape[0]
        labels = anchors.new_full((num_valid_anchors, ),
                                  self.num_classes,
                                  dtype=torch.long)
        label_weights = anchors.new_zeros(
            num_valid_anchors, dtype=torch.float)
        pos_inds = sampling_result.pos_inds
        neg_inds = sampling_result.neg_inds
        if len(pos_inds) > 0:
            labels[pos_inds] = sampling_result.pos_gt_labels
            if self.train_cfg['pos_weight'] <= 0:
                label_weights[pos_inds] = 1.0
            else:
                label_weights[pos_inds] = self.train_cfg['pos_weight']
        if len(neg_inds) > 0:
            label_weights[neg_inds] = 1.0

        # map up to original set of anchors
        if unmap_outputs:
            num_total_anchors = flat_anchors.size(0)
            labels = unmap(
                labels, num_total_anchors, inside_flags,
                fill=self.num_classes)  # fill bg label
            label_weights = unmap(label_weights, num_total_anchors,
                                  inside_flags)

        return (labels, label_weights, pos_inds, neg_inds, sampling_result,
                pos_bbox_weights, pos_predicted_boxes, pos_target_boxes)
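

if __name__ == '__main__':
    # A minimal sanity-check sketch: it verifies numerically that the
    # implicit-objectness fusion used in ``forward_single`` is equivalent to
    # multiplying class and objectness probabilities in sigmoid space. The
    # tensor shapes below are arbitrary assumptions, not values used by the
    # head itself.
    a = torch.randn(2, 5, 20, 4, 4)  # per-anchor class logits
    b = torch.randn(2, 5, 1, 4, 4)  # per-anchor objectness logits
    fused = a + b - torch.log(1. + torch.clamp(a.exp(), max=INF) +
                              torch.clamp(b.exp(), max=INF))
    assert torch.allclose(fused.sigmoid(), a.sigmoid() * b.sigmoid(),
                          atol=1e-5)
    print('implicit objectness fusion == sigmoid(a) * sigmoid(b)')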