csp_darknet.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286
  1. # Copyright (c) OpenMMLab. All rights reserved.
  2. import math
  3. import torch
  4. import torch.nn as nn
  5. from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule
  6. from mmengine.model import BaseModule
  7. from torch.nn.modules.batchnorm import _BatchNorm
  8. from mmdet.registry import MODELS
  9. from ..layers import CSPLayer
  10. class Focus(nn.Module):
  11. """Focus width and height information into channel space.
  12. Args:
  13. in_channels (int): The input channels of this Module.
  14. out_channels (int): The output channels of this Module.
  15. kernel_size (int): The kernel size of the convolution. Default: 1
  16. stride (int): The stride of the convolution. Default: 1
  17. conv_cfg (dict): Config dict for convolution layer. Default: None,
  18. which means using conv2d.
  19. norm_cfg (dict): Config dict for normalization layer.
  20. Default: dict(type='BN', momentum=0.03, eps=0.001).
  21. act_cfg (dict): Config dict for activation layer.
  22. Default: dict(type='Swish').
  23. """
  24. def __init__(self,
  25. in_channels,
  26. out_channels,
  27. kernel_size=1,
  28. stride=1,
  29. conv_cfg=None,
  30. norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
  31. act_cfg=dict(type='Swish')):
  32. super().__init__()
  33. self.conv = ConvModule(
  34. in_channels * 4,
  35. out_channels,
  36. kernel_size,
  37. stride,
  38. padding=(kernel_size - 1) // 2,
  39. conv_cfg=conv_cfg,
  40. norm_cfg=norm_cfg,
  41. act_cfg=act_cfg)
  42. def forward(self, x):
  43. # shape of x (b,c,w,h) -> y(b,4c,w/2,h/2)
  44. patch_top_left = x[..., ::2, ::2]
  45. patch_top_right = x[..., ::2, 1::2]
  46. patch_bot_left = x[..., 1::2, ::2]
  47. patch_bot_right = x[..., 1::2, 1::2]
  48. x = torch.cat(
  49. (
  50. patch_top_left,
  51. patch_bot_left,
  52. patch_top_right,
  53. patch_bot_right,
  54. ),
  55. dim=1,
  56. )
  57. return self.conv(x)
  58. class SPPBottleneck(BaseModule):
  59. """Spatial pyramid pooling layer used in YOLOv3-SPP.
  60. Args:
  61. in_channels (int): The input channels of this Module.
  62. out_channels (int): The output channels of this Module.
  63. kernel_sizes (tuple[int]): Sequential of kernel sizes of pooling
  64. layers. Default: (5, 9, 13).
  65. conv_cfg (dict): Config dict for convolution layer. Default: None,
  66. which means using conv2d.
  67. norm_cfg (dict): Config dict for normalization layer.
  68. Default: dict(type='BN').
  69. act_cfg (dict): Config dict for activation layer.
  70. Default: dict(type='Swish').
  71. init_cfg (dict or list[dict], optional): Initialization config dict.
  72. Default: None.
  73. """
  74. def __init__(self,
  75. in_channels,
  76. out_channels,
  77. kernel_sizes=(5, 9, 13),
  78. conv_cfg=None,
  79. norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
  80. act_cfg=dict(type='Swish'),
  81. init_cfg=None):
  82. super().__init__(init_cfg)
  83. mid_channels = in_channels // 2
  84. self.conv1 = ConvModule(
  85. in_channels,
  86. mid_channels,
  87. 1,
  88. stride=1,
  89. conv_cfg=conv_cfg,
  90. norm_cfg=norm_cfg,
  91. act_cfg=act_cfg)
  92. self.poolings = nn.ModuleList([
  93. nn.MaxPool2d(kernel_size=ks, stride=1, padding=ks // 2)
  94. for ks in kernel_sizes
  95. ])
  96. conv2_channels = mid_channels * (len(kernel_sizes) + 1)
  97. self.conv2 = ConvModule(
  98. conv2_channels,
  99. out_channels,
  100. 1,
  101. conv_cfg=conv_cfg,
  102. norm_cfg=norm_cfg,
  103. act_cfg=act_cfg)
  104. def forward(self, x):
  105. x = self.conv1(x)
  106. with torch.cuda.amp.autocast(enabled=False):
  107. x = torch.cat(
  108. [x] + [pooling(x) for pooling in self.poolings], dim=1)
  109. x = self.conv2(x)
  110. return x
  111. @MODELS.register_module()
  112. class CSPDarknet(BaseModule):
  113. """CSP-Darknet backbone used in YOLOv5 and YOLOX.
  114. Args:
  115. arch (str): Architecture of CSP-Darknet, from {P5, P6}.
  116. Default: P5.
  117. deepen_factor (float): Depth multiplier, multiply number of
  118. blocks in CSP layer by this amount. Default: 1.0.
  119. widen_factor (float): Width multiplier, multiply number of
  120. channels in each layer by this amount. Default: 1.0.
  121. out_indices (Sequence[int]): Output from which stages.
  122. Default: (2, 3, 4).
  123. frozen_stages (int): Stages to be frozen (stop grad and set eval
  124. mode). -1 means not freezing any parameters. Default: -1.
  125. use_depthwise (bool): Whether to use depthwise separable convolution.
  126. Default: False.
  127. arch_ovewrite(list): Overwrite default arch settings. Default: None.
  128. spp_kernal_sizes: (tuple[int]): Sequential of kernel sizes of SPP
  129. layers. Default: (5, 9, 13).
  130. conv_cfg (dict): Config dict for convolution layer. Default: None.
  131. norm_cfg (dict): Dictionary to construct and config norm layer.
  132. Default: dict(type='BN', requires_grad=True).
  133. act_cfg (dict): Config dict for activation layer.
  134. Default: dict(type='LeakyReLU', negative_slope=0.1).
  135. norm_eval (bool): Whether to set norm layers to eval mode, namely,
  136. freeze running stats (mean and var). Note: Effect on Batch Norm
  137. and its variants only.
  138. init_cfg (dict or list[dict], optional): Initialization config dict.
  139. Default: None.
  140. Example:
  141. >>> from mmdet.models import CSPDarknet
  142. >>> import torch
  143. >>> self = CSPDarknet(depth=53)
  144. >>> self.eval()
  145. >>> inputs = torch.rand(1, 3, 416, 416)
  146. >>> level_outputs = self.forward(inputs)
  147. >>> for level_out in level_outputs:
  148. ... print(tuple(level_out.shape))
  149. ...
  150. (1, 256, 52, 52)
  151. (1, 512, 26, 26)
  152. (1, 1024, 13, 13)
  153. """
  154. # From left to right:
  155. # in_channels, out_channels, num_blocks, add_identity, use_spp
  156. arch_settings = {
  157. 'P5': [[64, 128, 3, True, False], [128, 256, 9, True, False],
  158. [256, 512, 9, True, False], [512, 1024, 3, False, True]],
  159. 'P6': [[64, 128, 3, True, False], [128, 256, 9, True, False],
  160. [256, 512, 9, True, False], [512, 768, 3, True, False],
  161. [768, 1024, 3, False, True]]
  162. }
  163. def __init__(self,
  164. arch='P5',
  165. deepen_factor=1.0,
  166. widen_factor=1.0,
  167. out_indices=(2, 3, 4),
  168. frozen_stages=-1,
  169. use_depthwise=False,
  170. arch_ovewrite=None,
  171. spp_kernal_sizes=(5, 9, 13),
  172. conv_cfg=None,
  173. norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
  174. act_cfg=dict(type='Swish'),
  175. norm_eval=False,
  176. init_cfg=dict(
  177. type='Kaiming',
  178. layer='Conv2d',
  179. a=math.sqrt(5),
  180. distribution='uniform',
  181. mode='fan_in',
  182. nonlinearity='leaky_relu')):
  183. super().__init__(init_cfg)
  184. arch_setting = self.arch_settings[arch]
  185. if arch_ovewrite:
  186. arch_setting = arch_ovewrite
  187. assert set(out_indices).issubset(
  188. i for i in range(len(arch_setting) + 1))
  189. if frozen_stages not in range(-1, len(arch_setting) + 1):
  190. raise ValueError('frozen_stages must be in range(-1, '
  191. 'len(arch_setting) + 1). But received '
  192. f'{frozen_stages}')
  193. self.out_indices = out_indices
  194. self.frozen_stages = frozen_stages
  195. self.use_depthwise = use_depthwise
  196. self.norm_eval = norm_eval
  197. conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule
  198. self.stem = Focus(
  199. 3,
  200. int(arch_setting[0][0] * widen_factor),
  201. kernel_size=3,
  202. conv_cfg=conv_cfg,
  203. norm_cfg=norm_cfg,
  204. act_cfg=act_cfg)
  205. self.layers = ['stem']
  206. for i, (in_channels, out_channels, num_blocks, add_identity,
  207. use_spp) in enumerate(arch_setting):
  208. in_channels = int(in_channels * widen_factor)
  209. out_channels = int(out_channels * widen_factor)
  210. num_blocks = max(round(num_blocks * deepen_factor), 1)
  211. stage = []
  212. conv_layer = conv(
  213. in_channels,
  214. out_channels,
  215. 3,
  216. stride=2,
  217. padding=1,
  218. conv_cfg=conv_cfg,
  219. norm_cfg=norm_cfg,
  220. act_cfg=act_cfg)
  221. stage.append(conv_layer)
  222. if use_spp:
  223. spp = SPPBottleneck(
  224. out_channels,
  225. out_channels,
  226. kernel_sizes=spp_kernal_sizes,
  227. conv_cfg=conv_cfg,
  228. norm_cfg=norm_cfg,
  229. act_cfg=act_cfg)
  230. stage.append(spp)
  231. csp_layer = CSPLayer(
  232. out_channels,
  233. out_channels,
  234. num_blocks=num_blocks,
  235. add_identity=add_identity,
  236. use_depthwise=use_depthwise,
  237. conv_cfg=conv_cfg,
  238. norm_cfg=norm_cfg,
  239. act_cfg=act_cfg)
  240. stage.append(csp_layer)
  241. self.add_module(f'stage{i + 1}', nn.Sequential(*stage))
  242. self.layers.append(f'stage{i + 1}')
  243. def _freeze_stages(self):
  244. if self.frozen_stages >= 0:
  245. for i in range(self.frozen_stages + 1):
  246. m = getattr(self, self.layers[i])
  247. m.eval()
  248. for param in m.parameters():
  249. param.requires_grad = False
  250. def train(self, mode=True):
  251. super(CSPDarknet, self).train(mode)
  252. self._freeze_stages()
  253. if mode and self.norm_eval:
  254. for m in self.modules():
  255. if isinstance(m, _BatchNorm):
  256. m.eval()
  257. def forward(self, x):
  258. outs = []
  259. for i, layer_name in enumerate(self.layers):
  260. layer = getattr(self, layer_name)
  261. x = layer(x)
  262. if i in self.out_indices:
  263. outs.append(x)
  264. return tuple(outs)