# Copyright (c) OpenMMLab. All rights reserved.
import copy
import math
from functools import partial

import torch
import torch.nn as nn
import torch.utils.checkpoint as cp
from mmcv.cnn.bricks import ConvModule, DropPath
from mmengine.model import BaseModule, Sequential

from mmdet.registry import MODELS
from ..layers import InvertedResidual, SELayer
from ..utils import make_divisible


class EdgeResidual(BaseModule):
    """Edge Residual Block.

    Args:
        in_channels (int): The input channels of this module.
        out_channels (int): The output channels of this module.
        mid_channels (int): The input channels of the second convolution.
        kernel_size (int): The kernel size of the first convolution.
            Defaults to 3.
        stride (int): The stride of the first convolution. Defaults to 1.
        se_cfg (dict, optional): Config dict for se layer. Defaults to None,
            which means no se layer.
        with_residual (bool): Use residual connection. Defaults to True.
        conv_cfg (dict, optional): Config dict for convolution layer.
            Defaults to None, which means using conv2d.
        norm_cfg (dict): Config dict for normalization layer.
            Defaults to ``dict(type='BN')``.
        act_cfg (dict): Config dict for activation layer.
            Defaults to ``dict(type='ReLU')``.
        drop_path_rate (float): Stochastic depth rate. Defaults to 0.
        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
            memory while slowing down the training speed. Defaults to False.
        init_cfg (dict | list[dict], optional): Initialization config dict.
            Defaults to None.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 mid_channels,
                 kernel_size=3,
                 stride=1,
                 se_cfg=None,
                 with_residual=True,
                 conv_cfg=None,
                 norm_cfg=dict(type='BN'),
                 act_cfg=dict(type='ReLU'),
                 drop_path_rate=0.,
                 with_cp=False,
                 init_cfg=None,
                 **kwargs):
        super(EdgeResidual, self).__init__(init_cfg=init_cfg)
        assert stride in [1, 2]
        self.with_cp = with_cp
        self.drop_path = DropPath(
            drop_path_rate) if drop_path_rate > 0 else nn.Identity()
        self.with_se = se_cfg is not None
        self.with_residual = (
            stride == 1 and in_channels == out_channels and with_residual)

        if self.with_se:
            assert isinstance(se_cfg, dict)

        self.conv1 = ConvModule(
            in_channels=in_channels,
            out_channels=mid_channels,
            kernel_size=kernel_size,
            stride=1,
            padding=kernel_size // 2,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            act_cfg=act_cfg)

        if self.with_se:
            self.se = SELayer(**se_cfg)

        self.conv2 = ConvModule(
            in_channels=mid_channels,
            out_channels=out_channels,
            kernel_size=1,
            stride=stride,
            padding=0,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            act_cfg=None)

    def forward(self, x):

        def _inner_forward(x):
            out = x
            out = self.conv1(out)

            if self.with_se:
                out = self.se(out)

            out = self.conv2(out)

            if self.with_residual:
                return x + self.drop_path(out)
            else:
                return out

        if self.with_cp and x.requires_grad:
            out = cp.checkpoint(_inner_forward, x)
        else:
            out = _inner_forward(x)

        return out
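

# A minimal usage sketch of EdgeResidual (illustrative, not part of the
# original module): with stride 1 and matching in/out channels the residual
# connection is kept, so the output shape equals the input shape.
#
#     block = EdgeResidual(
#         in_channels=24, out_channels=24, mid_channels=96, kernel_size=3)
#     x = torch.randn(1, 24, 56, 56)
#     assert tuple(block(x).shape) == (1, 24, 56, 56)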


def model_scaling(layer_setting, arch_setting):
    """Scale a base layer setting according to ``arch_setting``.

    The width factor ``arch_setting[0]`` widens every block's output
    channels (rounded to a multiple of 8) and the depth factor
    ``arch_setting[1]`` deepens every sub-stage by repeating its last block.
    """
    # scale width
    new_layer_setting = copy.deepcopy(layer_setting)
    for layer_cfg in new_layer_setting:
        for block_cfg in layer_cfg:
            block_cfg[1] = make_divisible(block_cfg[1] * arch_setting[0], 8)

    # scale depth
    split_layer_setting = [new_layer_setting[0]]
    for layer_cfg in new_layer_setting[1:-1]:
        tmp_index = [0]
        for i in range(len(layer_cfg) - 1):
            if layer_cfg[i + 1][1] != layer_cfg[i][1]:
                tmp_index.append(i + 1)
        tmp_index.append(len(layer_cfg))
        for i in range(len(tmp_index) - 1):
            split_layer_setting.append(
                layer_cfg[tmp_index[i]:tmp_index[i + 1]])
    split_layer_setting.append(new_layer_setting[-1])

    num_of_layers = [len(layer_cfg) for layer_cfg in split_layer_setting[1:-1]]
    new_layers = [
        int(math.ceil(arch_setting[1] * num)) for num in num_of_layers
    ]

    merge_layer_setting = [split_layer_setting[0]]
    for i, layer_cfg in enumerate(split_layer_setting[1:-1]):
        if new_layers[i] <= num_of_layers[i]:
            tmp_layer_cfg = layer_cfg[:new_layers[i]]
        else:
            tmp_layer_cfg = copy.deepcopy(layer_cfg) + [layer_cfg[-1]] * (
                new_layers[i] - num_of_layers[i])
        if tmp_layer_cfg[0][3] == 1 and i != 0:
            merge_layer_setting[-1] += tmp_layer_cfg.copy()
        else:
            merge_layer_setting.append(tmp_layer_cfg.copy())
    merge_layer_setting.append(split_layer_setting[-1])

    return merge_layer_setting
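

# A worked sketch of model_scaling (illustrative, with made-up settings):
#
#     setting = [[[3, 32, 0, 2, 0, -1]],
#                [[3, 40, 4, 2, 6, 0], [3, 40, 4, 1, 6, 0]],
#                [[1, 1280, 0, 1, 0, -1]]]
#     scaled = model_scaling(setting, (1.4, 1.8, 380))  # 'b4' factors
#
# Width: 40 -> make_divisible(40 * 1.4, 8) = 56. Depth: the 2-block stage
# grows to math.ceil(1.8 * 2) = 4 blocks by repeating its last block config.
# The first and last conv layers are width-scaled but never depth-scaled.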


@MODELS.register_module()
class EfficientNet(BaseModule):
    """EfficientNet backbone.

    Args:
        arch (str): Architecture of efficientnet. Defaults to 'b0'.
        drop_path_rate (float): Stochastic depth rate. Defaults to 0.
        out_indices (Sequence[int]): Output from which stages.
            Defaults to (6, ).
        frozen_stages (int): Stages to be frozen (all param fixed).
            Defaults to 0, which means not freezing any parameters.
        conv_cfg (dict): Config dict for convolution layer.
            Defaults to ``dict(type='Conv2dAdaptivePadding')``.
        norm_cfg (dict): Config dict for normalization layer.
            Defaults to ``dict(type='BN', eps=1e-3)``.
        act_cfg (dict): Config dict for activation layer.
            Defaults to ``dict(type='Swish')``.
        norm_eval (bool): Whether to set norm layers to eval mode, namely,
            freeze running stats (mean and var). Note: Effect on Batch Norm
            and its variants only. Defaults to False.
        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
            memory while slowing down the training speed. Defaults to False.
        init_cfg (dict | list[dict], optional): Initialization config dict.
    """

    # Parameters to build layers.
    # 'b' represents the architecture of the normal EfficientNet family,
    # including 'b0', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7' and 'b8'.
    # 'e' represents the architecture of EfficientNet-EdgeTPU, including
    # 'es', 'em' and 'el'.
    # Six parameters are needed to construct a layer. From left to right:
    # - kernel_size: The kernel size of the block
    # - out_channel: The number of out_channels of the block
    # - se_ratio: The squeeze ratio of SELayer.
    # - stride: The stride of the block
    # - expand_ratio: The expand_ratio of the mid_channels
    # - block_type: -1: Not a block, 0: InvertedResidual, 1: EdgeResidual
    layer_settings = {
        'b': [[[3, 32, 0, 2, 0, -1]],
              [[3, 16, 4, 1, 1, 0]],
              [[3, 24, 4, 2, 6, 0],
               [3, 24, 4, 1, 6, 0]],
              [[5, 40, 4, 2, 6, 0],
               [5, 40, 4, 1, 6, 0]],
              [[3, 80, 4, 2, 6, 0],
               [3, 80, 4, 1, 6, 0],
               [3, 80, 4, 1, 6, 0],
               [5, 112, 4, 1, 6, 0],
               [5, 112, 4, 1, 6, 0],
               [5, 112, 4, 1, 6, 0]],
              [[5, 192, 4, 2, 6, 0],
               [5, 192, 4, 1, 6, 0],
               [5, 192, 4, 1, 6, 0],
               [5, 192, 4, 1, 6, 0],
               [3, 320, 4, 1, 6, 0]],
              [[1, 1280, 0, 1, 0, -1]]],
        'e': [[[3, 32, 0, 2, 0, -1]],
              [[3, 24, 0, 1, 3, 1]],
              [[3, 32, 0, 2, 8, 1],
               [3, 32, 0, 1, 8, 1]],
              [[3, 48, 0, 2, 8, 1],
               [3, 48, 0, 1, 8, 1],
               [3, 48, 0, 1, 8, 1],
               [3, 48, 0, 1, 8, 1]],
              [[5, 96, 0, 2, 8, 0],
               [5, 96, 0, 1, 8, 0],
               [5, 96, 0, 1, 8, 0],
               [5, 96, 0, 1, 8, 0],
               [5, 96, 0, 1, 8, 0],
               [5, 144, 0, 1, 8, 0],
               [5, 144, 0, 1, 8, 0],
               [5, 144, 0, 1, 8, 0],
               [5, 144, 0, 1, 8, 0]],
              [[5, 192, 0, 2, 8, 0],
               [5, 192, 0, 1, 8, 0]],
              [[1, 1280, 0, 1, 0, -1]]]
    }  # yapf: disable

    # Parameters to build different kinds of architecture.
    # From left to right: scaling factor for width, scaling factor for depth,
    # resolution.
    arch_settings = {
        'b0': (1.0, 1.0, 224),
        'b1': (1.0, 1.1, 240),
        'b2': (1.1, 1.2, 260),
        'b3': (1.2, 1.4, 300),
        'b4': (1.4, 1.8, 380),
        'b5': (1.6, 2.2, 456),
        'b6': (1.8, 2.6, 528),
        'b7': (2.0, 3.1, 600),
        'b8': (2.2, 3.6, 672),
        'es': (1.0, 1.0, 224),
        'em': (1.0, 1.1, 240),
        'el': (1.2, 1.4, 300)
    }
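    # Note: the resolution entry records each variant's nominal input size;
    # it is not used when building the layers below.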

    def __init__(self,
                 arch='b0',
                 drop_path_rate=0.,
                 out_indices=(6, ),
                 frozen_stages=0,
                 conv_cfg=dict(type='Conv2dAdaptivePadding'),
                 norm_cfg=dict(type='BN', eps=1e-3),
                 act_cfg=dict(type='Swish'),
                 norm_eval=False,
                 with_cp=False,
                 init_cfg=[
                     dict(type='Kaiming', layer='Conv2d'),
                     dict(
                         type='Constant',
                         layer=['_BatchNorm', 'GroupNorm'],
                         val=1)
                 ]):
        super(EfficientNet, self).__init__(init_cfg)
        assert arch in self.arch_settings, \
            f'"{arch}" is not one of the arch_settings ' \
            f'({", ".join(self.arch_settings.keys())})'
        self.arch_setting = self.arch_settings[arch]
        self.layer_setting = self.layer_settings[arch[:1]]
        for index in out_indices:
            if index not in range(0, len(self.layer_setting)):
                raise ValueError('the item in out_indices must be in '
                                 f'range(0, {len(self.layer_setting)}). '
                                 f'But received {index}')

        if frozen_stages not in range(len(self.layer_setting) + 1):
            raise ValueError('frozen_stages must be in '
                             f'range(0, {len(self.layer_setting) + 1}). '
                             f'But received {frozen_stages}')
        self.drop_path_rate = drop_path_rate
        self.out_indices = out_indices
        self.frozen_stages = frozen_stages
        self.conv_cfg = conv_cfg
        self.norm_cfg = norm_cfg
        self.act_cfg = act_cfg
        self.norm_eval = norm_eval
        self.with_cp = with_cp

        self.layer_setting = model_scaling(self.layer_setting,
                                           self.arch_setting)
        block_cfg_0 = self.layer_setting[0][0]
        block_cfg_last = self.layer_setting[-1][0]
        self.in_channels = make_divisible(block_cfg_0[1], 8)
        self.out_channels = block_cfg_last[1]
        self.layers = nn.ModuleList()
        self.layers.append(
            ConvModule(
                in_channels=3,
                out_channels=self.in_channels,
                kernel_size=block_cfg_0[0],
                stride=block_cfg_0[3],
                padding=block_cfg_0[0] // 2,
                conv_cfg=self.conv_cfg,
                norm_cfg=self.norm_cfg,
                act_cfg=self.act_cfg))
        self.make_layer()
        # Avoid building unused layers in mmdetection.
        if len(self.layers) < max(self.out_indices) + 1:
            self.layers.append(
                ConvModule(
                    in_channels=self.in_channels,
                    out_channels=self.out_channels,
                    kernel_size=block_cfg_last[0],
                    stride=block_cfg_last[3],
                    padding=block_cfg_last[0] // 2,
                    conv_cfg=self.conv_cfg,
                    norm_cfg=self.norm_cfg,
                    act_cfg=self.act_cfg))

    def make_layer(self):
        # Without the first and the final conv block.
        layer_setting = self.layer_setting[1:-1]

        total_num_blocks = sum([len(x) for x in layer_setting])
        block_idx = 0
        dpr = [
            x.item()
            for x in torch.linspace(0, self.drop_path_rate, total_num_blocks)
        ]  # stochastic depth decay rule

        for i, layer_cfg in enumerate(layer_setting):
            # Avoid building unused layers in mmdetection.
            if i > max(self.out_indices) - 1:
                break
            layer = []
            # `j` indexes blocks within the stage; a separate name avoids
            # shadowing the stage index `i` of the outer loop.
            for j, block_cfg in enumerate(layer_cfg):
                (kernel_size, out_channels, se_ratio, stride, expand_ratio,
                 block_type) = block_cfg
                mid_channels = int(self.in_channels * expand_ratio)
                out_channels = make_divisible(out_channels, 8)
                if se_ratio <= 0:
                    se_cfg = None
                else:
                    # In mmdetection, the `divisor` is deleted to align
                    # the logic of SELayer with mmcls.
                    se_cfg = dict(
                        channels=mid_channels,
                        ratio=expand_ratio * se_ratio,
                        act_cfg=(self.act_cfg, dict(type='Sigmoid')))
                if block_type == 1:  # edge tpu
                    if j > 0 and expand_ratio == 3:
                        with_residual = False
                        expand_ratio = 4
                    else:
                        with_residual = True
                    mid_channels = int(self.in_channels * expand_ratio)
                    if se_cfg is not None:
                        # In mmdetection, the `divisor` is deleted to align
                        # the logic of SELayer with mmcls.
                        se_cfg = dict(
                            channels=mid_channels,
                            ratio=se_ratio * expand_ratio,
                            act_cfg=(self.act_cfg, dict(type='Sigmoid')))
                    block = partial(EdgeResidual, with_residual=with_residual)
                else:
                    block = InvertedResidual
                layer.append(
                    block(
                        in_channels=self.in_channels,
                        out_channels=out_channels,
                        mid_channels=mid_channels,
                        kernel_size=kernel_size,
                        stride=stride,
                        se_cfg=se_cfg,
                        conv_cfg=self.conv_cfg,
                        norm_cfg=self.norm_cfg,
                        act_cfg=self.act_cfg,
                        drop_path_rate=dpr[block_idx],
                        with_cp=self.with_cp,
                        # In mmdetection, `with_expand_conv` is set to align
                        # the logic of InvertedResidual with mmcls.
                        with_expand_conv=(mid_channels != self.in_channels)))
                self.in_channels = out_channels
                block_idx += 1
            self.layers.append(Sequential(*layer))

    def forward(self, x):
        outs = []
        for i, layer in enumerate(self.layers):
            x = layer(x)
            if i in self.out_indices:
                outs.append(x)

        return tuple(outs)

    def _freeze_stages(self):
        for i in range(self.frozen_stages):
            m = self.layers[i]
            m.eval()
            for param in m.parameters():
                param.requires_grad = False

    def train(self, mode=True):
        super(EfficientNet, self).train(mode)
        self._freeze_stages()
        if mode and self.norm_eval:
            for m in self.modules():
                if isinstance(m, nn.BatchNorm2d):
                    m.eval()
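

if __name__ == '__main__':
    # A minimal smoke test (illustrative, not part of the original module).
    # With the default out_indices=(6, ), a 224x224 input to 'b0' should
    # yield a single (1, 1280, 7, 7) feature map from the final conv layer.
    model = EfficientNet(arch='b0')
    model.eval()
    with torch.no_grad():
        outs = model(torch.rand(1, 3, 224, 224))
    assert tuple(outs[-1].shape) == (1, 1280, 7, 7)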