# Copyright (c) OpenMMLab. All rights reserved.
import pytest
import torch
from mmengine.config import ConfigDict

from mmdet.models.layers.transformer import (AdaptivePadding,
                                             DetrTransformerDecoder,
                                             DetrTransformerEncoder,
                                             PatchEmbed, PatchMerging)
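

# A minimal reference for the shape asserts below: the standard convolution
# output-size formula along one spatial dim. ``expected_conv_out_size`` is an
# illustrative helper for this file, not an mmdet/mmcv API; the quick test
# cross-checks it against torch.nn.Conv2d.
def expected_conv_out_size(size, kernel_size, stride, padding, dilation):
    """Output length of a convolution along a single spatial dimension."""
    return (size + 2 * padding - dilation *
            (kernel_size - 1) - 1) // stride + 1


def test_expected_conv_out_size():
    conv = torch.nn.Conv2d(
        1, 1, kernel_size=5, stride=2, padding=0, dilation=2)
    out = conv(torch.rand(1, 1, 10, 10))
    assert out.shape[2] == expected_conv_out_size(10, 5, 2, 0, 2)
    assert out.shape[3] == expected_conv_out_size(10, 5, 2, 0, 2)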


def test_adaptive_padding():
    """AdaptivePadding pads inputs just enough for a sliding window to
    cover them, in either 'same' or 'corner' style."""
    for padding in ('same', 'corner'):
        kernel_size = 16
        stride = 16
        dilation = 1
        input = torch.rand(1, 1, 15, 17)
        pool = AdaptivePadding(
            kernel_size=kernel_size,
            stride=stride,
            dilation=dilation,
            padding=padding)
        out = pool(input)
        # pad 15 x 17 up to multiples of the 16 x 16 window
        assert (out.shape[2], out.shape[3]) == (16, 32)

        input = torch.rand(1, 1, 16, 17)
        out = pool(input)
        # the height is already divisible by 16; only the width is padded
        assert (out.shape[2], out.shape[3]) == (16, 32)
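
        # For reference (a hedged reading of the mmcv-style implementation):
        # along each dim, AdaptivePadding adds
        #   pad = max((ceil(size / stride) - 1) * stride
        #             + (kernel - 1) * dilation + 1 - size, 0),
        # so 15 -> pad 1 -> 16 and 17 -> pad 15 -> 32 above.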

        kernel_size = (2, 2)
        stride = (2, 2)
        dilation = (1, 1)
        adap_pad = AdaptivePadding(
            kernel_size=kernel_size,
            stride=stride,
            dilation=dilation,
            padding=padding)
        input = torch.rand(1, 1, 11, 13)
        out = adap_pad(input)
        # pad 11 x 13 up to the next multiples of 2
        assert (out.shape[2], out.shape[3]) == (12, 14)

        kernel_size = (2, 2)
        stride = (10, 10)
        dilation = (1, 1)
        adap_pad = AdaptivePadding(
            kernel_size=kernel_size,
            stride=stride,
            dilation=dilation,
            padding=padding)
        input = torch.rand(1, 1, 10, 13)
        out = adap_pad(input)
        # the 2 x 2 windows at stride 10 already fit: no padding
        assert (out.shape[2], out.shape[3]) == (10, 13)

        kernel_size = (11, 11)
        adap_pad = AdaptivePadding(
            kernel_size=kernel_size,
            stride=stride,
            dilation=dilation,
            padding=padding)
        input = torch.rand(1, 1, 11, 13)
        out = adap_pad(input)
        # both dims are padded so two 11 x 11 windows at stride 10 fit
        assert (out.shape[2], out.shape[3]) == (21, 21)

        # test that dilation is folded into the effective kernel size:
        # dilation * (kernel - 1) + 1 = (7, 9) here
        input = torch.rand(1, 1, 11, 13)
        stride = (3, 4)
        kernel_size = (4, 5)
        dilation = (2, 2)
        adap_pad = AdaptivePadding(
            kernel_size=kernel_size,
            stride=stride,
            dilation=dilation,
            padding=padding)
        dilation_out = adap_pad(input)
        assert (dilation_out.shape[2], dilation_out.shape[3]) == (16, 21)

        # an explicit (7, 9) kernel without dilation pads identically
        kernel_size = (7, 9)
        dilation = (1, 1)
        adap_pad = AdaptivePadding(
            kernel_size=kernel_size,
            stride=stride,
            dilation=dilation,
            padding=padding)
        kernel79_out = adap_pad(input)
        assert (kernel79_out.shape[2], kernel79_out.shape[3]) == (16, 21)
        assert kernel79_out.shape == dilation_out.shape

    # only 'same' and 'corner' padding modes are supported
    with pytest.raises(AssertionError):
        AdaptivePadding(
            kernel_size=kernel_size,
            stride=stride,
            dilation=dilation,
            padding=1)
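

def test_adaptive_padding_corner_placement():
    # A hedged sanity check, assuming the mmcv-style behaviour where
    # 'corner' pads only on the bottom/right (as in Swin), so the original
    # tensor is preserved at the top-left of the padded output.
    x = torch.rand(1, 1, 15, 17)
    adap_pad = AdaptivePadding(
        kernel_size=16, stride=16, dilation=1, padding='corner')
    out = adap_pad(x)
    assert torch.equal(out[..., :15, :17], x)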


def test_patch_embed():
    B = 2
    H = 3
    W = 4
    C = 3
    embed_dims = 10
    kernel_size = 3
    stride = 1
    dummy_input = torch.rand(B, C, H, W)
    patch_merge_1 = PatchEmbed(
        in_channels=C,
        embed_dims=embed_dims,
        kernel_size=kernel_size,
        stride=stride,
        padding=0,
        dilation=1,
        norm_cfg=None)

    x1, shape = patch_merge_1(dummy_input)
    # test output shape
    assert x1.shape == (2, 2, 10)
    # test that the reported out size is correct
    assert shape == (1, 2)
    # test L = out_h * out_w
    assert shape[0] * shape[1] == x1.shape[1]
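
    # PatchEmbed returns a (B, out_h * out_w, embed_dims) token sequence plus
    # the 2-D out size; with H=3, W=4, kernel 3, stride 1 and no padding,
    # out_h = (3 - 3) // 1 + 1 = 1 and out_w = (4 - 3) // 1 + 1 = 2.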

    B = 2
    H = 10
    W = 10
    C = 3
    embed_dims = 10
    kernel_size = 5
    stride = 2
    dummy_input = torch.rand(B, C, H, W)
    # test dilation
    patch_merge_2 = PatchEmbed(
        in_channels=C,
        embed_dims=embed_dims,
        kernel_size=kernel_size,
        stride=stride,
        padding=0,
        dilation=2,
        norm_cfg=None)

    x2, shape = patch_merge_2(dummy_input)
    # test output shape
    assert x2.shape == (2, 1, 10)
    # test that the reported out size is correct
    assert shape == (1, 1)
    # test L = out_h * out_w
    assert shape[0] * shape[1] == x2.shape[1]

    stride = 2
    input_size = (10, 10)
    dummy_input = torch.rand(B, C, H, W)
    # test stride and norm
    patch_merge_3 = PatchEmbed(
        in_channels=C,
        embed_dims=embed_dims,
        kernel_size=kernel_size,
        stride=stride,
        padding=0,
        dilation=2,
        norm_cfg=dict(type='LN'),
        input_size=input_size)

    x3, shape = patch_merge_3(dummy_input)
    # test output shape
    assert x3.shape == (2, 1, 10)
    # test that the reported out size is correct
    assert shape == (1, 1)
    # test L = out_h * out_w
    assert shape[0] * shape[1] == x3.shape[1]

    # test the `init_out_size` precomputed with the nn.Unfold formula,
    # where dilation * (kernel_size - 1) = 2 * 4
    assert patch_merge_3.init_out_size[1] == (input_size[1] - 2 * 4 -
                                              1) // 2 + 1
    assert patch_merge_3.init_out_size[0] == (input_size[0] - 2 * 4 -
                                              1) // 2 + 1
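
    # Cross-check against the illustrative helper defined at the top of this
    # file: the precomputed size follows the same convolution formula.
    assert patch_merge_3.init_out_size == (
        expected_conv_out_size(input_size[0], kernel_size, stride, 0, 2),
        expected_conv_out_size(input_size[1], kernel_size, stride, 0, 2))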

    H = 11
    W = 12
    input_size = (H, W)
    dummy_input = torch.rand(B, C, H, W)
    # test stride and norm
    patch_merge_3 = PatchEmbed(
        in_channels=C,
        embed_dims=embed_dims,
        kernel_size=kernel_size,
        stride=stride,
        padding=0,
        dilation=2,
        norm_cfg=dict(type='LN'),
        input_size=input_size)

    _, shape = patch_merge_3(dummy_input)
    # when input_size equals the real input size,
    # out_size should equal `init_out_size`
    assert shape == patch_merge_3.init_out_size

    # test adaptive padding
    for padding in ('same', 'corner'):
        in_c = 2
        embed_dims = 3
        B = 2

        # test stride is 1
        input_size = (5, 5)
        kernel_size = (5, 5)
        stride = (1, 1)
        dilation = 1
        bias = False

        x = torch.rand(B, in_c, *input_size)
        patch_embed = PatchEmbed(
            in_channels=in_c,
            embed_dims=embed_dims,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            bias=bias)

        x_out, out_size = patch_embed(x)
        assert x_out.size() == (B, 25, 3)
        assert out_size == (5, 5)
        assert x_out.size(1) == out_size[0] * out_size[1]

        # test kernel_size == stride
        input_size = (5, 5)
        kernel_size = (5, 5)
        stride = (5, 5)
        dilation = 1
        bias = False

        x = torch.rand(B, in_c, *input_size)
        patch_embed = PatchEmbed(
            in_channels=in_c,
            embed_dims=embed_dims,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            bias=bias)

        x_out, out_size = patch_embed(x)
        assert x_out.size() == (B, 1, 3)
        assert out_size == (1, 1)
        assert x_out.size(1) == out_size[0] * out_size[1]

        # test kernel_size == stride on an input that needs padding
        input_size = (6, 5)
        kernel_size = (5, 5)
        stride = (5, 5)
        dilation = 1
        bias = False

        x = torch.rand(B, in_c, *input_size)
        patch_embed = PatchEmbed(
            in_channels=in_c,
            embed_dims=embed_dims,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            bias=bias)

        x_out, out_size = patch_embed(x)
        assert x_out.size() == (B, 2, 3)
        assert out_size == (2, 1)
        assert x_out.size(1) == out_size[0] * out_size[1]

        # test different kernel_size with different stride
        input_size = (6, 5)
        kernel_size = (6, 2)
        stride = (6, 2)
        dilation = 1
        bias = False

        x = torch.rand(B, in_c, *input_size)
        patch_embed = PatchEmbed(
            in_channels=in_c,
            embed_dims=embed_dims,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            bias=bias)

        x_out, out_size = patch_embed(x)
        assert x_out.size() == (B, 3, 3)
        assert out_size == (1, 3)
        assert x_out.size(1) == out_size[0] * out_size[1]
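

def test_patch_embed_vit_style():
    # Illustrative sketch (not from the original suite), assuming the common
    # ViT setup of non-overlapping 16 x 16 patches: a 224 x 224 image becomes
    # a 14 x 14 grid of tokens.
    patch_embed = PatchEmbed(
        in_channels=3, embed_dims=8, kernel_size=16, stride=16)
    tokens, hw = patch_embed(torch.rand(2, 3, 224, 224))
    assert hw == (14, 14)
    assert tokens.shape == (2, 14 * 14, 8)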


def test_patch_merging():
    # test the model with int padding
    in_c = 3
    out_c = 4
    kernel_size = 3
    stride = 3
    padding = 1
    dilation = 1
    bias = False
    patch_merge = PatchMerging(
        in_channels=in_c,
        out_channels=out_c,
        kernel_size=kernel_size,
        stride=stride,
        padding=padding,
        dilation=dilation,
        bias=bias)
    B, L, C = 1, 100, 3
    input_size = (10, 10)
    x = torch.rand(B, L, C)
    x_out, out_size = patch_merge(x, input_size)
    assert x_out.size() == (1, 16, 4)
    assert out_size == (4, 4)
    # check that out_size is consistent with the real output
    assert x_out.size(1) == out_size[0] * out_size[1]

    # test int padding combined with dilation
    in_c = 4
    out_c = 5
    kernel_size = 6
    stride = 3
    padding = 2
    dilation = 2
    bias = False
    patch_merge = PatchMerging(
        in_channels=in_c,
        out_channels=out_c,
        kernel_size=kernel_size,
        stride=stride,
        padding=padding,
        dilation=dilation,
        bias=bias)
    B, L, C = 1, 100, 4
    input_size = (10, 10)
    x = torch.rand(B, L, C)
    x_out, out_size = patch_merge(x, input_size)
    assert x_out.size() == (1, 4, 5)
    assert out_size == (2, 2)
    # check that out_size is consistent with the real output
    assert x_out.size(1) == out_size[0] * out_size[1]
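
    # PatchMerging treats the (B, L, C) tokens as an input_size[0] x
    # input_size[1] grid, gathers kernel_size x kernel_size neighbourhoods
    # with nn.Unfold and projects them to out_channels with a linear layer
    # (a hedged description of the mmcv-style implementation). The case
    # above: effective kernel 2 * (6 - 1) + 1 = 11, so
    # out = (10 + 2 * 2 - 11) // 3 + 1 = 2 per dim, i.e. L = 4.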

    # Test with adaptive padding
    for padding in ('same', 'corner'):
        in_c = 2
        out_c = 3
        B = 2

        # test stride is 1
        input_size = (5, 5)
        kernel_size = (5, 5)
        stride = (1, 1)
        dilation = 1
        bias = False
        L = input_size[0] * input_size[1]

        x = torch.rand(B, L, in_c)
        patch_merge = PatchMerging(
            in_channels=in_c,
            out_channels=out_c,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            bias=bias)

        x_out, out_size = patch_merge(x, input_size)
        assert x_out.size() == (B, 25, 3)
        assert out_size == (5, 5)
        assert x_out.size(1) == out_size[0] * out_size[1]

        # test kernel_size == stride
        input_size = (5, 5)
        kernel_size = (5, 5)
        stride = (5, 5)
        dilation = 1
        bias = False
        L = input_size[0] * input_size[1]

        x = torch.rand(B, L, in_c)
        patch_merge = PatchMerging(
            in_channels=in_c,
            out_channels=out_c,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            bias=bias)

        x_out, out_size = patch_merge(x, input_size)
        assert x_out.size() == (B, 1, 3)
        assert out_size == (1, 1)
        assert x_out.size(1) == out_size[0] * out_size[1]

        # test kernel_size == stride on an input that needs padding
        input_size = (6, 5)
        kernel_size = (5, 5)
        stride = (5, 5)
        dilation = 1
        bias = False
        L = input_size[0] * input_size[1]

        x = torch.rand(B, L, in_c)
        patch_merge = PatchMerging(
            in_channels=in_c,
            out_channels=out_c,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            bias=bias)

        x_out, out_size = patch_merge(x, input_size)
        assert x_out.size() == (B, 2, 3)
        assert out_size == (2, 1)
        assert x_out.size(1) == out_size[0] * out_size[1]

        # test different kernel_size with different stride
        input_size = (6, 5)
        kernel_size = (6, 2)
        stride = (6, 2)
        dilation = 1
        bias = False
        L = input_size[0] * input_size[1]

        x = torch.rand(B, L, in_c)
        patch_merge = PatchMerging(
            in_channels=in_c,
            out_channels=out_c,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            bias=bias)

        x_out, out_size = patch_merge(x, input_size)
        assert x_out.size() == (B, 3, 3)
        assert out_size == (1, 3)
        assert x_out.size(1) == out_size[0] * out_size[1]
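

def test_patch_merging_swin_style():
    # Illustrative sketch (not from the original suite), assuming the
    # Swin-style setting where a 2 x 2 merge halves the resolution while the
    # linear projection changes the channel width.
    patch_merge = PatchMerging(
        in_channels=4, out_channels=8, kernel_size=2, stride=2)
    x = torch.rand(2, 8 * 8, 4)
    x_out, out_size = patch_merge(x, (8, 8))
    assert out_size == (4, 4)
    assert x_out.shape == (2, 16, 8)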


def test_detr_transformer_encoder_decoder():
    config = ConfigDict(
        num_layers=6,
        layer_cfg=dict(  # DetrTransformerDecoderLayer
            self_attn_cfg=dict(  # MultiheadAttention
                embed_dims=256,
                num_heads=8,
                dropout=0.1),
            cross_attn_cfg=dict(  # MultiheadAttention
                embed_dims=256,
                num_heads=8,
                dropout=0.1),
            ffn_cfg=dict(
                embed_dims=256,
                feedforward_channels=2048,
                num_fcs=2,
                ffn_drop=0.1,
                act_cfg=dict(type='ReLU', inplace=True))))
    # the decoder stacks `num_layers` copies of `layer_cfg`
    assert len(DetrTransformerDecoder(**config).layers) == 6
    # construction alone should succeed (smoke test)
    assert DetrTransformerDecoder(**config)

    # the encoder layer config is the same, minus the cross-attention branch
    config = ConfigDict(
        num_layers=6,
        layer_cfg=dict(  # DetrTransformerEncoderLayer
            self_attn_cfg=dict(  # MultiheadAttention
                embed_dims=256,
                num_heads=8,
                dropout=0.1),
            ffn_cfg=dict(
                embed_dims=256,
                feedforward_channels=2048,
                num_fcs=2,
                ffn_drop=0.1,
                act_cfg=dict(type='ReLU', inplace=True))))
    assert len(DetrTransformerEncoder(**config).layers) == 6
    assert DetrTransformerEncoder(**config)