# mask2former_r50_8xb2-lsj-50e_coco-panoptic.py

_base_ = [
    '../_base_/datasets/coco_panoptic.py', '../_base_/default_runtime.py'
]
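# This file inherits the COCO panoptic dataset and default runtime settings
# from the `_base_` configs above; everything below overrides or extends them.
# A minimal usage sketch (assuming the standard MMEngine entry points; the
# config path and work_dir are illustrative) -- shown commented out, since
# running it inside the config itself would recurse:
#
#   from mmengine.config import Config
#   from mmengine.runner import Runner
#
#   cfg = Config.fromfile(
#       'configs/mask2former/mask2former_r50_8xb2-lsj-50e_coco-panoptic.py')
#   cfg.work_dir = './work_dirs/mask2former_r50_coco-panoptic'
#   runner = Runner.from_cfg(cfg)
#   runner.train()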
image_size = (1024, 1024)
batch_augments = [
    dict(
        type='BatchFixedSizePad',
        size=image_size,
        img_pad_value=0,
        pad_mask=True,
        mask_pad_value=0,
        pad_seg=True,
        seg_pad_value=255)
]
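# `BatchFixedSizePad` pads every sample in a batch to a fixed 1024x1024 after
# the pipeline's resize/crop, so LSJ-augmented images of different sizes can
# be stacked. Images and instance masks are padded with 0; the semantic map is
# padded with 255, the conventional "ignore" index for segmentation losses.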
data_preprocessor = dict(
    type='DetDataPreprocessor',
    mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    bgr_to_rgb=True,
    pad_size_divisor=32,
    pad_mask=True,
    mask_pad_value=0,
    pad_seg=True,
    seg_pad_value=255,
    batch_augments=batch_augments)
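# The mean/std are the standard ImageNet statistics (in RGB order, matching
# bgr_to_rgb=True), and pad_size_divisor=32 keeps spatial sizes divisible by
# the deepest backbone stride (32) listed in `strides` below.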
num_things_classes = 80
num_stuff_classes = 53
num_classes = num_things_classes + num_stuff_classes
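# 80 "thing" + 53 "stuff" categories = 133 classes in COCO panoptic.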
model = dict(
    type='Mask2Former',
    data_preprocessor=data_preprocessor,
    backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=-1,
        norm_cfg=dict(type='BN', requires_grad=False),
        norm_eval=True,
        style='pytorch',
        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
    panoptic_head=dict(
        type='Mask2FormerHead',
        in_channels=[256, 512, 1024, 2048],  # pass to pixel_decoder inside
        strides=[4, 8, 16, 32],
        feat_channels=256,
        out_channels=256,
        num_things_classes=num_things_classes,
        num_stuff_classes=num_stuff_classes,
        num_queries=100,
        num_transformer_feat_level=3,
        pixel_decoder=dict(
            type='MSDeformAttnPixelDecoder',
            num_outs=3,
            norm_cfg=dict(type='GN', num_groups=32),
            act_cfg=dict(type='ReLU'),
            encoder=dict(  # DeformableDetrTransformerEncoder
                num_layers=6,
                layer_cfg=dict(  # DeformableDetrTransformerEncoderLayer
                    self_attn_cfg=dict(  # MultiScaleDeformableAttention
                        embed_dims=256,
                        num_heads=8,
                        num_levels=3,
                        num_points=4,
                        dropout=0.0,
                        batch_first=True),
                    ffn_cfg=dict(
                        embed_dims=256,
                        feedforward_channels=1024,
                        num_fcs=2,
                        ffn_drop=0.0,
                        act_cfg=dict(type='ReLU', inplace=True)))),
            positional_encoding=dict(num_feats=128, normalize=True)),
        enforce_decoder_input_project=False,
        positional_encoding=dict(num_feats=128, normalize=True),
        transformer_decoder=dict(  # Mask2FormerTransformerDecoder
            return_intermediate=True,
            num_layers=9,
            layer_cfg=dict(  # Mask2FormerTransformerDecoderLayer
                self_attn_cfg=dict(  # MultiheadAttention
                    embed_dims=256,
                    num_heads=8,
                    dropout=0.0,
                    batch_first=True),
                cross_attn_cfg=dict(  # MultiheadAttention
                    embed_dims=256,
                    num_heads=8,
                    dropout=0.0,
                    batch_first=True),
                ffn_cfg=dict(
                    embed_dims=256,
                    feedforward_channels=2048,
                    num_fcs=2,
                    ffn_drop=0.0,
                    act_cfg=dict(type='ReLU', inplace=True))),
            init_cfg=None),
        loss_cls=dict(
            type='CrossEntropyLoss',
            use_sigmoid=False,
            loss_weight=2.0,
            reduction='mean',
            class_weight=[1.0] * num_classes + [0.1]),
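        # The head appends one extra "no object" logit, so `class_weight` has
        # num_classes + 1 = 134 entries; the trailing 0.1 down-weights it.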
        loss_mask=dict(
            type='CrossEntropyLoss',
            use_sigmoid=True,
            reduction='mean',
            loss_weight=5.0),
        loss_dice=dict(
            type='DiceLoss',
            use_sigmoid=True,
            activate=True,
            reduction='mean',
            naive_dice=True,
            eps=1.0,
            loss_weight=5.0)),
    panoptic_fusion_head=dict(
        type='MaskFormerFusionHead',
        num_things_classes=num_things_classes,
        num_stuff_classes=num_stuff_classes,
        loss_panoptic=None,
        init_cfg=None),
    train_cfg=dict(
        num_points=12544,
        oversample_ratio=3.0,
        importance_sample_ratio=0.75,
        assigner=dict(
            type='HungarianAssigner',
            match_costs=[
                dict(type='ClassificationCost', weight=2.0),
                dict(
                    type='CrossEntropyLossCost', weight=5.0, use_sigmoid=True),
                dict(type='DiceCost', weight=5.0, pred_act=True, eps=1.0)
            ]),
        sampler=dict(type='MaskPseudoSampler')),
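    # Mask losses are computed on num_points = 12544 = 112 * 112 sampled
    # points per mask (PointRend-style importance sampling, as in the
    # Mask2Former paper), and the Hungarian matching costs (2.0 / 5.0 / 5.0)
    # mirror the weights of loss_cls, loss_mask and loss_dice above.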
    test_cfg=dict(
        panoptic_on=True,
        # For now, the dataset does not support evaluating the semantic
        # segmentation metric.
        semantic_on=False,
        instance_on=True,
        # max_per_image only applies to instance segmentation.
        max_per_image=100,
        iou_thr=0.8,
        # In Mask2Former's panoptic post-processing, mask regions whose score
        # is below 0.5 are filtered out.
        filter_low_score=True),
    init_cfg=None)
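# At a high level the model is: ResNet-50 backbone -> MSDeformAttn pixel
# decoder (multi-scale features) -> 9-layer masked-attention transformer
# decoder over 100 queries -> fusion head that merges the per-query class and
# mask predictions into panoptic / instance results.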
# dataset settings
data_root = 'data/coco/'
train_pipeline = [
    dict(
        type='LoadImageFromFile',
        to_float32=True,
        backend_args={{_base_.backend_args}}),
    dict(
        type='LoadPanopticAnnotations',
        with_bbox=True,
        with_mask=True,
        with_seg=True,
        backend_args={{_base_.backend_args}}),
    dict(type='RandomFlip', prob=0.5),
    # large scale jittering
    dict(
        type='RandomResize',
        scale=image_size,
        ratio_range=(0.1, 2.0),
        keep_ratio=True),
    dict(
        type='RandomCrop',
        crop_size=image_size,
        crop_type='absolute',
        recompute_bbox=True,
        allow_negative_crop=True),
    dict(type='PackDetInputs')
]
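# Large-scale jittering (LSJ): each image is resized to a random scale in
# 0.1x-2.0x of 1024 (roughly 102 to 2048 px on the long side) and then cropped
# back to a fixed 1024x1024 window, so objects appear at widely varying scales.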
train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
val_evaluator = [
    dict(
        type='CocoPanopticMetric',
        ann_file=data_root + 'annotations/panoptic_val2017.json',
        seg_prefix=data_root + 'annotations/panoptic_val2017/',
        backend_args={{_base_.backend_args}}),
    dict(
        type='CocoMetric',
        ann_file=data_root + 'annotations/instances_val2017.json',
        metric=['bbox', 'segm'],
        backend_args={{_base_.backend_args}})
]
test_evaluator = val_evaluator
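# Two evaluators run on val2017: `CocoPanopticMetric` reports panoptic quality
# (PQ) against the panoptic annotations, while `CocoMetric` reports standard
# COCO bbox and segm mAP against the instance annotations.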
# optimizer
embed_multi = dict(lr_mult=1.0, decay_mult=0.0)
optim_wrapper = dict(
    type='OptimWrapper',
    optimizer=dict(
        type='AdamW',
        lr=0.0001,
        weight_decay=0.05,
        eps=1e-8,
        betas=(0.9, 0.999)),
    paramwise_cfg=dict(
        custom_keys={
            'backbone': dict(lr_mult=0.1, decay_mult=1.0),
            'query_embed': embed_multi,
            'query_feat': embed_multi,
            'level_embed': embed_multi,
        },
        norm_decay_mult=0.0),
    clip_grad=dict(max_norm=0.01, norm_type=2))
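# Per-parameter rules: the pretrained backbone trains at 0.1 x lr
# (1e-4 -> 1e-5), while the query/level embeddings and all norm layers are
# exempt from weight decay (decay_mult=0.0 / norm_decay_mult=0.0). Gradients
# are clipped to an L2 norm of 0.01.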
# learning policy
max_iters = 368750
param_scheduler = dict(
    type='MultiStepLR',
    begin=0,
    end=max_iters,
    by_epoch=False,
    milestones=[327778, 355092],
    gamma=0.1)
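# With a total batch size of 16 and ~118k COCO train2017 images, one epoch is
# about 7,393 iterations, so 368,750 iterations correspond to the 50 epochs in
# the file name; the LR decays by 10x at roughly 0.89 and 0.96 of training.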
# Before iteration 365001 we evaluate every 5000 iterations; from then on the
# interval becomes 368750 iterations, so the final evaluation happens exactly
# at the end of training.
interval = 5000
dynamic_intervals = [(max_iters // interval * interval + 1, max_iters)]
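# Worked out: max_iters // interval * interval + 1
#   = 368750 // 5000 * 5000 + 1 = 73 * 5000 + 1 = 365001,
# i.e. the switch point quoted in the comment above.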
train_cfg = dict(
    type='IterBasedTrainLoop',
    max_iters=max_iters,
    val_interval=interval,
    dynamic_intervals=dynamic_intervals)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')
default_hooks = dict(
    checkpoint=dict(
        type='CheckpointHook',
        by_epoch=False,
        save_last=True,
        max_keep_ckpts=3,
        interval=interval))
log_processor = dict(type='LogProcessor', window_size=50, by_epoch=False)
# Default setting for scaling the LR automatically:
# - `enable`: whether to scale the LR automatically by default.
# - `base_batch_size` = (8 GPUs) x (2 samples per GPU).
auto_scale_lr = dict(enable=False, base_batch_size=16)
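# A sketch of how training is typically launched (assuming the stock
# MMDetection tools/ scripts; flag and script names are the standard ones):
#
#   # single GPU
#   python tools/train.py \
#       configs/mask2former/mask2former_r50_8xb2-lsj-50e_coco-panoptic.py
#
#   # 8 GPUs, matching base_batch_size = 8 x 2; add --auto-scale-lr to scale
#   # the LR linearly if your effective batch size differs from 16
#   bash tools/dist_train.sh \
#       configs/mask2former/mask2former_r50_8xb2-lsj-50e_coco-panoptic.py 8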