@inproceedings{xiao2018simple,
  author    = {Xiao, Bin and Wu, Haiping and Wei, Yichen},
  title     = {Simple Baselines for Human Pose Estimation and Tracking},
  booktitle = {Proceedings of the European Conference on Computer Vision ({ECCV})},
  pages     = {466--481},
  year      = {2018},
}
@inproceedings{liu2021swin,
  author    = {Liu, Ze and Lin, Yutong and Cao, Yue and Hu, Han and Wei, Yixuan and Zhang, Zheng and Lin, Stephen and Guo, Baining},
  title     = {{Swin} {Transformer}: Hierarchical Vision Transformer using Shifted Windows},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision},
  pages     = {10012--10022},
  year      = {2021},
}
@inproceedings{lin2017feature,
  author    = {Lin, Tsung-Yi and Doll{\'a}r, Piotr and Girshick, Ross and He, Kaiming and Hariharan, Bharath and Belongie, Serge},
  title     = {Feature Pyramid Networks for Object Detection},
  booktitle = {Proceedings of the {IEEE} Conference on Computer Vision and Pattern Recognition},
  pages     = {2117--2125},
  year      = {2017},
}
@inproceedings{lin2014microsoft,
  author    = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  title     = {{Microsoft} {COCO}: Common Objects in Context},
  booktitle = {European Conference on Computer Vision},
  pages     = {740--755},
  year      = {2014},
  publisher = {Springer},
}
Results on COCO val2017, using a person detector with a human AP of 56.4 on the COCO val2017 dataset.
Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
---|---|---|---|---|---|---|---|---|
pose_swin_t | 256x192 | 0.724 | 0.901 | 0.806 | 0.782 | 0.940 | ckpt | log |
pose_swin_b | 256x192 | 0.737 | 0.904 | 0.820 | 0.794 | 0.942 | ckpt | log |
pose_swin_b | 384x288 | 0.759 | 0.910 | 0.832 | 0.811 | 0.946 | ckpt | log |
pose_swin_l | 256x192 | 0.743 | 0.906 | 0.821 | 0.798 | 0.943 | ckpt | log |
pose_swin_l | 384x288 | 0.763 | 0.912 | 0.830 | 0.814 | 0.949 | ckpt | log |