@inproceedings{wang2021pyramid,
  author    = {Wang, Wenhai and Xie, Enze and Li, Xiang and Fan, Deng-Ping and Song, Kaitao and Liang, Ding and Lu, Tong and Luo, Ping and Shao, Ling},
  title     = {{Pyramid Vision Transformer}: A Versatile Backbone for Dense Prediction without Convolutions},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision},
  pages     = {568--578},
  year      = {2021},
}
@article{wang2022pvt,
  author    = {Wang, Wenhai and Xie, Enze and Li, Xiang and Fan, Deng-Ping and Song, Kaitao and Liang, Ding and Lu, Tong and Luo, Ping and Shao, Ling},
  title     = {{PVT} v2: Improved Baselines with {Pyramid Vision Transformer}},
  journal   = {Computational Visual Media},
  volume    = {8},
  number    = {3},
  pages     = {415--424},
  year      = {2022},
  publisher = {Springer},
  doi       = {10.1007/s41095-022-0274-8},
}
@inproceedings{lin2014microsoft,
  author    = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  title     = {{Microsoft COCO}: Common Objects in Context},
  booktitle = {European Conference on Computer Vision},
  pages     = {740--755},
  year      = {2014},
  publisher = {Springer},
}
Results on COCO val2017, using a person detector with a human AP of 56.4 on the COCO val2017 dataset
Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
---|---|---|---|---|---|---|---|---|
pose_pvt-s | 256x192 | 0.714 | 0.896 | 0.794 | 0.773 | 0.936 | ckpt | log |
pose_pvtv2-b2 | 256x192 | 0.737 | 0.905 | 0.812 | 0.791 | 0.942 | ckpt | log |