From df1c3e5f12d0e93675448f56911b93048e5de150 Mon Sep 17 00:00:00 2001 From: lyuwenyu Date: Tue, 18 Apr 2023 13:10:51 +0800 Subject: [PATCH] init rtdetr --- configs/rtdetr/README.md | 2 +- configs/rtdetr/_base_/rtdetr_r50vd.yml | 2 +- ppdet/modeling/transformers/__init__.py | 4 ++-- .../{ppdetr_transformer.py => rtdetr_transformer.py} | 6 +++--- 4 files changed, 7 insertions(+), 7 deletions(-) rename ppdet/modeling/transformers/{ppdetr_transformer.py => rtdetr_transformer.py} (99%) diff --git a/configs/rtdetr/README.md b/configs/rtdetr/README.md index 46f47b87fd3..3a11e87bca3 100644 --- a/configs/rtdetr/README.md +++ b/configs/rtdetr/README.md @@ -1,7 +1,7 @@ # DETRs Beat YOLOs on Real-time Object Detection ## Introduction -We propose a **R**eal-**T**ime **DE**tection **TR**ansformer (RT-DETR), the first real-time end-to-end object detector to our best knowledge. Specifically, we design an efficient hybrid encoder to efficiently process multi-scale features by decoupling the intra-scale interaction and cross-scale fusion, and propose IoU-aware query selection to improve the initialization of object queries. In addition, our proposed detector supports flexibly adjustment of the inference speed by using different decoder layers without the need for retraining, which facilitates the practical application of real-time object detectors. Our RT-DETR-L achieves 53.0% AP on COCO val2017 and 114 FPS on T4 GPU, while RT-DETR-X achieves 54.8% AP and 74 FPS, outperforming all YOLO detectors of the same scale in both speed and accuracy. Furthermore, our RT-DETR-R50 achieves 53.1% AP and 108 FPS, outperforming DINO-Deformable-DETR-R50 by 2.2% AP in accuracy and by about 21 times in FPS. +We propose a **R**eal-**T**ime **DE**tection **TR**ansformer (RT-DETR), the first real-time end-to-end object detector to our best knowledge. 
Specifically, we design an efficient hybrid encoder to efficiently process multi-scale features by decoupling the intra-scale interaction and cross-scale fusion, and propose IoU-aware query selection to improve the initialization of object queries. In addition, our proposed detector supports flexible adjustment of the inference speed by using different decoder layers without the need for retraining, which facilitates the practical application of real-time object detectors. Our RT-DETR-L achieves 53.0% AP on COCO val2017 and 114 FPS on T4 GPU, while RT-DETR-X achieves 54.8% AP and 74 FPS, outperforming all YOLO detectors of the same scale in both speed and accuracy. Furthermore, our RT-DETR-R50 achieves 53.1% AP and 108 FPS, outperforming DINO-Deformable-DETR-R50 by 2.2% AP in accuracy and by about 21 times in FPS. For more details, please refer to our [paper](https://arxiv.org/abs/2304.08069).
diff --git a/configs/rtdetr/_base_/rtdetr_r50vd.yml b/configs/rtdetr/_base_/rtdetr_r50vd.yml index f2c7e1f3e31..e48c4cb9d94 100644 --- a/configs/rtdetr/_base_/rtdetr_r50vd.yml +++ b/configs/rtdetr/_base_/rtdetr_r50vd.yml @@ -13,7 +13,7 @@ eval_size: [640, 640] DETR: backbone: ResNet neck: HybridEncoder - transformer: PPDETRTransformer + transformer: RTDETRTransformer detr_head: DINOHead post_process: DETRPostProcess diff --git a/ppdet/modeling/transformers/__init__.py b/ppdet/modeling/transformers/__init__.py index 8815737da57..33a12402656 100644 --- a/ppdet/modeling/transformers/__init__.py +++ b/ppdet/modeling/transformers/__init__.py @@ -20,7 +20,7 @@ from . import dino_transformer from . import group_detr_transformer from . import mask_dino_transformer -from . import ppdetr_transformer +from . import rtdetr_transformer from . import hybrid_encoder from .detr_transformer import * @@ -32,5 +32,5 @@ from .petr_transformer import * from .group_detr_transformer import * from .mask_dino_transformer import * -from .ppdetr_transformer import * +from .rtdetr_transformer import * from .hybrid_encoder import * diff --git a/ppdet/modeling/transformers/ppdetr_transformer.py b/ppdet/modeling/transformers/rtdetr_transformer.py similarity index 99% rename from ppdet/modeling/transformers/ppdetr_transformer.py rename to ppdet/modeling/transformers/rtdetr_transformer.py index 98c9631d5c6..672590edfde 100644 --- a/ppdet/modeling/transformers/ppdetr_transformer.py +++ b/ppdet/modeling/transformers/rtdetr_transformer.py @@ -36,7 +36,7 @@ from .utils import (_get_clones, get_sine_pos_embed, get_contrastive_denoising_training_group, inverse_sigmoid) -__all__ = ['PPDETRTransformer'] +__all__ = ['RTDETRTransformer'] class PPMSDeformableAttention(MSDeformableAttention): @@ -254,7 +254,7 @@ def forward(self, @register -class PPDETRTransformer(nn.Layer): +class RTDETRTransformer(nn.Layer): __shared__ = ['num_classes', 'hidden_dim', 'eval_size'] def __init__(self, @@ -278,7 +278,7 @@ def 
__init__(self, eval_size=None, eval_idx=-1, eps=1e-2): - super(PPDETRTransformer, self).__init__() + super(RTDETRTransformer, self).__init__() assert position_embed_type in ['sine', 'learned'], \ f'ValueError: position_embed_type not supported {position_embed_type}!' assert len(backbone_feat_channels) <= num_levels