init rtdetr

lyuwenyu committed Apr 18, 2023
1 parent 4ce6d0d commit df1c3e5
Showing 4 changed files with 7 additions and 7 deletions.
2 changes: 1 addition & 1 deletion configs/rtdetr/README.md
@@ -1,7 +1,7 @@
 # DETRs Beat YOLOs on Real-time Object Detection

 ## Introduction
-We propose a **R**eal-**T**ime **DE**tection **TR**ansformer (RT-DETR), the first real-time end-to-end object detector to the best of our knowledge. Specifically, we design an efficient hybrid encoder that processes multi-scale features by decoupling the intra-scale interaction and cross-scale fusion, and propose IoU-aware query selection to improve the initialization of object queries. In addition, our proposed detector supports flexible adjustment of the inference speed by using different decoder layers without the need for retraining, which facilitates the practical application of real-time object detectors. Our RT-DETR-L achieves 53.0% AP on COCO val2017 and 114 FPS on a T4 GPU, while RT-DETR-X achieves 54.8% AP and 74 FPS, outperforming all YOLO detectors of the same scale in both speed and accuracy. Furthermore, our RT-DETR-R50 achieves 53.1% AP and 108 FPS, outperforming DINO-Deformable-DETR-R50 by 2.2% AP in accuracy and by about 21 times in FPS.
+We propose a **R**eal-**T**ime **DE**tection **TR**ansformer (RT-DETR), the first real-time end-to-end object detector to the best of our knowledge. Specifically, we design an efficient hybrid encoder that processes multi-scale features by decoupling the intra-scale interaction and cross-scale fusion, and propose IoU-aware query selection to improve the initialization of object queries. In addition, our proposed detector supports flexible adjustment of the inference speed by using different decoder layers without the need for retraining, which facilitates the practical application of real-time object detectors. Our RT-DETR-L achieves 53.0% AP on COCO val2017 and 114 FPS on a T4 GPU, while RT-DETR-X achieves 54.8% AP and 74 FPS, outperforming all YOLO detectors of the same scale in both speed and accuracy. Furthermore, our RT-DETR-R50 achieves 53.1% AP and 108 FPS, outperforming DINO-Deformable-DETR-R50 by 2.2% AP in accuracy and by about 21 times in FPS. For more details, please refer to our [paper](https://arxiv.org/abs/2304.08069).

 <div align="center">
 <img src="https://user-images.githubusercontent.com/17582080/232390925-54e58fe6-1c17-4610-90b9-7e5525577d80.png" width=500 />
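The README's claim that inference speed can be adjusted "by using different decoder layers without the need for retraining" corresponds to the `eval_idx` argument visible in the transformer diff further down. A minimal sketch of the idea, assuming a generic Paddle decoder stack — the names and structure here are illustrative, not the repo's actual code:

```python
# Illustrative sketch only, not RT-DETR's actual implementation: a decoder
# stack that can stop early at inference time. `eval_idx` picks the last
# decoder layer whose output is used, so speed is tunable after training.
import paddle.nn as nn


class TruncatableDecoder(nn.Layer):
    def __init__(self, make_layer, num_layers=6, eval_idx=-1):
        super().__init__()
        self.layers = nn.LayerList([make_layer() for _ in range(num_layers)])
        # Normalize negative indices: eval_idx=-1 keeps all layers.
        self.eval_idx = eval_idx if eval_idx >= 0 else num_layers + eval_idx

    def forward(self, queries, memory):
        out = queries
        for i, layer in enumerate(self.layers):
            out = layer(out, memory)
            # During training every layer runs (for deep supervision);
            # at inference we can exit early without retraining.
            if not self.training and i == self.eval_idx:
                break
        return out
```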
2 changes: 1 addition & 1 deletion configs/rtdetr/_base_/rtdetr_r50vd.yml
@@ -13,7 +13,7 @@ eval_size: [640, 640]
 DETR:
   backbone: ResNet
   neck: HybridEncoder
-  transformer: PPDETRTransformer
+  transformer: RTDETRTransformer
   detr_head: DINOHead
   post_process: DETRPostProcess

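The YAML above wires the DETR architecture together by class name; PaddleDetection resolves those names through a registry (note the `@register` decorator in the transformer file below). A rough, self-contained sketch of that pattern — the real machinery lives in `ppdet.core.workspace` and differs in detail:

```python
# Simplified sketch of a name-based component registry, so a config line like
# `transformer: RTDETRTransformer` can be resolved to a class. Illustrative
# only; PaddleDetection's actual registry is considerably richer.
_REGISTRY = {}


def register(cls):
    """Make a class addressable by its name from YAML configs."""
    _REGISTRY[cls.__name__] = cls
    return cls


def create(name, **kwargs):
    """Instantiate a registered component from its config name."""
    return _REGISTRY[name](**kwargs)


@register
class RTDETRTransformer:  # stand-in for the real transformer
    def __init__(self, hidden_dim=256, num_levels=3):
        self.hidden_dim = hidden_dim
        self.num_levels = num_levels


# `transformer: RTDETRTransformer` in the YAML resolves roughly like this:
transformer = create("RTDETRTransformer", hidden_dim=256)
```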
4 changes: 2 additions & 2 deletions ppdet/modeling/transformers/__init__.py
@@ -20,7 +20,7 @@
 from . import dino_transformer
 from . import group_detr_transformer
 from . import mask_dino_transformer
-from . import ppdetr_transformer
+from . import rtdetr_transformer
 from . import hybrid_encoder

 from .detr_transformer import *
@@ -32,5 +32,5 @@
 from .petr_transformer import *
 from .group_detr_transformer import *
 from .mask_dino_transformer import *
-from .ppdetr_transformer import *
+from .rtdetr_transformer import *
 from .hybrid_encoder import *
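Nothing in this commit keeps the old name importable, so any downstream code referencing `PPDETRTransformer` breaks. One hypothetical way to soften such a rename (not done here) is a PEP 562 module-level alias in the package `__init__.py`:

```python
# Hypothetical backward-compatibility shim, NOT part of this commit:
# re-export the renamed class under its old name with a warning.
import warnings

from .rtdetr_transformer import RTDETRTransformer


def __getattr__(name):  # PEP 562 module-level attribute hook (Python 3.7+)
    if name == "PPDETRTransformer":
        warnings.warn(
            "PPDETRTransformer has been renamed to RTDETRTransformer",
            DeprecationWarning,
            stacklevel=2,
        )
        return RTDETRTransformer
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
```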
6 changes: 3 additions & 3 deletions ppdet/modeling/transformers/{ppdetr_transformer.py → rtdetr_transformer.py}
@@ -36,7 +36,7 @@
 from .utils import (_get_clones, get_sine_pos_embed,
                     get_contrastive_denoising_training_group, inverse_sigmoid)

-__all__ = ['PPDETRTransformer']
+__all__ = ['RTDETRTransformer']


 class PPMSDeformableAttention(MSDeformableAttention):
@@ -254,7 +254,7 @@ def forward(self,


 @register
-class PPDETRTransformer(nn.Layer):
+class RTDETRTransformer(nn.Layer):
     __shared__ = ['num_classes', 'hidden_dim', 'eval_size']

     def __init__(self,
@@ -278,7 +278,7 @@ def __init__(self,
                  eval_size=None,
                  eval_idx=-1,
                  eps=1e-2):
-        super(PPDETRTransformer, self).__init__()
+        super(RTDETRTransformer, self).__init__()
         assert position_embed_type in ['sine', 'learned'], \
             f'ValueError: position_embed_type not supported {position_embed_type}!'
         assert len(backbone_feat_channels) <= num_levels
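The first hunk of this file imports `get_sine_pos_embed` from `.utils`, used when `position_embed_type` is `'sine'`. For reference, a minimal sketch of the standard DETR-style sine/cosine positional embedding that such a helper typically computes — the exact signature and channel layout of ppdet's helper may differ:

```python
# Minimal sketch of a DETR-style sine positional embedding; the real
# `get_sine_pos_embed` in ppdet may differ in signature and channel order.
import math

import paddle


def sine_pos_embed(pos, num_pos_feats=128, temperature=10000.0):
    """pos: float tensor of normalized coordinates, shape [N] -> [N, num_pos_feats]."""
    scale = 2.0 * math.pi
    dim_t = paddle.arange(num_pos_feats, dtype="float32")
    # Paired channels (sin, cos) share the same frequency.
    dim_t = temperature ** (2.0 * paddle.floor(dim_t / 2.0) / num_pos_feats)
    out = pos.unsqueeze(-1) * scale / dim_t  # [N, num_pos_feats]
    # sin on even channels, cos on odd channels, interleaved back together.
    return paddle.stack(
        (out[..., 0::2].sin(), out[..., 1::2].cos()), axis=-1
    ).flatten(start_axis=-2)
```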
