alibaba
diff --git a/‎README.md
Lines changed: 2 additions & 1 deletion b/‎README.md
Lines changed: 2 additions & 1 deletion
diff --git a/‎README_zh-CN.md
Lines changed: 1 addition & 0 deletions b/‎README_zh-CN.md
Lines changed: 1 addition & 0 deletions
diff --git a/‎benchmarks/tools/extract.py
Lines changed: 1 addition & 19 deletions b/‎benchmarks/tools/extract.py
Lines changed: 1 addition & 19 deletions
diff --git a/‎configs/detection/_base_/dataset/autoaug_coco_detection.py renamed to ‎configs/detection/common/dataset/autoaug_coco_detection.py
Lines changed: 42 additions & 32 deletions b/‎configs/detection/_base_/dataset/autoaug_coco_detection.py renamed to ‎configs/detection/common/dataset/autoaug_coco_detection.py
Lines changed: 42 additions & 32 deletions
diff --git a/‎configs/detection/dab_detr/dab_detr_r50_8x2_50e_coco.py
Lines changed: 1 addition & 1 deletion b/‎configs/detection/dab_detr/dab_detr_r50_8x2_50e_coco.py
Lines changed: 1 addition & 1 deletion
diff --git a/‎configs/detection/detr/detr_r50_8x2_150e_coco.py
Lines changed: 1 addition & 1 deletion b/‎configs/detection/detr/detr_r50_8x2_150e_coco.py
Lines changed: 1 addition & 1 deletion
diff --git a/‎configs/detection/dino/README.md
Lines changed: 36 additions & 0 deletions b/‎configs/detection/dino/README.md
Lines changed: 36 additions & 0 deletions
diff --git a/‎configs/detection/dino/dino_4sc_r50.py
Lines changed: 94 additions & 0 deletions b/‎configs/detection/dino/dino_4sc_r50.py
Lines changed: 94 additions & 0 deletions
diff --git a/‎configs/detection/dino/dino_4sc_r50_12e_coco.py
Lines changed: 4 additions & 0 deletions b/‎configs/detection/dino/dino_4sc_r50_12e_coco.py
Lines changed: 4 additions & 0 deletions
diff --git a/‎configs/detection/dino/dino_4sc_r50_24e_coco.py
Lines changed: 6 additions & 0 deletions b/‎configs/detection/dino/dino_4sc_r50_24e_coco.py
Lines changed: 6 additions & 0 deletions
diff --git a/‎configs/detection/dino/dino_4sc_r50_36e_coco.py
Lines changed: 6 additions & 0 deletions b/‎configs/detection/dino/dino_4sc_r50_36e_coco.py
Lines changed: 6 additions & 0 deletions
@@ -32,7 +32,7 @@ EasyCV is an all-in-one computer vision toolbox based on PyTorch, mainly focuses
 
 - **Vision Transformers**
 
-  EasyCV aims to provide an easy way to use the off-the-shelf SOTA transformer models trained either using supervised learning or self-supervised learning, such as ViT, Swin-Transformer and Shuffle Transformer. More models will be added in the future. In addition, we support all the pretrained models from [timm](https://github.com/rwightman/pytorch-image-models).
+  EasyCV aims to provide an easy way to use the off-the-shelf SOTA transformer models trained either using supervised learning or self-supervised learning, such as ViT, Swin Transformer and DETR Series. More models will be added in the future. In addition, we support all the pretrained models from [timm](https://github.com/rwightman/pytorch-image-models).
 
 - **Functionality & Extensibility**
 
@@ -144,6 +144,7 @@ notebook
           <li><a href="configs/detection/detr">DETR (ECCV'2020)</a></li>
           <li><a href="configs/detection/dab_detr">DAB-DETR (ICLR'2022)</a></li>
           <li><a href="configs/detection/dab_detr">DN-DETR (CVPR'2022)</a></li>
+          <li><a href="configs/detection/dino">DINO (ArXiv'2022)</a></li>
         </ul>
       </td>
       <td>
 
@@ -135,6 +135,7 @@ EasyCV是一个涵盖多个领域的基于Pytorch的计算机视觉工具箱，
           <li><a href="configs/detection/detr">DETR (ECCV'2020)</a></li>
           <li><a href="configs/detection/dab_detr">DAB-DETR (ICLR'2022)</a></li>
           <li><a href="configs/detection/dab_detr">DN-DETR (CVPR'2022)</a></li>
+          <li><a href="configs/detection/dino">DINO (ArXiv'2022)</a></li>
         </ul>
       </td>
       <td>
 
@@ -12,6 +12,7 @@
 from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
 from mmcv.runner import get_dist_info, init_dist, load_checkpoint
 
+from easycv.apis import set_random_seed
 from easycv.datasets import build_dataloader, build_dataset
 from easycv.file import io
 from easycv.models import build_model
@@ -20,25 +21,6 @@
 from easycv.utils.logger import get_root_logger
 
 
-def set_random_seed(seed, deterministic=True):
-    """Set random seed.
-
-    Args:
-        seed (int): Seed to be used.
-        deterministic (bool): Whether to set the deterministic option for
-            CUDNN backend, i.e., set `torch.backends.cudnn.deterministic`
-            to True and `torch.backends.cudnn.benchmark` to False.
-            Default: False.
-    """
-    random.seed(seed)
-    np.random.seed(seed)
-    torch.manual_seed(seed)
-    torch.cuda.manual_seed_all(seed)
-    if deterministic:
-        torch.backends.cudnn.deterministic = True
-        torch.backends.cudnn.benchmark = False
-
-
 class ExtractProcess(object):
 
     def __init__(self, extract_list=['neck']):
 
@@ -23,36 +23,41 @@
     dict(type='MMRandomFlip', flip_ratio=0.5),
     dict(
         type='MMAutoAugment',
-        policies=[[
-            dict(
-                type='MMResize',
-                img_scale=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
-                           (608, 1333), (640, 1333), (672, 1333), (704, 1333),
-                           (736, 1333), (768, 1333), (800, 1333)],
-                multiscale_mode='value',
-                keep_ratio=True)
-        ],
-                  [
-                      dict(
-                          type='MMResize',
-                          img_scale=[(400, 1333), (500, 1333), (600, 1333)],
-                          multiscale_mode='value',
-                          keep_ratio=True),
-                      dict(
-                          type='MMRandomCrop',
-                          crop_type='absolute_range',
-                          crop_size=(384, 600),
-                          allow_negative_crop=True),
-                      dict(
-                          type='MMResize',
-                          img_scale=[(480, 1333), (512, 1333), (544, 1333),
-                                     (576, 1333), (608, 1333), (640, 1333),
-                                     (672, 1333), (704, 1333), (736, 1333),
-                                     (768, 1333), (800, 1333)],
-                          multiscale_mode='value',
-                          override=True,
-                          keep_ratio=True)
-                  ]]),
+        policies=[
+            [
+                dict(
+                    type='MMResize',
+                    img_scale=[(480, 1333), (512, 1333), (544, 1333),
+                               (576, 1333), (608, 1333), (640, 1333),
+                               (672, 1333), (704, 1333), (736, 1333),
+                               (768, 1333), (800, 1333)],
+                    multiscale_mode='value',
+                    keep_ratio=True)
+            ],
+            [
+                dict(
+                    type='MMResize',
+                    # The aspect ratio of all train images is < 7, so the
+                    # 4200 long-side cap follows the original implementation.
+                    img_scale=[(400, 4200), (500, 4200), (600, 4200)],
+                    multiscale_mode='value',
+                    keep_ratio=True),
+                dict(
+                    type='MMRandomCrop',
+                    crop_type='absolute_range',
+                    crop_size=(384, 600),
+                    allow_negative_crop=True),
+                dict(
+                    type='MMResize',
+                    img_scale=[(480, 1333), (512, 1333), (544, 1333),
+                               (576, 1333), (608, 1333), (640, 1333),
+                               (672, 1333), (704, 1333), (736, 1333),
+                               (768, 1333), (800, 1333)],
+                    multiscale_mode='value',
+                    override=True,
+                    keep_ratio=True)
+            ]
+        ]),
     dict(type='MMNormalize', **img_norm_cfg),
     dict(type='MMPad', size_divisor=1),
     dict(type='DefaultFormatBundle'),
@@ -96,7 +101,7 @@
         ],
         classes=CLASSES,
         test_mode=False,
-        filter_empty_gt=True,
+        filter_empty_gt=False,
         iscrowd=False),
     pipeline=train_pipeline)
 
@@ -118,13 +123,18 @@
     pipeline=test_pipeline)
 
 data = dict(
-    imgs_per_gpu=2, workers_per_gpu=2, train=train_dataset, val=val_dataset)
+    imgs_per_gpu=2,
+    workers_per_gpu=2,
+    train=train_dataset,
+    val=val_dataset,
+    drop_last=True)
 
 # evaluation
 eval_config = dict(interval=1, gpu_collect=False)
 eval_pipelines = [
     dict(
         mode='test',
+        dist_eval=True,
         evaluators=[
             dict(type='CocoDetectionEvaluator', classes=CLASSES),
         ],
 
@@ -1,5 +1,5 @@
 _base_ = [
-    './dab_detr.py', '../_base_/dataset/autoaug_coco_detection.py',
+    './dab_detr.py', '../common/dataset/autoaug_coco_detection.py',
     'configs/base.py'
 ]
 
 
@@ -1,5 +1,5 @@
 _base_ = [
-    './detr.py', '../_base_/dataset/autoaug_coco_detection.py',
+    './detr.py', '../common/dataset/autoaug_coco_detection.py',
     'configs/base.py'
 ]
 
 
@@ -0,0 +1,36 @@
+# DINO
+
+> [DINO: DETR with Improved DeNoising Anchor Boxes for End-to-End Object Detection](https://arxiv.org/abs/2203.03605)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+We present DINO (DETR with Improved deNoising anchOr boxes), a state-of-the-art end-to-end object detector. DINO improves over previous DETR-like models in performance and efficiency by using a contrastive way for denoising training, a mixed query selection method for anchor initialization, and a look forward twice scheme for box prediction. DINO achieves 49.4AP in 12 epochs and 51.3AP in 24 epochs on COCO with a ResNet-50 backbone and multi-scale features, yielding a significant improvement of +6.0AP and +2.7AP, respectively, compared to DN-DETR, the previous best DETR-like model. DINO scales well in both model size and data size. Without bells and whistles, after pre-training on the Objects365 dataset with a SwinL backbone, DINO obtains the best results on both COCO val2017 (63.2AP) and test-dev (63.3AP). Compared to other models on the leaderboard, DINO significantly reduces its model size and pre-training data size while achieving better results.
+
+<div align=center>
+<img src="https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/algo_images/detection/DINO.png"/>
+</div>
+
+## Results and Models
+
+| Algorithm  | Config                                                       | Params<br/>(backbone/total)                            | inference time(V100)<br/>(ms/img)                      | bbox_mAP<sup>val<br/><sub>0.5:0.95</sub> | AP<sup>val<br/><sub>50</sub> | Download                                                     |
+| ---------- | ------------------------------------------------------------ | ------------------------ | --------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ |
+| DINO_4sc_r50_12e    | [DINO_4sc_r50_12e](https://github.com/alibaba/EasyCV/tree/master/configs/detection/dino/dino_4sc_r50_12e_coco.py) | 23M/47M | 184ms |     48.71               |     66.27      | [model](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/dino/dino_4sc_r50_12e/epoch_12.pth) - [log](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/dino/dino_4sc_r50_12e/20220815_141403.log.json) |
+| DINO_4sc_r50_36e    | [DINO_4sc_r50_36e](https://github.com/alibaba/EasyCV/tree/master/configs/detection/dino/dino_4sc_r50_36e_coco.py) | 23M/47M | 184ms |        50.69            |     68.60      | [model](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/dino/dino_4sc_r50_36e/epoch_29.pth) - [log](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/dino/dino_4sc_r50_36e/20220817_101549.log.json) |
+| DINO_4sc_swinl_12e    | [DINO_4sc_swinl_12e](https://github.com/alibaba/EasyCV/tree/master/configs/detection/dino/dino_4sc_swinl_12e_coco.py) | 195M/217M | 155ms |        56.86            |     75.61      | [model](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/dino/dino_4sc_swinl_12e/epoch_12.pth) - [log](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/dino/dino_4sc_swinl_12e/20220815_211633.log.json) |
+| DINO_4sc_swinl_36e    | [DINO_4sc_swinl_36e](https://github.com/alibaba/EasyCV/tree/master/configs/detection/dino/dino_4sc_swinl_36e_coco.py) | 195M/217M | 155ms |          58.04          |     76.76      | [model](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/dino/dino_4sc_swinl_36e/epoch_34.pth) - [log](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/dino/dino_4sc_swinl_36e/20220817_101416.log.json) |
+| DINO_5sc_swinl_36e    | [DINO_5sc_swinl_36e](https://github.com/alibaba/EasyCV/tree/master/configs/detection/dino/dino_5sc_swinl_36e_coco.py) | 195M/217M | 235ms |           58.47         |     77.10      | [model](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/dino/dino_5sc_swinl_36e/epoch_35.pth) - [log](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/dino/dino_5sc_swinl_36e/20220820_215711.log.json) |
+
+## Citation
+
+```latex
+@misc{zhang2022dino,
+      title={DINO: DETR with Improved DeNoising Anchor Boxes for End-to-End Object Detection},
+      author={Hao Zhang and Feng Li and Shilong Liu and Lei Zhang and Hang Su and Jun Zhu and Lionel M. Ni and Heung-Yeung Shum},
+      year={2022},
+      eprint={2203.03605},
+      archivePrefix={arXiv},
+      primaryClass={cs.CV}
+}
+```
@@ -0,0 +1,94 @@
+# model settings
+model = dict(
+    type='Detection',
+    pretrained=True,
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(2, 3, 4),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        norm_eval=True,
+        style='pytorch'),
+    head=dict(
+        type='DINOHead',
+        transformer=dict(
+            type='DeformableTransformer',
+            d_model=256,
+            nhead=8,
+            num_queries=900,
+            num_encoder_layers=6,
+            num_unicoder_layers=0,
+            num_decoder_layers=6,
+            dim_feedforward=2048,
+            dropout=0.0,
+            activation='relu',
+            normalize_before=False,
+            return_intermediate_dec=True,
+            query_dim=4,
+            num_patterns=0,
+            modulate_hw_attn=True,
+            # for deformable encoder
+            deformable_encoder=True,
+            deformable_decoder=True,
+            num_feature_levels=4,
+            enc_n_points=4,
+            dec_n_points=4,
+            # init query
+            decoder_query_perturber=None,
+            add_channel_attention=False,
+            random_refpoints_xy=False,
+            # two stage
+            two_stage_type=
+            'standard',  # ['no', 'standard', 'early', 'combine', 'enceachlayer', 'enclayer1']
+            two_stage_pat_embed=0,
+            two_stage_add_query_num=0,
+            two_stage_learn_wh=False,
+            two_stage_keep_all_tokens=False,
+            # evo of #anchors
+            dec_layer_number=None,
+            rm_dec_query_scale=True,
+            rm_self_attn_layers=None,
+            key_aware_type=None,
+            # layer share
+            layer_share_type=None,
+            # for detach
+            rm_detach=None,
+            decoder_sa_type='sa',
+            module_seq=['sa', 'ca', 'ffn'],
+            # for dn
+            embed_init_tgt=True,
+            use_detached_boxes_dec_out=False),
+        dn_components=dict(
+            dn_number=100,
+            dn_label_noise_ratio=0.5,  # paper 0.5, release code 0.25
+            dn_box_noise_scale=1.0,
+            dn_labelbook_size=80,
+        ),
+        num_classes=80,
+        in_channels=[512, 1024, 2048],
+        embed_dims=256,
+        query_dim=4,
+        num_queries=900,
+        num_select=300,
+        random_refpoints_xy=False,
+        num_patterns=0,
+        fix_refpoints_hw=-1,
+        num_feature_levels=4,
+        # two stage
+        two_stage_type='standard',  # ['no', 'standard']
+        two_stage_add_query_num=0,
+        dec_pred_class_embed_share=True,
+        dec_pred_bbox_embed_share=True,
+        two_stage_class_embed_share=False,
+        two_stage_bbox_embed_share=False,
+        decoder_sa_type='sa',
+        temperatureH=20,
+        temperatureW=20,
+        cost_dict=dict(
+            cost_class=2,
+            cost_bbox=5,
+            cost_giou=2,
+        ),
+        weight_dict=dict(loss_ce=1, loss_bbox=5, loss_giou=2)))
@@ -0,0 +1,4 @@
+_base_ = [
+    './dino_4sc_r50.py', '../common/dataset/autoaug_coco_detection.py',
+    './dino_schedule_1x.py'
+]
@@ -0,0 +1,6 @@
+_base_ = './dino_4sc_r50_12e_coco.py'
+
+# learning policy
+lr_config = dict(policy='step', step=[22])
+
+total_epochs = 24
@@ -0,0 +1,6 @@
+_base_ = './dino_4sc_r50_12e_coco.py'
+
+# learning policy
+lr_config = dict(policy='step', step=[27, 33])
+
+total_epochs = 36
Original file line number	Diff line number	Diff line change
`@@ -1,5 +1,5 @@`
`1`	`1`	`_base_ = [`
`2`		`- './dab_detr.py', '../_base_/dataset/autoaug_coco_detection.py',`
	`2`	`+ './dab_detr.py', '../common/dataset/autoaug_coco_detection.py',`
`3`	`3`	`'configs/base.py'`
`4`	`4`	`]`
`5`	`5`
Original file line number	Diff line number	Diff line change
`@@ -1,5 +1,5 @@`
`1`	`1`	`_base_ = [`
`2`		`- './detr.py', '../_base_/dataset/autoaug_coco_detection.py',`
	`2`	`+ './detr.py', '../common/dataset/autoaug_coco_detection.py',`
`3`	`3`	`'configs/base.py'`
`4`	`4`	`]`
`5`	`5`