Skip to content

Commit b198c5a

Browse files
support DINO algo (#144)
dino_4sc_r50_12e:48.71 dino_4sc_r50_24e:50.53 dino_4sc_r50_36e:50.69 dino_4sc_swinl_12e: 56.86 dino_4sc_swinl_36e: 58.04 dino_5sc_swinl_36e: 58.47
1 parent 38ae771 commit b198c5a

File tree

85 files changed

+3785
-636
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

85 files changed

+3785
-636
lines changed

README.md

+2-1
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ EasyCV is an all-in-one computer vision toolbox based on PyTorch, mainly focuses
3232

3333
- **Vision Transformers**
3434

35-
EasyCV aims to provide an easy way to use the off-the-shelf SOTA transformer models trained either using supervised learning or self-supervised learning, such as ViT, Swin-Transformer and Shuffle Transformer. More models will be added in the future. In addition, we support all the pretrained models from [timm](https://github.com/rwightman/pytorch-image-models).
35+
EasyCV aims to provide an easy way to use the off-the-shelf SOTA transformer models trained either using supervised learning or self-supervised learning, such as ViT, Swin Transformer and DETR Series. More models will be added in the future. In addition, we support all the pretrained models from [timm](https://github.com/rwightman/pytorch-image-models).
3636

3737
- **Functionality & Extensibility**
3838

@@ -144,6 +144,7 @@ notebook
144144
<li><a href="configs/detection/detr">DETR (ECCV'2020)</a></li>
145145
<li><a href="configs/detection/dab_detr">DAB-DETR (ICLR'2022)</a></li>
146146
<li><a href="configs/detection/dab_detr">DN-DETR (CVPR'2022)</a></li>
147+
<li><a href="configs/detection/dino">DINO (ArXiv'2022)</a></li>
147148
</ul>
148149
</td>
149150
<td>

README_zh-CN.md

+1
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,7 @@ EasyCV是一个涵盖多个领域的基于Pytorch的计算机视觉工具箱,
135135
<li><a href="configs/detection/detr">DETR (ECCV'2020)</a></li>
136136
<li><a href="configs/detection/dab_detr">DAB-DETR (ICLR'2022)</a></li>
137137
<li><a href="configs/detection/dab_detr">DN-DETR (CVPR'2022)</a></li>
138+
<li><a href="configs/detection/dino">DINO (ArXiv'2022)</a></li>
138139
</ul>
139140
</td>
140141
<td>

benchmarks/tools/extract.py

+1-19
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
1313
from mmcv.runner import get_dist_info, init_dist, load_checkpoint
1414

15+
from easycv.apis import set_random_seed
1516
from easycv.datasets import build_dataloader, build_dataset
1617
from easycv.file import io
1718
from easycv.models import build_model
@@ -20,25 +21,6 @@
2021
from easycv.utils.logger import get_root_logger
2122

2223

23-
def set_random_seed(seed, deterministic=True):
24-
"""Set random seed.
25-
26-
Args:
27-
seed (int): Seed to be used.
28-
deterministic (bool): Whether to set the deterministic option for
29-
CUDNN backend, i.e., set `torch.backends.cudnn.deterministic`
30-
to True and `torch.backends.cudnn.benchmark` to False.
31-
Default: False.
32-
"""
33-
random.seed(seed)
34-
np.random.seed(seed)
35-
torch.manual_seed(seed)
36-
torch.cuda.manual_seed_all(seed)
37-
if deterministic:
38-
torch.backends.cudnn.deterministic = True
39-
torch.backends.cudnn.benchmark = False
40-
41-
4224
class ExtractProcess(object):
4325

4426
def __init__(self, extract_list=['neck']):

configs/detection/_base_/dataset/autoaug_coco_detection.py configs/detection/common/dataset/autoaug_coco_detection.py

+42-32
Original file line numberDiff line numberDiff line change
@@ -23,36 +23,41 @@
2323
dict(type='MMRandomFlip', flip_ratio=0.5),
2424
dict(
2525
type='MMAutoAugment',
26-
policies=[[
27-
dict(
28-
type='MMResize',
29-
img_scale=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
30-
(608, 1333), (640, 1333), (672, 1333), (704, 1333),
31-
(736, 1333), (768, 1333), (800, 1333)],
32-
multiscale_mode='value',
33-
keep_ratio=True)
34-
],
35-
[
36-
dict(
37-
type='MMResize',
38-
img_scale=[(400, 1333), (500, 1333), (600, 1333)],
39-
multiscale_mode='value',
40-
keep_ratio=True),
41-
dict(
42-
type='MMRandomCrop',
43-
crop_type='absolute_range',
44-
crop_size=(384, 600),
45-
allow_negative_crop=True),
46-
dict(
47-
type='MMResize',
48-
img_scale=[(480, 1333), (512, 1333), (544, 1333),
49-
(576, 1333), (608, 1333), (640, 1333),
50-
(672, 1333), (704, 1333), (736, 1333),
51-
(768, 1333), (800, 1333)],
52-
multiscale_mode='value',
53-
override=True,
54-
keep_ratio=True)
55-
]]),
26+
policies=[
27+
[
28+
dict(
29+
type='MMResize',
30+
img_scale=[(480, 1333), (512, 1333), (544, 1333),
31+
(576, 1333), (608, 1333), (640, 1333),
32+
(672, 1333), (704, 1333), (736, 1333),
33+
(768, 1333), (800, 1333)],
34+
multiscale_mode='value',
35+
keep_ratio=True)
36+
],
37+
[
38+
dict(
39+
type='MMResize',
40+
# The aspect ratio of every image in the train dataset is < 7,
41+
# follow the original impl
42+
img_scale=[(400, 4200), (500, 4200), (600, 4200)],
43+
multiscale_mode='value',
44+
keep_ratio=True),
45+
dict(
46+
type='MMRandomCrop',
47+
crop_type='absolute_range',
48+
crop_size=(384, 600),
49+
allow_negative_crop=True),
50+
dict(
51+
type='MMResize',
52+
img_scale=[(480, 1333), (512, 1333), (544, 1333),
53+
(576, 1333), (608, 1333), (640, 1333),
54+
(672, 1333), (704, 1333), (736, 1333),
55+
(768, 1333), (800, 1333)],
56+
multiscale_mode='value',
57+
override=True,
58+
keep_ratio=True)
59+
]
60+
]),
5661
dict(type='MMNormalize', **img_norm_cfg),
5762
dict(type='MMPad', size_divisor=1),
5863
dict(type='DefaultFormatBundle'),
@@ -96,7 +101,7 @@
96101
],
97102
classes=CLASSES,
98103
test_mode=False,
99-
filter_empty_gt=True,
104+
filter_empty_gt=False,
100105
iscrowd=False),
101106
pipeline=train_pipeline)
102107

@@ -118,13 +123,18 @@
118123
pipeline=test_pipeline)
119124

120125
data = dict(
121-
imgs_per_gpu=2, workers_per_gpu=2, train=train_dataset, val=val_dataset)
126+
imgs_per_gpu=2,
127+
workers_per_gpu=2,
128+
train=train_dataset,
129+
val=val_dataset,
130+
drop_last=True)
122131

123132
# evaluation
124133
eval_config = dict(interval=1, gpu_collect=False)
125134
eval_pipelines = [
126135
dict(
127136
mode='test',
137+
dist_eval=True,
128138
evaluators=[
129139
dict(type='CocoDetectionEvaluator', classes=CLASSES),
130140
],

configs/detection/dab_detr/dab_detr_r50_8x2_50e_coco.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
_base_ = [
2-
'./dab_detr.py', '../_base_/dataset/autoaug_coco_detection.py',
2+
'./dab_detr.py', '../common/dataset/autoaug_coco_detection.py',
33
'configs/base.py'
44
]
55

configs/detection/detr/detr_r50_8x2_150e_coco.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
_base_ = [
2-
'./detr.py', '../_base_/dataset/autoaug_coco_detection.py',
2+
'./detr.py', '../common/dataset/autoaug_coco_detection.py',
33
'configs/base.py'
44
]
55

configs/detection/dino/README.md

+36
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
# DINO
2+
3+
> [DINO: DETR with Improved DeNoising Anchor Boxes for End-to-End Object Detection](https://arxiv.org/abs/2203.03605)
4+
5+
<!-- [ALGORITHM] -->
6+
7+
## Abstract
8+
9+
We present DINO (DETR with Improved deNoising anchOr boxes), a state-of-the-art end-to-end object detector. DINO improves over previous DETR-like models in performance and efficiency by using a contrastive way for denoising training, a mixed query selection method for anchor initialization, and a look forward twice scheme for box prediction. DINO achieves 49.4AP in 12 epochs and 51.3AP in 24 epochs on COCO with a ResNet-50 backbone and multi-scale features, yielding a significant improvement of +6.0AP and +2.7AP, respectively, compared to DN-DETR, the previous best DETR-like model. DINO scales well in both model size and data size. Without bells and whistles, after pre-training on the Objects365 dataset with a SwinL backbone, DINO obtains the best results on both COCO val2017 (63.2AP) and test-dev (63.3AP). Compared to other models on the leaderboard, DINO significantly reduces its model size and pre-training data size while achieving better results.
10+
11+
<div align=center>
12+
<img src="https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/algo_images/detection/DINO.png"/>
13+
</div>
14+
15+
## Results and Models
16+
17+
| Algorithm | Config | Params<br/>(backbone/total) | inference time(V100)<br/>(ms/img) | bbox_mAP<sup>val<br/><sub>0.5:0.95</sub> | AP<sup>val<br/><sub>50</sub> | Download |
18+
| ---------- | ------------------------------------------------------------ | ------------------------ | --------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ |
19+
| DINO_4sc_r50_12e | [DINO_4sc_r50_12e](https://github.com/alibaba/EasyCV/tree/master/configs/detection/dino/dino_4sc_r50_12e_coco.py) | 23M/47M | 184ms | 48.71 | 66.27 | [model](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/dino/dino_4sc_r50_12e/epoch_12.pth) - [log](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/dino/dino_4sc_r50_12e/20220815_141403.log.json) |
20+
| DINO_4sc_r50_36e | [DINO_4sc_r50_36e](https://github.com/alibaba/EasyCV/tree/master/configs/detection/dino/dino_4sc_r50_36e_coco.py) | 23M/47M | 184ms | 50.69 | 68.60 | [model](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/dino/dino_4sc_r50_36e/epoch_29.pth) - [log](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/dino/dino_4sc_r50_36e/20220817_101549.log.json) |
21+
| DINO_4sc_swinl_12e | [DINO_4sc_swinl_12e](https://github.com/alibaba/EasyCV/tree/master/configs/detection/dino/dino_4sc_swinl_12e_coco.py) | 195M/217M | 155ms | 56.86 | 75.61 | [model](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/dino/dino_4sc_swinl_12e/epoch_12.pth) - [log](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/dino/dino_4sc_swinl_12e/20220815_211633.log.json) |
22+
| DINO_4sc_swinl_36e | [DINO_4sc_swinl_36e](https://github.com/alibaba/EasyCV/tree/master/configs/detection/dino/dino_4sc_swinl_36e_coco.py) | 195M/217M | 155ms | 58.04 | 76.76 | [model](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/dino/dino_4sc_swinl_36e/epoch_34.pth) - [log](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/dino/dino_4sc_swinl_36e/20220817_101416.log.json) |
23+
| DINO_5sc_swinl_36e | [DINO_5sc_swinl_36e](https://github.com/alibaba/EasyCV/tree/master/configs/detection/dino/dino_5sc_swinl_36e_coco.py) | 195M/217M | 235ms | 58.47 | 77.10 | [model](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/dino/dino_5sc_swinl_36e/epoch_35.pth) - [log](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/dino/dino_5sc_swinl_36e/20220820_215711.log.json) |
24+
25+
## Citation
26+
27+
```latex
28+
@misc{zhang2022dino,
29+
title={DINO: DETR with Improved DeNoising Anchor Boxes for End-to-End Object Detection},
30+
author={Hao Zhang and Feng Li and Shilong Liu and Lei Zhang and Hang Su and Jun Zhu and Lionel M. Ni and Heung-Yeung Shum},
31+
year={2022},
32+
eprint={2203.03605},
33+
archivePrefix={arXiv},
34+
primaryClass={cs.CV}
35+
}
36+
```
+94
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
# model settings
2+
model = dict(
3+
type='Detection',
4+
pretrained=True,
5+
backbone=dict(
6+
type='ResNet',
7+
depth=50,
8+
num_stages=4,
9+
out_indices=(2, 3, 4),
10+
frozen_stages=1,
11+
norm_cfg=dict(type='BN', requires_grad=False),
12+
norm_eval=True,
13+
style='pytorch'),
14+
head=dict(
15+
type='DINOHead',
16+
transformer=dict(
17+
type='DeformableTransformer',
18+
d_model=256,
19+
nhead=8,
20+
num_queries=900,
21+
num_encoder_layers=6,
22+
num_unicoder_layers=0,
23+
num_decoder_layers=6,
24+
dim_feedforward=2048,
25+
dropout=0.0,
26+
activation='relu',
27+
normalize_before=False,
28+
return_intermediate_dec=True,
29+
query_dim=4,
30+
num_patterns=0,
31+
modulate_hw_attn=True,
32+
# for deformable encoder
33+
deformable_encoder=True,
34+
deformable_decoder=True,
35+
num_feature_levels=4,
36+
enc_n_points=4,
37+
dec_n_points=4,
38+
# init query
39+
decoder_query_perturber=None,
40+
add_channel_attention=False,
41+
random_refpoints_xy=False,
42+
# two stage
43+
two_stage_type=
44+
'standard', # ['no', 'standard', 'early', 'combine', 'enceachlayer', 'enclayer1']
45+
two_stage_pat_embed=0,
46+
two_stage_add_query_num=0,
47+
two_stage_learn_wh=False,
48+
two_stage_keep_all_tokens=False,
49+
# evo of #anchors
50+
dec_layer_number=None,
51+
rm_dec_query_scale=True,
52+
rm_self_attn_layers=None,
53+
key_aware_type=None,
54+
# layer share
55+
layer_share_type=None,
56+
# for detach
57+
rm_detach=None,
58+
decoder_sa_type='sa',
59+
module_seq=['sa', 'ca', 'ffn'],
60+
# for dn
61+
embed_init_tgt=True,
62+
use_detached_boxes_dec_out=False),
63+
dn_components=dict(
64+
dn_number=100,
65+
dn_label_noise_ratio=0.5, # paper 0.5, release code 0.25
66+
dn_box_noise_scale=1.0,
67+
dn_labelbook_size=80,
68+
),
69+
num_classes=80,
70+
in_channels=[512, 1024, 2048],
71+
embed_dims=256,
72+
query_dim=4,
73+
num_queries=900,
74+
num_select=300,
75+
random_refpoints_xy=False,
76+
num_patterns=0,
77+
fix_refpoints_hw=-1,
78+
num_feature_levels=4,
79+
# two stage
80+
two_stage_type='standard', # ['no', 'standard']
81+
two_stage_add_query_num=0,
82+
dec_pred_class_embed_share=True,
83+
dec_pred_bbox_embed_share=True,
84+
two_stage_class_embed_share=False,
85+
two_stage_bbox_embed_share=False,
86+
decoder_sa_type='sa',
87+
temperatureH=20,
88+
temperatureW=20,
89+
cost_dict=dict(
90+
cost_class=2,
91+
cost_bbox=5,
92+
cost_giou=2,
93+
),
94+
weight_dict=dict(loss_ce=1, loss_bbox=5, loss_giou=2)))
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
_base_ = [
2+
'./dino_4sc_r50.py', '../common/dataset/autoaug_coco_detection.py',
3+
'./dino_schedule_1x.py'
4+
]
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
_base_ = './dino_4sc_r50_12e_coco.py'
2+
3+
# learning policy
4+
lr_config = dict(policy='step', step=[22])
5+
6+
total_epochs = 24
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
_base_ = './dino_4sc_r50_12e_coco.py'
2+
3+
# learning policy
4+
lr_config = dict(policy='step', step=[27, 33])
5+
6+
total_epochs = 36

0 commit comments

Comments
 (0)