From 667c495d83f909c3c1b929a9c2ab7f0ef1911896 Mon Sep 17 00:00:00 2001 From: ZhangYuanhan-AI Date: Thu, 8 Jun 2023 09:29:28 +0800 Subject: [PATCH] add omnibenchmarkv2 --- .../omnibenchmarkv2_bs32_pil_resize.py | 51 +++++++++++ ...xb2048-linear-coslr-90e_omniv2-activity.py | 83 +++++++++++++++++ ...16_4xb2048-linear-coslr-90e_omniv2-bird.py | 83 +++++++++++++++++ ...p16_4xb2048-linear-coslr-90e_omniv2-car.py | 83 +++++++++++++++++ ...-linear-coslr-90e_omniv2-consumer_goods.py | 83 +++++++++++++++++ ...xb2048-linear-coslr-90e_omniv2-creation.py | 83 +++++++++++++++++ ...2048-linear-coslr-90e_omniv2-decoration.py | 83 +++++++++++++++++ ..._4xb2048-linear-coslr-90e_omniv2-device.py | 83 +++++++++++++++++ ...16_4xb2048-linear-coslr-90e_omniv2-food.py | 83 +++++++++++++++++ ...r-coslr-90e_omniv2-geological_formation.py | 83 +++++++++++++++++ ...linear-coslr-90e_omniv2-instrumentality.py | 83 +++++++++++++++++ ...2048-linear-coslr-90e_omniv2-locomotive.py | 83 +++++++++++++++++ ..._4xb2048-linear-coslr-90e_omniv2-mammal.py | 83 +++++++++++++++++ ...xb2048-linear-coslr-90e_omniv2-material.py | 83 +++++++++++++++++ ...inear-coslr-90e_omniv2-military_vehicle.py | 83 +++++++++++++++++ ...4xb2048-linear-coslr-90e_omniv2-process.py | 83 +++++++++++++++++ ..._4xb2048-linear-coslr-90e_omniv2-region.py | 83 +++++++++++++++++ ...b2048-linear-coslr-90e_omniv2-structure.py | 83 +++++++++++++++++ ...-linear-coslr-90e_omniv2-vascular_plant.py | 83 +++++++++++++++++ ...xb512-linear-coslr-90e_omniv2-amphibian.py | 83 +++++++++++++++++ ...inear-coslr-90e_omniv2-aquatic_aircraft.py | 89 ++++++++++++++++++ ...ear-coslr-90e_omniv2-aquatic_vertebrate.py | 83 +++++++++++++++++ ...t-base-p16_sam-headless_omniv2-activity.py | 90 +++++++++++++++++++ ...t-base-p16_sam-headless_omniv2-aircraft.py | 89 ++++++++++++++++++ ...-base-p16_sam-headless_omniv2-amphibian.py | 89 ++++++++++++++++++ ..._sam-headless_omniv2-aquatic_vertebrate.py | 90 +++++++++++++++++++ 
.../vit-base-p16_sam-headless_omniv2-bird.py | 90 +++++++++++++++++++ .../vit-base-p16_sam-headless_omniv2-car.py | 89 ++++++++++++++++++ ...-p16_sam-headless_omniv2-consumer_goods.py | 90 +++++++++++++++++++ ...t-base-p16_sam-headless_omniv2-creation.py | 90 +++++++++++++++++++ ...base-p16_sam-headless_omniv2-decoration.py | 90 +++++++++++++++++++ ...vit-base-p16_sam-headless_omniv2-device.py | 90 +++++++++++++++++++ .../vit-base-p16_sam-headless_omniv2-food.py | 90 +++++++++++++++++++ ...am-headless_omniv2-geological_formation.py | 90 +++++++++++++++++++ ...p16_sam-headless_omniv2-instrumentality.py | 90 +++++++++++++++++++ ...base-p16_sam-headless_omniv2-locomotive.py | 90 +++++++++++++++++++ ...vit-base-p16_sam-headless_omniv2-mammal.py | 90 +++++++++++++++++++ ...t-base-p16_sam-headless_omniv2-material.py | 90 +++++++++++++++++++ ...16_sam-headless_omniv2-military_vehicle.py | 90 +++++++++++++++++++ ...it-base-p16_sam-headless_omniv2-process.py | 90 +++++++++++++++++++ ...vit-base-p16_sam-headless_omniv2-region.py | 90 +++++++++++++++++++ ...-base-p16_sam-headless_omniv2-structure.py | 90 +++++++++++++++++++ ...-p16_sam-headless_omniv2-vascular_plant.py | 90 +++++++++++++++++++ 43 files changed, 3687 insertions(+) create mode 100644 configs/_base_/datasets/omnibenchmarkv2_bs32_pil_resize.py create mode 100644 configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-activity.py create mode 100644 configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-bird.py create mode 100644 configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-car.py create mode 100644 configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-consumer_goods.py create mode 100644 configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-creation.py create mode 100644 configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-decoration.py create mode 100644 
configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-device.py create mode 100644 configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-food.py create mode 100644 configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-geological_formation.py create mode 100644 configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-instrumentality.py create mode 100644 configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-locomotive.py create mode 100644 configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-mammal.py create mode 100644 configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-material.py create mode 100644 configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-military_vehicle.py create mode 100644 configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-process.py create mode 100644 configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-region.py create mode 100644 configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-structure.py create mode 100644 configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-vascular_plant.py create mode 100644 configs/mae/benchmarks/vit-base-p16_4xb512-linear-coslr-90e_omniv2-amphibian.py create mode 100644 configs/mae/benchmarks/vit-base-p16_4xb512-linear-coslr-90e_omniv2-aquatic_aircraft.py create mode 100644 configs/mae/benchmarks/vit-base-p16_4xb512-linear-coslr-90e_omniv2-aquatic_vertebrate.py create mode 100644 configs/sam/vit-base-p16_sam-headless_omniv2-activity.py create mode 100644 configs/sam/vit-base-p16_sam-headless_omniv2-aircraft.py create mode 100644 configs/sam/vit-base-p16_sam-headless_omniv2-amphibian.py create mode 100644 configs/sam/vit-base-p16_sam-headless_omniv2-aquatic_vertebrate.py create mode 100644 configs/sam/vit-base-p16_sam-headless_omniv2-bird.py create mode 100644 configs/sam/vit-base-p16_sam-headless_omniv2-car.py create mode 100644 
configs/sam/vit-base-p16_sam-headless_omniv2-consumer_goods.py create mode 100644 configs/sam/vit-base-p16_sam-headless_omniv2-creation.py create mode 100644 configs/sam/vit-base-p16_sam-headless_omniv2-decoration.py create mode 100644 configs/sam/vit-base-p16_sam-headless_omniv2-device.py create mode 100644 configs/sam/vit-base-p16_sam-headless_omniv2-food.py create mode 100644 configs/sam/vit-base-p16_sam-headless_omniv2-geological_formation.py create mode 100644 configs/sam/vit-base-p16_sam-headless_omniv2-instrumentality.py create mode 100644 configs/sam/vit-base-p16_sam-headless_omniv2-locomotive.py create mode 100644 configs/sam/vit-base-p16_sam-headless_omniv2-mammal.py create mode 100644 configs/sam/vit-base-p16_sam-headless_omniv2-material.py create mode 100644 configs/sam/vit-base-p16_sam-headless_omniv2-military_vehicle.py create mode 100644 configs/sam/vit-base-p16_sam-headless_omniv2-process.py create mode 100644 configs/sam/vit-base-p16_sam-headless_omniv2-region.py create mode 100644 configs/sam/vit-base-p16_sam-headless_omniv2-structure.py create mode 100644 configs/sam/vit-base-p16_sam-headless_omniv2-vascular_plant.py diff --git a/configs/_base_/datasets/omnibenchmarkv2_bs32_pil_resize.py b/configs/_base_/datasets/omnibenchmarkv2_bs32_pil_resize.py new file mode 100644 index 00000000000..1a46469d8a5 --- /dev/null +++ b/configs/_base_/datasets/omnibenchmarkv2_bs32_pil_resize.py @@ -0,0 +1,51 @@ +# dataset settings +dataset_type = 'ImageNet' +data_preprocessor = dict( + num_classes=1000, + # RGB format normalization parameters + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + # convert image from BGR to RGB + to_rgb=True, +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='RandomResizedCrop', scale=224, backend='pillow'), + dict(type='RandomFlip', prob=0.5, direction='horizontal'), + dict(type='PackInputs'), +] + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='ResizeEdge', scale=256, 
edge='short', backend='pillow'), + dict(type='CenterCrop', crop_size=224), + dict(type='PackInputs'), +] + +train_dataloader = dict( + batch_size=32, + num_workers=4, + dataset=dict( + type=dataset_type, + data_root='data/omnibenchmarkv2/data/activity', + ann_file='data/omnibenchmarkv2/meta/activity/train.txt', + pipeline=train_pipeline), + sampler=dict(type='DefaultSampler', shuffle=True), +) + +val_dataloader = dict( + batch_size=32, + num_workers=4, + dataset=dict( + type=dataset_type, + data_root='data/omnibenchmarkv2/data/activity', + ann_file='data/omnibenchmarkv2/meta/activity/test.txt', + pipeline=test_pipeline), + sampler=dict(type='DefaultSampler', shuffle=False), +) +val_evaluator = dict(type='Accuracy', topk=(1, 5)) + +# If you want standard test, please manually configure the test dataset +test_dataloader = val_dataloader +test_evaluator = val_evaluator diff --git a/configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-activity.py b/configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-activity.py new file mode 100644 index 00000000000..ef2af907087 --- /dev/null +++ b/configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-activity.py @@ -0,0 +1,83 @@ +_base_ = [ + '../../_base_/datasets/omnibenchmarkv2_bs32_pil_resize.py', + '../../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../../_base_/default_runtime.py' +] + +# dataset settings +data_preprocessor = dict( + num_classes=691, +) + +train_dataloader = dict(batch_size=2048, + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/activity/meta/train.txt', + data_prefix='data/activity/images/'), + drop_last=True) +val_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/activity/meta/val.txt', + data_prefix='data/activity/images/'), + drop_last=False) +test_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/activity/meta/test.txt', + 
data_prefix='data/activity/images/'), + drop_last=False) + +# model settings +model = dict( + type='ImageClassifier', + backbone=dict( + type='VisionTransformer', + arch='base', + img_size=224, + patch_size=16, + frozen_stages=12, + out_type='cls_token', + final_norm=True, + init_cfg=dict(type='Pretrained', checkpoint='/mnt/petrelfs/zhangyuanhan/weights/mae_vit-base-p16_8xb512-fp16-coslr-1600e_in1k_20220825-f7569ca2.pth', prefix="backbone.")), + neck=dict(type='ClsBatchNormNeck', input_features=768), + head=dict( + type='VisionTransformerClsHead', + num_classes=691, + in_channels=768, + loss=dict(type='CrossEntropyLoss'), + init_cfg=[dict(type='TruncNormal', layer='Linear', std=0.01)])) + +# optimizer +optim_wrapper = dict( + _delete_=True, + type='AmpOptimWrapper', + optimizer=dict(type='LARS', lr=6.4, weight_decay=0.0, momentum=0.9)) + +# learning rate scheduler +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1e-4, + by_epoch=True, + begin=0, + end=10, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=80, + by_epoch=True, + begin=10, + end=90, + eta_min=0.0, + convert_to_iter_based=True) +] + +# runtime settings +train_cfg = dict(by_epoch=True, max_epochs=90,val_interval=10) + +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3), + logger=dict(type='LoggerHook', interval=10)) + +randomness = dict(seed=0, diff_rank_seed=True) diff --git a/configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-bird.py b/configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-bird.py new file mode 100644 index 00000000000..8da5a956ac5 --- /dev/null +++ b/configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-bird.py @@ -0,0 +1,83 @@ +_base_ = [ + '../../_base_/datasets/omnibenchmarkv2_bs32_pil_resize.py', + '../../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../../_base_/default_runtime.py' +] + +# dataset settings +data_preprocessor = dict( + 
num_classes=646, +) + +train_dataloader = dict(batch_size=2048, + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/bird/meta/train.txt', + data_prefix='data/bird/images/'), + drop_last=True) +val_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/bird/meta/val.txt', + data_prefix='data/bird/images/'), + drop_last=False) +test_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/bird/meta/test.txt', + data_prefix='data/bird/images/'), + drop_last=False) + +# model settings +model = dict( + type='ImageClassifier', + backbone=dict( + type='VisionTransformer', + arch='base', + img_size=224, + patch_size=16, + frozen_stages=12, + out_type='cls_token', + final_norm=True, + init_cfg=dict(type='Pretrained', checkpoint='/mnt/petrelfs/zhangyuanhan/weights/mae_vit-base-p16_8xb512-fp16-coslr-1600e_in1k_20220825-f7569ca2.pth', prefix="backbone.")), + neck=dict(type='ClsBatchNormNeck', input_features=768), + head=dict( + type='VisionTransformerClsHead', + num_classes=646, + in_channels=768, + loss=dict(type='CrossEntropyLoss'), + init_cfg=[dict(type='TruncNormal', layer='Linear', std=0.01)])) + +# optimizer +optim_wrapper = dict( + _delete_=True, + type='AmpOptimWrapper', + optimizer=dict(type='LARS', lr=6.4, weight_decay=0.0, momentum=0.9)) + +# learning rate scheduler +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1e-4, + by_epoch=True, + begin=0, + end=10, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=80, + by_epoch=True, + begin=10, + end=90, + eta_min=0.0, + convert_to_iter_based=True) +] + +# runtime settings +train_cfg = dict(by_epoch=True, max_epochs=90,val_interval=10) + +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=-1), + logger=dict(type='LoggerHook', interval=10)) + +randomness = dict(seed=0, diff_rank_seed=True) diff --git 
a/configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-car.py b/configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-car.py new file mode 100644 index 00000000000..4e43db73e79 --- /dev/null +++ b/configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-car.py @@ -0,0 +1,83 @@ +_base_ = [ + '../../_base_/datasets/omnibenchmarkv2_bs32_pil_resize.py', + '../../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../../_base_/default_runtime.py' +] + +# dataset settings +data_preprocessor = dict( + num_classes=767, +) + +train_dataloader = dict(batch_size=2048, + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/car/meta/train.txt', + data_prefix='data/car/images/'), + drop_last=True) +val_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/car/meta/val.txt', + data_prefix='data/car/images/'), + drop_last=False) +test_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/car/meta/test.txt', + data_prefix='data/car/images/'), + drop_last=False) + +# model settings +model = dict( + type='ImageClassifier', + backbone=dict( + type='VisionTransformer', + arch='base', + img_size=224, + patch_size=16, + frozen_stages=12, + out_type='cls_token', + final_norm=True, + init_cfg=dict(type='Pretrained', checkpoint='/mnt/petrelfs/zhangyuanhan/weights/mae_vit-base-p16_8xb512-fp16-coslr-1600e_in1k_20220825-f7569ca2.pth', prefix="backbone.")), + neck=dict(type='ClsBatchNormNeck', input_features=768), + head=dict( + type='VisionTransformerClsHead', + num_classes=767, + in_channels=768, + loss=dict(type='CrossEntropyLoss'), + init_cfg=[dict(type='TruncNormal', layer='Linear', std=0.01)])) + +# optimizer +optim_wrapper = dict( + _delete_=True, + type='AmpOptimWrapper', + optimizer=dict(type='LARS', lr=6.4, weight_decay=0.0, momentum=0.9)) + +# learning rate scheduler +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1e-4, + 
by_epoch=True, + begin=0, + end=10, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=80, + by_epoch=True, + begin=10, + end=90, + eta_min=0.0, + convert_to_iter_based=True) +] + +# runtime settings +train_cfg = dict(by_epoch=True, max_epochs=90,val_interval=10) + +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=-1), + logger=dict(type='LoggerHook', interval=10)) + +randomness = dict(seed=0, diff_rank_seed=True) diff --git a/configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-consumer_goods.py b/configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-consumer_goods.py new file mode 100644 index 00000000000..6633f5bfbcc --- /dev/null +++ b/configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-consumer_goods.py @@ -0,0 +1,83 @@ +_base_ = [ + '../../_base_/datasets/omnibenchmarkv2_bs32_pil_resize.py', + '../../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../../_base_/default_runtime.py' +] + +# dataset settings +data_preprocessor = dict( + num_classes=190, +) + +train_dataloader = dict(batch_size=2048, + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/consumer_goods/meta/train.txt', + data_prefix='data/consumer_goods/images/'), + drop_last=True) +val_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/consumer_goods/meta/val.txt', + data_prefix='data/consumer_goods/images/'), + drop_last=False) +test_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/consumer_goods/meta/test.txt', + data_prefix='data/consumer_goods/images/'), + drop_last=False) + +# model settings +model = dict( + type='ImageClassifier', + backbone=dict( + type='VisionTransformer', + arch='base', + img_size=224, + patch_size=16, + frozen_stages=12, + out_type='cls_token', + final_norm=True, + init_cfg=dict(type='Pretrained', 
checkpoint='/mnt/petrelfs/zhangyuanhan/weights/mae_vit-base-p16_8xb512-fp16-coslr-1600e_in1k_20220825-f7569ca2.pth', prefix="backbone.")), + neck=dict(type='ClsBatchNormNeck', input_features=768), + head=dict( + type='VisionTransformerClsHead', + num_classes=190, + in_channels=768, + loss=dict(type='CrossEntropyLoss'), + init_cfg=[dict(type='TruncNormal', layer='Linear', std=0.01)])) + +# optimizer +optim_wrapper = dict( + _delete_=True, + type='AmpOptimWrapper', + optimizer=dict(type='LARS', lr=6.4, weight_decay=0.0, momentum=0.9)) + +# learning rate scheduler +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1e-4, + by_epoch=True, + begin=0, + end=10, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=80, + by_epoch=True, + begin=10, + end=90, + eta_min=0.0, + convert_to_iter_based=True) +] + +# runtime settings +train_cfg = dict(by_epoch=True, max_epochs=90,val_interval=10) + +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=-1), + logger=dict(type='LoggerHook', interval=10)) + +randomness = dict(seed=0, diff_rank_seed=True) diff --git a/configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-creation.py b/configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-creation.py new file mode 100644 index 00000000000..aca705096b6 --- /dev/null +++ b/configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-creation.py @@ -0,0 +1,83 @@ +_base_ = [ + '../../_base_/datasets/omnibenchmarkv2_bs32_pil_resize.py', + '../../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../../_base_/default_runtime.py' +] + +# dataset settings +data_preprocessor = dict( + num_classes=105, +) + +train_dataloader = dict(batch_size=2048, + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/creation/meta/train.txt', + data_prefix='data/creation/images/'), + drop_last=True) +val_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + 
ann_file='annotation/creation/meta/val.txt', + data_prefix='data/creation/images/'), + drop_last=False) +test_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/creation/meta/test.txt', + data_prefix='data/creation/images/'), + drop_last=False) + +# model settings +model = dict( + type='ImageClassifier', + backbone=dict( + type='VisionTransformer', + arch='base', + img_size=224, + patch_size=16, + frozen_stages=12, + out_type='cls_token', + final_norm=True, + init_cfg=dict(type='Pretrained', checkpoint='/mnt/petrelfs/zhangyuanhan/weights/mae_vit-base-p16_8xb512-fp16-coslr-1600e_in1k_20220825-f7569ca2.pth', prefix="backbone.")), + neck=dict(type='ClsBatchNormNeck', input_features=768), + head=dict( + type='VisionTransformerClsHead', + num_classes=105, + in_channels=768, + loss=dict(type='CrossEntropyLoss'), + init_cfg=[dict(type='TruncNormal', layer='Linear', std=0.01)])) + +# optimizer +optim_wrapper = dict( + _delete_=True, + type='AmpOptimWrapper', + optimizer=dict(type='LARS', lr=6.4, weight_decay=0.0, momentum=0.9)) + +# learning rate scheduler +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1e-4, + by_epoch=True, + begin=0, + end=10, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=80, + by_epoch=True, + begin=10, + end=90, + eta_min=0.0, + convert_to_iter_based=True) +] + +# runtime settings +train_cfg = dict(by_epoch=True, max_epochs=90,val_interval=10) + +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=-1), + logger=dict(type='LoggerHook', interval=10)) + +randomness = dict(seed=0, diff_rank_seed=True) diff --git a/configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-decoration.py b/configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-decoration.py new file mode 100644 index 00000000000..c3879ab5920 --- /dev/null +++ b/configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-decoration.py @@ -0,0 +1,83 @@ 
+_base_ = [ + '../../_base_/datasets/omnibenchmarkv2_bs32_pil_resize.py', + '../../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../../_base_/default_runtime.py' +] + +# dataset settings +data_preprocessor = dict( + num_classes=44, +) + +train_dataloader = dict(batch_size=2048, + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/decoration/meta/train.txt', + data_prefix='data/decoration/images/'), + drop_last=True) +val_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/decoration/meta/val.txt', + data_prefix='data/decoration/images/'), + drop_last=False) +test_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/decoration/meta/test.txt', + data_prefix='data/decoration/images/'), + drop_last=False) + +# model settings +model = dict( + type='ImageClassifier', + backbone=dict( + type='VisionTransformer', + arch='base', + img_size=224, + patch_size=16, + frozen_stages=12, + out_type='cls_token', + final_norm=True, + init_cfg=dict(type='Pretrained', checkpoint='/mnt/petrelfs/zhangyuanhan/weights/mae_vit-base-p16_8xb512-fp16-coslr-1600e_in1k_20220825-f7569ca2.pth', prefix="backbone.")), + neck=dict(type='ClsBatchNormNeck', input_features=768), + head=dict( + type='VisionTransformerClsHead', + num_classes=44, + in_channels=768, + loss=dict(type='CrossEntropyLoss'), + init_cfg=[dict(type='TruncNormal', layer='Linear', std=0.01)])) + +# optimizer +optim_wrapper = dict( + _delete_=True, + type='AmpOptimWrapper', + optimizer=dict(type='LARS', lr=6.4, weight_decay=0.0, momentum=0.9)) + +# learning rate scheduler +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1e-4, + by_epoch=True, + begin=0, + end=10, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=80, + by_epoch=True, + begin=10, + end=90, + eta_min=0.0, + convert_to_iter_based=True) +] + +# runtime settings +train_cfg = dict(by_epoch=True, 
max_epochs=90,val_interval=10) + +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=-1), + logger=dict(type='LoggerHook', interval=10)) + +randomness = dict(seed=0, diff_rank_seed=True) diff --git a/configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-device.py b/configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-device.py new file mode 100644 index 00000000000..6c89e08e472 --- /dev/null +++ b/configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-device.py @@ -0,0 +1,83 @@ +_base_ = [ + '../../_base_/datasets/omnibenchmarkv2_bs32_pil_resize.py', + '../../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../../_base_/default_runtime.py' +] + +# dataset settings +data_preprocessor = dict( + num_classes=228, +) + +train_dataloader = dict(batch_size=2048, + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/device/meta/train.txt', + data_prefix='data/device/images/'), + drop_last=True) +val_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/device/meta/val.txt', + data_prefix='data/device/images/'), + drop_last=False) +test_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/device/meta/test.txt', + data_prefix='data/device/images/'), + drop_last=False) + +# model settings +model = dict( + type='ImageClassifier', + backbone=dict( + type='VisionTransformer', + arch='base', + img_size=224, + patch_size=16, + frozen_stages=12, + out_type='cls_token', + final_norm=True, + init_cfg=dict(type='Pretrained', checkpoint='/mnt/petrelfs/zhangyuanhan/weights/mae_vit-base-p16_8xb512-fp16-coslr-1600e_in1k_20220825-f7569ca2.pth', prefix="backbone.")), + neck=dict(type='ClsBatchNormNeck', input_features=768), + head=dict( + type='VisionTransformerClsHead', + num_classes=228, + in_channels=768, + loss=dict(type='CrossEntropyLoss'), + init_cfg=[dict(type='TruncNormal', layer='Linear', std=0.01)])) + +# 
optimizer +optim_wrapper = dict( + _delete_=True, + type='AmpOptimWrapper', + optimizer=dict(type='LARS', lr=6.4, weight_decay=0.0, momentum=0.9)) + +# learning rate scheduler +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1e-4, + by_epoch=True, + begin=0, + end=10, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=80, + by_epoch=True, + begin=10, + end=90, + eta_min=0.0, + convert_to_iter_based=True) +] + +# runtime settings +train_cfg = dict(by_epoch=True, max_epochs=90,val_interval=10) + +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=-1), + logger=dict(type='LoggerHook', interval=10)) + +randomness = dict(seed=0, diff_rank_seed=True) diff --git a/configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-food.py b/configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-food.py new file mode 100644 index 00000000000..393b66b4b13 --- /dev/null +++ b/configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-food.py @@ -0,0 +1,83 @@ +_base_ = [ + '../../_base_/datasets/omnibenchmarkv2_bs32_pil_resize.py', + '../../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../../_base_/default_runtime.py' +] + +# dataset settings +data_preprocessor = dict( + num_classes=891, +) + +train_dataloader = dict(batch_size=2048, + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/food/meta/train.txt', + data_prefix='data/food/images/'), + drop_last=True) +val_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/food/meta/val.txt', + data_prefix='data/food/images/'), + drop_last=False) +test_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/food/meta/test.txt', + data_prefix='data/food/images/'), + drop_last=False) + +# model settings +model = dict( + type='ImageClassifier', + backbone=dict( + type='VisionTransformer', + arch='base', + img_size=224, + patch_size=16, + 
frozen_stages=12, + out_type='cls_token', + final_norm=True, + init_cfg=dict(type='Pretrained', checkpoint='/mnt/petrelfs/zhangyuanhan/weights/mae_vit-base-p16_8xb512-fp16-coslr-1600e_in1k_20220825-f7569ca2.pth', prefix="backbone.")), + neck=dict(type='ClsBatchNormNeck', input_features=768), + head=dict( + type='VisionTransformerClsHead', + num_classes=891, + in_channels=768, + loss=dict(type='CrossEntropyLoss'), + init_cfg=[dict(type='TruncNormal', layer='Linear', std=0.01)])) + +# optimizer +optim_wrapper = dict( + _delete_=True, + type='AmpOptimWrapper', + optimizer=dict(type='LARS', lr=6.4, weight_decay=0.0, momentum=0.9)) + +# learning rate scheduler +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1e-4, + by_epoch=True, + begin=0, + end=10, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=80, + by_epoch=True, + begin=10, + end=90, + eta_min=0.0, + convert_to_iter_based=True) +] + +# runtime settings +train_cfg = dict(by_epoch=True, max_epochs=90,val_interval=10) + +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=-1), + logger=dict(type='LoggerHook', interval=10)) + +randomness = dict(seed=0, diff_rank_seed=True) diff --git a/configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-geological_formation.py b/configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-geological_formation.py new file mode 100644 index 00000000000..18e67b5a320 --- /dev/null +++ b/configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-geological_formation.py @@ -0,0 +1,83 @@ +_base_ = [ + '../../_base_/datasets/omnibenchmarkv2_bs32_pil_resize.py', + '../../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../../_base_/default_runtime.py' +] + +# dataset settings +data_preprocessor = dict( + num_classes=110, +) + +train_dataloader = dict(batch_size=2048, + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/geological_formation/meta/train.txt', + 
data_prefix='data/geological_formation/images/'), + drop_last=True) +val_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/geological_formation/meta/val.txt', + data_prefix='data/geological_formation/images/'), + drop_last=False) +test_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/geological_formation/meta/test.txt', + data_prefix='data/geological_formation/images/'), + drop_last=False) + +# model settings +model = dict( + type='ImageClassifier', + backbone=dict( + type='VisionTransformer', + arch='base', + img_size=224, + patch_size=16, + frozen_stages=12, + out_type='cls_token', + final_norm=True, + init_cfg=dict(type='Pretrained', checkpoint='/mnt/petrelfs/zhangyuanhan/weights/mae_vit-base-p16_8xb512-fp16-coslr-1600e_in1k_20220825-f7569ca2.pth', prefix="backbone.")), + neck=dict(type='ClsBatchNormNeck', input_features=768), + head=dict( + type='VisionTransformerClsHead', + num_classes=110, + in_channels=768, + loss=dict(type='CrossEntropyLoss'), + init_cfg=[dict(type='TruncNormal', layer='Linear', std=0.01)])) + +# optimizer +optim_wrapper = dict( + _delete_=True, + type='AmpOptimWrapper', + optimizer=dict(type='LARS', lr=6.4, weight_decay=0.0, momentum=0.9)) + +# learning rate scheduler +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1e-4, + by_epoch=True, + begin=0, + end=10, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=80, + by_epoch=True, + begin=10, + end=90, + eta_min=0.0, + convert_to_iter_based=True) +] + +# runtime settings +train_cfg = dict(by_epoch=True, max_epochs=90,val_interval=10) + +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=-1), + logger=dict(type='LoggerHook', interval=10)) + +randomness = dict(seed=0, diff_rank_seed=True) diff --git a/configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-instrumentality.py 
b/configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-instrumentality.py new file mode 100644 index 00000000000..671d7912c02 --- /dev/null +++ b/configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-instrumentality.py @@ -0,0 +1,83 @@ +_base_ = [ + '../../_base_/datasets/omnibenchmarkv2_bs32_pil_resize.py', + '../../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../../_base_/default_runtime.py' +] + +# dataset settings +data_preprocessor = dict( + num_classes=112, +) + +train_dataloader = dict(batch_size=2048, + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/instrumentality/meta/train.txt', + data_prefix='data/instrumentality/images/'), + drop_last=True) +val_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/instrumentality/meta/val.txt', + data_prefix='data/instrumentality/images/'), + drop_last=False) +test_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/instrumentality/meta/test.txt', + data_prefix='data/instrumentality/images/'), + drop_last=False) + +# model settings +model = dict( + type='ImageClassifier', + backbone=dict( + type='VisionTransformer', + arch='base', + img_size=224, + patch_size=16, + frozen_stages=12, + out_type='cls_token', + final_norm=True, + init_cfg=dict(type='Pretrained', checkpoint='/mnt/petrelfs/zhangyuanhan/weights/mae_vit-base-p16_8xb512-fp16-coslr-1600e_in1k_20220825-f7569ca2.pth', prefix="backbone.")), + neck=dict(type='ClsBatchNormNeck', input_features=768), + head=dict( + type='VisionTransformerClsHead', + num_classes=112, + in_channels=768, + loss=dict(type='CrossEntropyLoss'), + init_cfg=[dict(type='TruncNormal', layer='Linear', std=0.01)])) + +# optimizer +optim_wrapper = dict( + _delete_=True, + type='AmpOptimWrapper', + optimizer=dict(type='LARS', lr=6.4, weight_decay=0.0, momentum=0.9)) + +# learning rate scheduler +param_scheduler = [ + dict( + type='LinearLR', + 
start_factor=1e-4, + by_epoch=True, + begin=0, + end=10, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=80, + by_epoch=True, + begin=10, + end=90, + eta_min=0.0, + convert_to_iter_based=True) +] + +# runtime settings +train_cfg = dict(by_epoch=True, max_epochs=90,val_interval=10) + +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=-1), + logger=dict(type='LoggerHook', interval=10)) + +randomness = dict(seed=0, diff_rank_seed=True) diff --git a/configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-locomotive.py b/configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-locomotive.py new file mode 100644 index 00000000000..13be1600e73 --- /dev/null +++ b/configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-locomotive.py @@ -0,0 +1,83 @@ +_base_ = [ + '../../_base_/datasets/omnibenchmarkv2_bs32_pil_resize.py', + '../../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../../_base_/default_runtime.py' +] + +# dataset settings +data_preprocessor = dict( + num_classes=44, +) + +train_dataloader = dict(batch_size=2048, + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/locomotive/meta/train.txt', + data_prefix='data/locomotive/images/'), + drop_last=True) +val_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/locomotive/meta/val.txt', + data_prefix='data/locomotive/images/'), + drop_last=False) +test_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/locomotive/meta/test.txt', + data_prefix='data/locomotive/images/'), + drop_last=False) + +# model settings +model = dict( + type='ImageClassifier', + backbone=dict( + type='VisionTransformer', + arch='base', + img_size=224, + patch_size=16, + frozen_stages=12, + out_type='cls_token', + final_norm=True, + init_cfg=dict(type='Pretrained', 
checkpoint='/mnt/petrelfs/zhangyuanhan/weights/mae_vit-base-p16_8xb512-fp16-coslr-1600e_in1k_20220825-f7569ca2.pth', prefix="backbone.")), + neck=dict(type='ClsBatchNormNeck', input_features=768), + head=dict( + type='VisionTransformerClsHead', + num_classes=44, + in_channels=768, + loss=dict(type='CrossEntropyLoss'), + init_cfg=[dict(type='TruncNormal', layer='Linear', std=0.01)])) + +# optimizer +optim_wrapper = dict( + _delete_=True, + type='AmpOptimWrapper', + optimizer=dict(type='LARS', lr=6.4, weight_decay=0.0, momentum=0.9)) + +# learning rate scheduler +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1e-4, + by_epoch=True, + begin=0, + end=10, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=80, + by_epoch=True, + begin=10, + end=90, + eta_min=0.0, + convert_to_iter_based=True) +] + +# runtime settings +train_cfg = dict(by_epoch=True, max_epochs=90,val_interval=10) + +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=-1), + logger=dict(type='LoggerHook', interval=10)) + +randomness = dict(seed=0, diff_rank_seed=True) diff --git a/configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-mammal.py b/configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-mammal.py new file mode 100644 index 00000000000..fcc9ed37fce --- /dev/null +++ b/configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-mammal.py @@ -0,0 +1,83 @@ +_base_ = [ + '../../_base_/datasets/omnibenchmarkv2_bs32_pil_resize.py', + '../../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../../_base_/default_runtime.py' +] + +# dataset settings +data_preprocessor = dict( + num_classes=384, +) + +train_dataloader = dict(batch_size=2048, + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/mammal/meta/train.txt', + data_prefix='data/mammal/images/'), + drop_last=True) +val_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + 
ann_file='annotation/mammal/meta/val.txt', + data_prefix='data/mammal/images/'), + drop_last=False) +test_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/mammal/meta/test.txt', + data_prefix='data/mammal/images/'), + drop_last=False) + +# model settings +model = dict( + type='ImageClassifier', + backbone=dict( + type='VisionTransformer', + arch='base', + img_size=224, + patch_size=16, + frozen_stages=12, + out_type='cls_token', + final_norm=True, + init_cfg=dict(type='Pretrained', checkpoint='/mnt/petrelfs/zhangyuanhan/weights/mae_vit-base-p16_8xb512-fp16-coslr-1600e_in1k_20220825-f7569ca2.pth', prefix="backbone.")), + neck=dict(type='ClsBatchNormNeck', input_features=768), + head=dict( + type='VisionTransformerClsHead', + num_classes=384, + in_channels=768, + loss=dict(type='CrossEntropyLoss'), + init_cfg=[dict(type='TruncNormal', layer='Linear', std=0.01)])) + +# optimizer +optim_wrapper = dict( + _delete_=True, + type='AmpOptimWrapper', + optimizer=dict(type='LARS', lr=6.4, weight_decay=0.0, momentum=0.9)) + +# learning rate scheduler +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1e-4, + by_epoch=True, + begin=0, + end=10, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=80, + by_epoch=True, + begin=10, + end=90, + eta_min=0.0, + convert_to_iter_based=True) +] + +# runtime settings +train_cfg = dict(by_epoch=True, max_epochs=90,val_interval=10) + +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=-1), + logger=dict(type='LoggerHook', interval=10)) + +randomness = dict(seed=0, diff_rank_seed=True) diff --git a/configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-material.py b/configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-material.py new file mode 100644 index 00000000000..4790ce0777b --- /dev/null +++ b/configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-material.py @@ -0,0 +1,83 @@ +_base_ = [ + 
'../../_base_/datasets/omnibenchmarkv2_bs32_pil_resize.py', + '../../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../../_base_/default_runtime.py' +] + +# dataset settings +data_preprocessor = dict( + num_classes=97, +) + +train_dataloader = dict(batch_size=2048, + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/material/meta/train.txt', + data_prefix='data/material/images/'), + drop_last=True) +val_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/material/meta/val.txt', + data_prefix='data/material/images/'), + drop_last=False) +test_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/material/meta/test.txt', + data_prefix='data/material/images/'), + drop_last=False) + +# model settings +model = dict( + type='ImageClassifier', + backbone=dict( + type='VisionTransformer', + arch='base', + img_size=224, + patch_size=16, + frozen_stages=12, + out_type='cls_token', + final_norm=True, + init_cfg=dict(type='Pretrained', checkpoint='/mnt/petrelfs/zhangyuanhan/weights/mae_vit-base-p16_8xb512-fp16-coslr-1600e_in1k_20220825-f7569ca2.pth', prefix="backbone.")), + neck=dict(type='ClsBatchNormNeck', input_features=768), + head=dict( + type='VisionTransformerClsHead', + num_classes=97, + in_channels=768, + loss=dict(type='CrossEntropyLoss'), + init_cfg=[dict(type='TruncNormal', layer='Linear', std=0.01)])) + +# optimizer +optim_wrapper = dict( + _delete_=True, + type='AmpOptimWrapper', + optimizer=dict(type='LARS', lr=6.4, weight_decay=0.0, momentum=0.9)) + +# learning rate scheduler +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1e-4, + by_epoch=True, + begin=0, + end=10, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=80, + by_epoch=True, + begin=10, + end=90, + eta_min=0.0, + convert_to_iter_based=True) +] + +# runtime settings +train_cfg = dict(by_epoch=True, max_epochs=90,val_interval=10) + +default_hooks = 
dict( + checkpoint=dict(type='CheckpointHook', interval=-1), + logger=dict(type='LoggerHook', interval=10)) + +randomness = dict(seed=0, diff_rank_seed=True) diff --git a/configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-military_vehicle.py b/configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-military_vehicle.py new file mode 100644 index 00000000000..7cf032dd8b0 --- /dev/null +++ b/configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-military_vehicle.py @@ -0,0 +1,83 @@ +_base_ = [ + '../../_base_/datasets/omnibenchmarkv2_bs32_pil_resize.py', + '../../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../../_base_/default_runtime.py' +] + +# dataset settings +data_preprocessor = dict( + num_classes=72, +) + +train_dataloader = dict(batch_size=2048, + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/military_vehicle/meta/train.txt', + data_prefix='data/military_vehicle/images/'), + drop_last=True) +val_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/military_vehicle/meta/val.txt', + data_prefix='data/military_vehicle/images/'), + drop_last=False) +test_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/military_vehicle/meta/test.txt', + data_prefix='data/military_vehicle/images/'), + drop_last=False) + +# model settings +model = dict( + type='ImageClassifier', + backbone=dict( + type='VisionTransformer', + arch='base', + img_size=224, + patch_size=16, + frozen_stages=12, + out_type='cls_token', + final_norm=True, + init_cfg=dict(type='Pretrained', checkpoint='/mnt/petrelfs/zhangyuanhan/weights/mae_vit-base-p16_8xb512-fp16-coslr-1600e_in1k_20220825-f7569ca2.pth', prefix="backbone.")), + neck=dict(type='ClsBatchNormNeck', input_features=768), + head=dict( + type='VisionTransformerClsHead', + num_classes=72, + in_channels=768, + loss=dict(type='CrossEntropyLoss'), + init_cfg=[dict(type='TruncNormal', 
layer='Linear', std=0.01)])) + +# optimizer +optim_wrapper = dict( + _delete_=True, + type='AmpOptimWrapper', + optimizer=dict(type='LARS', lr=6.4, weight_decay=0.0, momentum=0.9)) + +# learning rate scheduler +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1e-4, + by_epoch=True, + begin=0, + end=10, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=80, + by_epoch=True, + begin=10, + end=90, + eta_min=0.0, + convert_to_iter_based=True) +] + +# runtime settings +train_cfg = dict(by_epoch=True, max_epochs=90,val_interval=10) + +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=-1), + logger=dict(type='LoggerHook', interval=10)) + +randomness = dict(seed=0, diff_rank_seed=True) diff --git a/configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-process.py b/configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-process.py new file mode 100644 index 00000000000..99984315547 --- /dev/null +++ b/configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-process.py @@ -0,0 +1,83 @@ +_base_ = [ + '../../_base_/datasets/omnibenchmarkv2_bs32_pil_resize.py', + '../../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../../_base_/default_runtime.py' +] + +# dataset settings +data_preprocessor = dict( + num_classes=46, +) + +train_dataloader = dict(batch_size=2048, + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/process/meta/train.txt', + data_prefix='data/process/images/'), + drop_last=True) +val_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/process/meta/val.txt', + data_prefix='data/process/images/'), + drop_last=False) +test_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/process/meta/test.txt', + data_prefix='data/process/images/'), + drop_last=False) + +# model settings +model = dict( + type='ImageClassifier', + backbone=dict( + type='VisionTransformer', 
+ arch='base', + img_size=224, + patch_size=16, + frozen_stages=12, + out_type='cls_token', + final_norm=True, + init_cfg=dict(type='Pretrained', checkpoint='/mnt/petrelfs/zhangyuanhan/weights/mae_vit-base-p16_8xb512-fp16-coslr-1600e_in1k_20220825-f7569ca2.pth', prefix="backbone.")), + neck=dict(type='ClsBatchNormNeck', input_features=768), + head=dict( + type='VisionTransformerClsHead', + num_classes=46, + in_channels=768, + loss=dict(type='CrossEntropyLoss'), + init_cfg=[dict(type='TruncNormal', layer='Linear', std=0.01)])) + +# optimizer +optim_wrapper = dict( + _delete_=True, + type='AmpOptimWrapper', + optimizer=dict(type='LARS', lr=6.4, weight_decay=0.0, momentum=0.9)) + +# learning rate scheduler +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1e-4, + by_epoch=True, + begin=0, + end=10, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=80, + by_epoch=True, + begin=10, + end=90, + eta_min=0.0, + convert_to_iter_based=True) +] + +# runtime settings +train_cfg = dict(by_epoch=True, max_epochs=90,val_interval=10) + +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=-1), + logger=dict(type='LoggerHook', interval=10)) + +randomness = dict(seed=0, diff_rank_seed=True) diff --git a/configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-region.py b/configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-region.py new file mode 100644 index 00000000000..272651a4f74 --- /dev/null +++ b/configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-region.py @@ -0,0 +1,83 @@ +_base_ = [ + '../../_base_/datasets/omnibenchmarkv2_bs32_pil_resize.py', + '../../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../../_base_/default_runtime.py' +] + +# dataset settings +data_preprocessor = dict( + num_classes=217, +) + +train_dataloader = dict(batch_size=2048, + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/region/meta/train.txt', + 
data_prefix='data/region/images/'), + drop_last=True) +val_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/region/meta/val.txt', + data_prefix='data/region/images/'), + drop_last=False) +test_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/region/meta/test.txt', + data_prefix='data/region/images/'), + drop_last=False) + +# model settings +model = dict( + type='ImageClassifier', + backbone=dict( + type='VisionTransformer', + arch='base', + img_size=224, + patch_size=16, + frozen_stages=12, + out_type='cls_token', + final_norm=True, + init_cfg=dict(type='Pretrained', checkpoint='/mnt/petrelfs/zhangyuanhan/weights/mae_vit-base-p16_8xb512-fp16-coslr-1600e_in1k_20220825-f7569ca2.pth', prefix="backbone.")), + neck=dict(type='ClsBatchNormNeck', input_features=768), + head=dict( + type='VisionTransformerClsHead', + num_classes=217, + in_channels=768, + loss=dict(type='CrossEntropyLoss'), + init_cfg=[dict(type='TruncNormal', layer='Linear', std=0.01)])) + +# optimizer +optim_wrapper = dict( + _delete_=True, + type='AmpOptimWrapper', + optimizer=dict(type='LARS', lr=6.4, weight_decay=0.0, momentum=0.9)) + +# learning rate scheduler +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1e-4, + by_epoch=True, + begin=0, + end=10, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=80, + by_epoch=True, + begin=10, + end=90, + eta_min=0.0, + convert_to_iter_based=True) +] + +# runtime settings +train_cfg = dict(by_epoch=True, max_epochs=90,val_interval=10) + +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=-1), + logger=dict(type='LoggerHook', interval=10)) + +randomness = dict(seed=0, diff_rank_seed=True) diff --git a/configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-structure.py b/configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-structure.py new file mode 100644 index 00000000000..ee505ed2794 
--- /dev/null +++ b/configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-structure.py @@ -0,0 +1,83 @@ +_base_ = [ + '../../_base_/datasets/omnibenchmarkv2_bs32_pil_resize.py', + '../../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../../_base_/default_runtime.py' +] + +# dataset settings +data_preprocessor = dict( + num_classes=346, +) + +train_dataloader = dict(batch_size=2048, + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/structure/meta/train.txt', + data_prefix='data/structure/images/'), + drop_last=True) +val_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/structure/meta/val.txt', + data_prefix='data/structure/images/'), + drop_last=False) +test_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/structure/meta/test.txt', + data_prefix='data/structure/images/'), + drop_last=False) + +# model settings +model = dict( + type='ImageClassifier', + backbone=dict( + type='VisionTransformer', + arch='base', + img_size=224, + patch_size=16, + frozen_stages=12, + out_type='cls_token', + final_norm=True, + init_cfg=dict(type='Pretrained', checkpoint='/mnt/petrelfs/zhangyuanhan/weights/mae_vit-base-p16_8xb512-fp16-coslr-1600e_in1k_20220825-f7569ca2.pth', prefix="backbone.")), + neck=dict(type='ClsBatchNormNeck', input_features=768), + head=dict( + type='VisionTransformerClsHead', + num_classes=346, + in_channels=768, + loss=dict(type='CrossEntropyLoss'), + init_cfg=[dict(type='TruncNormal', layer='Linear', std=0.01)])) + +# optimizer +optim_wrapper = dict( + _delete_=True, + type='AmpOptimWrapper', + optimizer=dict(type='LARS', lr=6.4, weight_decay=0.0, momentum=0.9)) + +# learning rate scheduler +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1e-4, + by_epoch=True, + begin=0, + end=10, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=80, + by_epoch=True, + begin=10, + end=90, + eta_min=0.0, + 
convert_to_iter_based=True) +] + +# runtime settings +train_cfg = dict(by_epoch=True, max_epochs=90,val_interval=10) + +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=-1), + logger=dict(type='LoggerHook', interval=10)) + +randomness = dict(seed=0, diff_rank_seed=True) diff --git a/configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-vascular_plant.py b/configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-vascular_plant.py new file mode 100644 index 00000000000..6cd135c271a --- /dev/null +++ b/configs/mae/benchmarks/vit-base-p16_4xb2048-linear-coslr-90e_omniv2-vascular_plant.py @@ -0,0 +1,83 @@ +_base_ = [ + '../../_base_/datasets/omnibenchmarkv2_bs32_pil_resize.py', + '../../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../../_base_/default_runtime.py' +] + +# dataset settings +data_preprocessor = dict( + num_classes=671, +) + +train_dataloader = dict(batch_size=2048, + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/vascular_plant/meta/train.txt', + data_prefix='data/vascular_plant/images/'), + drop_last=True) +val_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/vascular_plant/meta/val.txt', + data_prefix='data/vascular_plant/images/'), + drop_last=False) +test_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/vascular_plant/meta/test.txt', + data_prefix='data/vascular_plant/images/'), + drop_last=False) + +# model settings +model = dict( + type='ImageClassifier', + backbone=dict( + type='VisionTransformer', + arch='base', + img_size=224, + patch_size=16, + frozen_stages=12, + out_type='cls_token', + final_norm=True, + init_cfg=dict(type='Pretrained', checkpoint='/mnt/petrelfs/zhangyuanhan/weights/mae_vit-base-p16_8xb512-fp16-coslr-1600e_in1k_20220825-f7569ca2.pth', prefix="backbone.")), + neck=dict(type='ClsBatchNormNeck', input_features=768), + head=dict( + 
type='VisionTransformerClsHead', + num_classes=671, + in_channels=768, + loss=dict(type='CrossEntropyLoss'), + init_cfg=[dict(type='TruncNormal', layer='Linear', std=0.01)])) + +# optimizer +optim_wrapper = dict( + _delete_=True, + type='AmpOptimWrapper', + optimizer=dict(type='LARS', lr=6.4, weight_decay=0.0, momentum=0.9)) + +# learning rate scheduler +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1e-4, + by_epoch=True, + begin=0, + end=10, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=80, + by_epoch=True, + begin=10, + end=90, + eta_min=0.0, + convert_to_iter_based=True) +] + +# runtime settings +train_cfg = dict(by_epoch=True, max_epochs=90,val_interval=10) + +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=-1), + logger=dict(type='LoggerHook', interval=10)) + +randomness = dict(seed=0, diff_rank_seed=True) diff --git a/configs/mae/benchmarks/vit-base-p16_4xb512-linear-coslr-90e_omniv2-amphibian.py b/configs/mae/benchmarks/vit-base-p16_4xb512-linear-coslr-90e_omniv2-amphibian.py new file mode 100644 index 00000000000..362a2ae08d4 --- /dev/null +++ b/configs/mae/benchmarks/vit-base-p16_4xb512-linear-coslr-90e_omniv2-amphibian.py @@ -0,0 +1,83 @@ +_base_ = [ + '../../_base_/datasets/omnibenchmarkv2_bs32_pil_resize.py', + '../../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../../_base_/default_runtime.py' +] + +# dataset settings +data_preprocessor = dict( + num_classes=20, +) + +train_dataloader = dict(batch_size=512, + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/amphibian/meta/train.txt', + data_prefix='data/amphibian/images/'), + drop_last=True) +val_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/amphibian/meta/val.txt', + data_prefix='data/amphibian/images/'), + drop_last=False) +test_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/amphibian/meta/test.txt', + 
data_prefix='data/amphibian/images/'),
+    drop_last=False)
+
+# model settings
+model = dict(
+    type='ImageClassifier',
+    backbone=dict(
+        type='VisionTransformer',
+        arch='base',
+        img_size=224,
+        patch_size=16,
+        frozen_stages=12,
+        out_type='cls_token',
+        final_norm=True,
+        init_cfg=dict(type='Pretrained', checkpoint='/mnt/petrelfs/zhangyuanhan/weights/mae_vit-base-p16_8xb512-fp16-coslr-1600e_in1k_20220825-f7569ca2.pth', prefix="backbone.")),
+    neck=dict(type='ClsBatchNormNeck', input_features=768),
+    head=dict(
+        type='VisionTransformerClsHead',
+        num_classes=20,
+        in_channels=768,
+        loss=dict(type='CrossEntropyLoss'),
+        init_cfg=[dict(type='TruncNormal', layer='Linear', std=0.01)]))
+
+# optimizer
+optim_wrapper = dict(
+    _delete_=True,
+    type='AmpOptimWrapper',
+    optimizer=dict(type='LARS', lr=6.4, weight_decay=0.0, momentum=0.9))
+
+# learning rate scheduler
+param_scheduler = [
+    dict(
+        type='LinearLR',
+        start_factor=1e-4,
+        by_epoch=True,
+        begin=0,
+        end=10,
+        convert_to_iter_based=True),
+    dict(
+        type='CosineAnnealingLR',
+        T_max=80,
+        by_epoch=True,
+        begin=10,
+        end=90,
+        eta_min=0.0,
+        convert_to_iter_based=True)
+]
+
+# runtime settings
+train_cfg = dict(by_epoch=True, max_epochs=90,val_interval=10)
+
+default_hooks = dict(
+    checkpoint=dict(type='CheckpointHook', interval=-1),
+    logger=dict(type='LoggerHook', interval=10))
+
+randomness = dict(seed=0, diff_rank_seed=True)
diff --git a/configs/mae/benchmarks/vit-base-p16_4xb512-linear-coslr-90e_omniv2-aircraft.py b/configs/mae/benchmarks/vit-base-p16_4xb512-linear-coslr-90e_omniv2-aircraft.py
new file mode 100644
index 00000000000..05ea70c8dfc
--- /dev/null
+++ b/configs/mae/benchmarks/vit-base-p16_4xb512-linear-coslr-90e_omniv2-aircraft.py
@@ -0,0 +1,83 @@
+_base_ = [
+    '../../_base_/datasets/omnibenchmarkv2_bs32_pil_resize.py',
+    '../../_base_/schedules/imagenet_bs1024_adamw_swin.py',
+    '../../_base_/default_runtime.py'
+]
+
+# dataset settings
+data_preprocessor = dict(
+    num_classes=237,
+)
+
+train_dataloader = dict(batch_size=512,
+    dataset=dict(
+        data_root='data/omnibenchmarkv2/',
+        ann_file='annotation/aircraft/meta/train.txt',
+        data_prefix='data/aircraft/images/'),
+    drop_last=True)
+val_dataloader = dict(
+    dataset=dict(
+        data_root='data/omnibenchmarkv2/',
+        ann_file='annotation/aircraft/meta/val.txt',
+        data_prefix='data/aircraft/images/'),
+    drop_last=False)
+test_dataloader = dict(
+    dataset=dict(
+        data_root='data/omnibenchmarkv2/',
+        ann_file='annotation/aircraft/meta/test.txt',
+        data_prefix='data/aircraft/images/'),
+    drop_last=False)
+
+# model settings
+model = dict(
+    type='ImageClassifier',
+    backbone=dict(
+        type='VisionTransformer',
+        arch='base',
+        img_size=224,
+        patch_size=16,
+        frozen_stages=12,
+        out_type='cls_token',
+        final_norm=True,
+        init_cfg=dict(type='Pretrained', checkpoint='/mnt/petrelfs/zhangyuanhan/weights/mae_vit-base-p16_8xb512-fp16-coslr-1600e_in1k_20220825-f7569ca2.pth', prefix="backbone.")),
+    neck=dict(type='ClsBatchNormNeck', input_features=768),
+    head=dict(
+        type='VisionTransformerClsHead',
+        num_classes=237,
+        in_channels=768,
+        loss=dict(type='CrossEntropyLoss'),
+        init_cfg=[dict(type='TruncNormal', layer='Linear', std=0.01)]))
+
+# optimizer
+optim_wrapper = dict(
+    _delete_=True,
+    type='AmpOptimWrapper',
+    optimizer=dict(type='LARS', lr=6.4, weight_decay=0.0, momentum=0.9))
+
+# learning rate scheduler
+param_scheduler = [
+    dict(
+        type='LinearLR',
+        start_factor=1e-4,
+        by_epoch=True,
+        begin=0,
+        end=10,
+        convert_to_iter_based=True),
+    dict(
+        type='CosineAnnealingLR',
+        T_max=80,
+        by_epoch=True,
+        begin=10,
+        end=90,
+        eta_min=0.0,
+        convert_to_iter_based=True)
+]
+
+# runtime settings
+train_cfg = dict(by_epoch=True, max_epochs=90,val_interval=10)
+
+default_hooks = dict(
+    checkpoint=dict(type='CheckpointHook', interval=-1),
+    logger=dict(type='LoggerHook', interval=10))
+
+randomness = dict(seed=0, diff_rank_seed=True)
diff --git
a/configs/mae/benchmarks/vit-base-p16_4xb512-linear-coslr-90e_omniv2-aquatic_vertebrate.py b/configs/mae/benchmarks/vit-base-p16_4xb512-linear-coslr-90e_omniv2-aquatic_vertebrate.py new file mode 100644 index 00000000000..78bcac6f91a --- /dev/null +++ b/configs/mae/benchmarks/vit-base-p16_4xb512-linear-coslr-90e_omniv2-aquatic_vertebrate.py @@ -0,0 +1,83 @@ +_base_ = [ + '../../_base_/datasets/omnibenchmarkv2_bs32_pil_resize.py', + '../../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../../_base_/default_runtime.py' +] + +# dataset settings +data_preprocessor = dict( + num_classes=42, +) + +train_dataloader = dict(batch_size=512, + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/aquatic_vertebrate/meta/train.txt', + data_prefix='data/aquatic_vertebrate/images/'), + drop_last=True) +val_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/aquatic_vertebrate/meta/val.txt', + data_prefix='data/aquatic_vertebrate/images/'), + drop_last=False) +test_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/aquatic_vertebrate/meta/test.txt', + data_prefix='data/aquatic_vertebrate/images/'), + drop_last=False) + +# model settings +model = dict( + type='ImageClassifier', + backbone=dict( + type='VisionTransformer', + arch='base', + img_size=224, + patch_size=16, + frozen_stages=12, + out_type='cls_token', + final_norm=True, + init_cfg=dict(type='Pretrained', checkpoint='/mnt/petrelfs/zhangyuanhan/weights/mae_vit-base-p16_8xb512-fp16-coslr-1600e_in1k_20220825-f7569ca2.pth', prefix="backbone.")), + neck=dict(type='ClsBatchNormNeck', input_features=768), + head=dict( + type='VisionTransformerClsHead', + num_classes=42, + in_channels=768, + loss=dict(type='CrossEntropyLoss'), + init_cfg=[dict(type='TruncNormal', layer='Linear', std=0.01)])) + +# optimizer +optim_wrapper = dict( + _delete_=True, + type='AmpOptimWrapper', + optimizer=dict(type='LARS', lr=6.4, 
weight_decay=0.0, momentum=0.9)) + +# learning rate scheduler +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1e-4, + by_epoch=True, + begin=0, + end=10, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=80, + by_epoch=True, + begin=10, + end=90, + eta_min=0.0, + convert_to_iter_based=True) +] + +# runtime settings +train_cfg = dict(by_epoch=True, max_epochs=90,val_interval=10) + +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=-1), + logger=dict(type='LoggerHook', interval=10)) + +randomness = dict(seed=0, diff_rank_seed=True) diff --git a/configs/sam/vit-base-p16_sam-headless_omniv2-activity.py b/configs/sam/vit-base-p16_sam-headless_omniv2-activity.py new file mode 100644 index 00000000000..5b6d843937b --- /dev/null +++ b/configs/sam/vit-base-p16_sam-headless_omniv2-activity.py @@ -0,0 +1,90 @@ +_base_ = [ + '../_base_/datasets/omnibenchmarkv2_bs32_pil_resize.py', + '../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../_base_/default_runtime.py' +] + +# dataset settings +data_preprocessor = dict( + # RGB format normalization parameters + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + # convert image from BGR to RGB + to_rgb=True, +) + +train_dataloader = dict(batch_size=2048, + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/activity/meta/train.txt', + data_prefix='data/activity/images/'), + drop_last=True) +val_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/activity/meta/val.txt', + data_prefix='data/activity/images/'), + drop_last=False) +test_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/activity/meta/test.txt', + data_prefix='data/activity/images/'), + drop_last=False) + +# model settings +model = dict( + type='ImageClassifier', + backbone=dict( + type='ViTSAM', + arch='base', + img_size=224, + patch_size=16, + out_channels=0, + use_abs_pos=True, + 
use_rel_pos=True, + window_size=14, + frozen_stages=12, + out_type='avg_featmap', + init_cfg=dict(type='Pretrained', checkpoint='/mnt/petrelfs/zhangyuanhan/weights/vit-base-p16_sam-pre_3rdparty_sa1b-1024px_20230411-2320f9cc.pth', prefix="backbone.")), + neck=dict(type='ClsBatchNormNeck', input_features=768), + head=dict( + type='VisionTransformerClsHead', + num_classes=691, + in_channels=768, + loss=dict(type='CrossEntropyLoss'), + init_cfg=[dict(type='TruncNormal', layer='Linear', std=0.01)])) + +# optimizer +optim_wrapper = dict( + _delete_=True, + type='AmpOptimWrapper', + optimizer=dict(type='LARS', lr=6.4, weight_decay=0.0, momentum=0.9)) + +# learning rate scheduler +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1e-4, + by_epoch=True, + begin=0, + end=10, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=80, + by_epoch=True, + begin=10, + end=90, + eta_min=0.0, + convert_to_iter_based=True) +] + +# runtime settings +train_cfg = dict(by_epoch=True, max_epochs=90,val_interval=10) + +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=-1), + logger=dict(type='LoggerHook', interval=10)) + +randomness = dict(seed=0, diff_rank_seed=True) diff --git a/configs/sam/vit-base-p16_sam-headless_omniv2-aircraft.py b/configs/sam/vit-base-p16_sam-headless_omniv2-aircraft.py new file mode 100644 index 00000000000..c9fc97493a6 --- /dev/null +++ b/configs/sam/vit-base-p16_sam-headless_omniv2-aircraft.py @@ -0,0 +1,89 @@ +_base_ = [ + '../_base_/datasets/omnibenchmarkv2_bs32_pil_resize.py', + '../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../_base_/default_runtime.py' +] +# dataset settings +data_preprocessor = dict( + # RGB format normalization parameters + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + # convert image from BGR to RGB + to_rgb=True, +) + +train_dataloader = dict(batch_size=512, + dataset=dict( + data_root='data/omnibenchmarkv2/', + 
ann_file='annotation/aircraft/meta/train.txt', + data_prefix='data/aircraft/images/'), + drop_last=True) +val_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/aircraft/meta/val.txt', + data_prefix='data/aircraft/images/'), + drop_last=False) +test_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/aircraft/meta/test.txt', + data_prefix='data/aircraft/images/'), + drop_last=False) + +# model settings +model = dict( + type='ImageClassifier', + backbone=dict( + type='ViTSAM', + arch='base', + img_size=224, + patch_size=16, + out_channels=0, + use_abs_pos=True, + use_rel_pos=True, + window_size=14, + frozen_stages=12, + out_type='avg_featmap', + init_cfg=dict(type='Pretrained', checkpoint='/mnt/petrelfs/zhangyuanhan/weights/vit-base-p16_sam-pre_3rdparty_sa1b-1024px_20230411-2320f9cc.pth', prefix="backbone.")), + neck=dict(type='ClsBatchNormNeck', input_features=768), + head=dict( + type='VisionTransformerClsHead', + num_classes=237, + in_channels=768, + loss=dict(type='CrossEntropyLoss'), + init_cfg=[dict(type='TruncNormal', layer='Linear', std=0.01)])) + +# optimizer +optim_wrapper = dict( + _delete_=True, + type='AmpOptimWrapper', + optimizer=dict(type='LARS', lr=6.4, weight_decay=0.0, momentum=0.9)) + +# learning rate scheduler +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1e-4, + by_epoch=True, + begin=0, + end=10, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=80, + by_epoch=True, + begin=10, + end=90, + eta_min=0.0, + convert_to_iter_based=True) +] + +# runtime settings +train_cfg = dict(by_epoch=True, max_epochs=90,val_interval=10) + +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=-1), + logger=dict(type='LoggerHook', interval=10)) + +randomness = dict(seed=0, diff_rank_seed=True) diff --git a/configs/sam/vit-base-p16_sam-headless_omniv2-amphibian.py 
b/configs/sam/vit-base-p16_sam-headless_omniv2-amphibian.py new file mode 100644 index 00000000000..196d655575d --- /dev/null +++ b/configs/sam/vit-base-p16_sam-headless_omniv2-amphibian.py @@ -0,0 +1,89 @@ +_base_ = [ + '../_base_/datasets/omnibenchmarkv2_bs32_pil_resize.py', + '../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../_base_/default_runtime.py' +] +# dataset settings +data_preprocessor = dict( + # RGB format normalization parameters + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + # convert image from BGR to RGB + to_rgb=True, +) + +train_dataloader = dict(batch_size=512, + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/amphibian/meta/train.txt', + data_prefix='data/amphibian/images/'), + drop_last=True) +val_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/amphibian/meta/val.txt', + data_prefix='data/amphibian/images/'), + drop_last=False) +test_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/amphibian/meta/test.txt', + data_prefix='data/amphibian/images/'), + drop_last=False) + +# model settings +model = dict( + type='ImageClassifier', + backbone=dict( + type='ViTSAM', + arch='base', + img_size=224, + patch_size=16, + out_channels=0, + use_abs_pos=True, + use_rel_pos=True, + window_size=14, + frozen_stages=12, + out_type='avg_featmap', + init_cfg=dict(type='Pretrained', checkpoint='/mnt/petrelfs/zhangyuanhan/weights/vit-base-p16_sam-pre_3rdparty_sa1b-1024px_20230411-2320f9cc.pth', prefix="backbone.")), + neck=dict(type='ClsBatchNormNeck', input_features=768), + head=dict( + type='VisionTransformerClsHead', + num_classes=20, + in_channels=768, + loss=dict(type='CrossEntropyLoss'), + init_cfg=[dict(type='TruncNormal', layer='Linear', std=0.01)])) + +# optimizer +optim_wrapper = dict( + _delete_=True, + type='AmpOptimWrapper', + optimizer=dict(type='LARS', lr=6.4, weight_decay=0.0, momentum=0.9)) + +# learning 
rate scheduler +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1e-4, + by_epoch=True, + begin=0, + end=10, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=80, + by_epoch=True, + begin=10, + end=90, + eta_min=0.0, + convert_to_iter_based=True) +] + +# runtime settings +train_cfg = dict(by_epoch=True, max_epochs=90,val_interval=10) + +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=-1), + logger=dict(type='LoggerHook', interval=10)) + +randomness = dict(seed=0, diff_rank_seed=True) diff --git a/configs/sam/vit-base-p16_sam-headless_omniv2-aquatic_vertebrate.py b/configs/sam/vit-base-p16_sam-headless_omniv2-aquatic_vertebrate.py new file mode 100644 index 00000000000..9809cb7b417 --- /dev/null +++ b/configs/sam/vit-base-p16_sam-headless_omniv2-aquatic_vertebrate.py @@ -0,0 +1,90 @@ +_base_ = [ + '../_base_/datasets/omnibenchmarkv2_bs32_pil_resize.py', + '../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../_base_/default_runtime.py' +] + +# dataset settings +data_preprocessor = dict( + # RGB format normalization parameters + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + # convert image from BGR to RGB + to_rgb=True, +) + +train_dataloader = dict(batch_size=512, + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/aquatic_vertebrate/meta/train.txt', + data_prefix='data/aquatic_vertebrate/images/'), + drop_last=True) +val_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/aquatic_vertebrate/meta/val.txt', + data_prefix='data/aquatic_vertebrate/images/'), + drop_last=False) +test_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/aquatic_vertebrate/meta/test.txt', + data_prefix='data/aquatic_vertebrate/images/'), + drop_last=False) + +# model settings +model = dict( + type='ImageClassifier', + backbone=dict( + type='ViTSAM', + arch='base', + img_size=224, + patch_size=16, + 
out_channels=0, + use_abs_pos=True, + use_rel_pos=True, + window_size=14, + frozen_stages=12, + out_type='avg_featmap', + init_cfg=dict(type='Pretrained', checkpoint='/mnt/petrelfs/zhangyuanhan/weights/vit-base-p16_sam-pre_3rdparty_sa1b-1024px_20230411-2320f9cc.pth', prefix="backbone.")), + neck=dict(type='ClsBatchNormNeck', input_features=768), + head=dict( + type='VisionTransformerClsHead', + num_classes=42, + in_channels=768, + loss=dict(type='CrossEntropyLoss'), + init_cfg=[dict(type='TruncNormal', layer='Linear', std=0.01)])) + +# optimizer +optim_wrapper = dict( + _delete_=True, + type='AmpOptimWrapper', + optimizer=dict(type='LARS', lr=6.4, weight_decay=0.0, momentum=0.9)) + +# learning rate scheduler +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1e-4, + by_epoch=True, + begin=0, + end=10, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=80, + by_epoch=True, + begin=10, + end=90, + eta_min=0.0, + convert_to_iter_based=True) +] + +# runtime settings +train_cfg = dict(by_epoch=True, max_epochs=90,val_interval=10) + +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=-1), + logger=dict(type='LoggerHook', interval=10)) + +randomness = dict(seed=0, diff_rank_seed=True) diff --git a/configs/sam/vit-base-p16_sam-headless_omniv2-bird.py b/configs/sam/vit-base-p16_sam-headless_omniv2-bird.py new file mode 100644 index 00000000000..c998af46b8b --- /dev/null +++ b/configs/sam/vit-base-p16_sam-headless_omniv2-bird.py @@ -0,0 +1,90 @@ +_base_ = [ + '../_base_/datasets/omnibenchmarkv2_bs32_pil_resize.py', + '../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../_base_/default_runtime.py' +] + +# dataset settings +data_preprocessor = dict( + # RGB format normalization parameters + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + # convert image from BGR to RGB + to_rgb=True, +) + +train_dataloader = dict(batch_size=2048, + dataset=dict( + data_root='data/omnibenchmarkv2/', + 
ann_file='annotation/bird/meta/train.txt', + data_prefix='data/bird/images/'), + drop_last=True) +val_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/bird/meta/val.txt', + data_prefix='data/bird/images/'), + drop_last=False) +test_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/bird/meta/test.txt', + data_prefix='data/bird/images/'), + drop_last=False) + +# model settings +model = dict( + type='ImageClassifier', + backbone=dict( + type='ViTSAM', + arch='base', + img_size=224, + patch_size=16, + out_channels=0, + use_abs_pos=True, + use_rel_pos=True, + window_size=14, + frozen_stages=12, + out_type='avg_featmap', + init_cfg=dict(type='Pretrained', checkpoint='/mnt/petrelfs/zhangyuanhan/weights/vit-base-p16_sam-pre_3rdparty_sa1b-1024px_20230411-2320f9cc.pth', prefix="backbone.")), + neck=dict(type='ClsBatchNormNeck', input_features=768), + head=dict( + type='VisionTransformerClsHead', + num_classes=646, + in_channels=768, + loss=dict(type='CrossEntropyLoss'), + init_cfg=[dict(type='TruncNormal', layer='Linear', std=0.01)])) + +# optimizer +optim_wrapper = dict( + _delete_=True, + type='AmpOptimWrapper', + optimizer=dict(type='LARS', lr=6.4, weight_decay=0.0, momentum=0.9)) + +# learning rate scheduler +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1e-4, + by_epoch=True, + begin=0, + end=10, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=80, + by_epoch=True, + begin=10, + end=90, + eta_min=0.0, + convert_to_iter_based=True) +] + +# runtime settings +train_cfg = dict(by_epoch=True, max_epochs=90,val_interval=10) + +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=-1), + logger=dict(type='LoggerHook', interval=10)) + +randomness = dict(seed=0, diff_rank_seed=True) diff --git a/configs/sam/vit-base-p16_sam-headless_omniv2-car.py b/configs/sam/vit-base-p16_sam-headless_omniv2-car.py new file mode 100644 index 
00000000000..de72a71cea2 --- /dev/null +++ b/configs/sam/vit-base-p16_sam-headless_omniv2-car.py @@ -0,0 +1,89 @@ +_base_ = [ + '../_base_/datasets/omnibenchmarkv2_bs32_pil_resize.py', + '../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../_base_/default_runtime.py' +] +# dataset settings +data_preprocessor = dict( + # RGB format normalization parameters + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + # convert image from BGR to RGB + to_rgb=True, +) + +train_dataloader = dict(batch_size=2048, + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/car/meta/train.txt', + data_prefix='data/car/images/'), + drop_last=True) +val_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/car/meta/val.txt', + data_prefix='data/car/images/'), + drop_last=False) +test_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/car/meta/test.txt', + data_prefix='data/car/images/'), + drop_last=False) + +# model settings +model = dict( + type='ImageClassifier', + backbone=dict( + type='ViTSAM', + arch='base', + img_size=224, + patch_size=16, + out_channels=0, + use_abs_pos=True, + use_rel_pos=True, + window_size=14, + frozen_stages=12, + out_type='avg_featmap', + init_cfg=dict(type='Pretrained', checkpoint='/mnt/petrelfs/zhangyuanhan/weights/vit-base-p16_sam-pre_3rdparty_sa1b-1024px_20230411-2320f9cc.pth', prefix="backbone.")), + neck=dict(type='ClsBatchNormNeck', input_features=768), + head=dict( + type='VisionTransformerClsHead', + num_classes=767, + in_channels=768, + loss=dict(type='CrossEntropyLoss'), + init_cfg=[dict(type='TruncNormal', layer='Linear', std=0.01)])) + +# optimizer +optim_wrapper = dict( + _delete_=True, + type='AmpOptimWrapper', + optimizer=dict(type='LARS', lr=6.4, weight_decay=0.0, momentum=0.9)) + +# learning rate scheduler +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1e-4, + by_epoch=True, + begin=0, + end=10, + 
convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=80, + by_epoch=True, + begin=10, + end=90, + eta_min=0.0, + convert_to_iter_based=True) +] + +# runtime settings +train_cfg = dict(by_epoch=True, max_epochs=90,val_interval=10) + +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=-1), + logger=dict(type='LoggerHook', interval=10)) + +randomness = dict(seed=0, diff_rank_seed=True) diff --git a/configs/sam/vit-base-p16_sam-headless_omniv2-consumer_goods.py b/configs/sam/vit-base-p16_sam-headless_omniv2-consumer_goods.py new file mode 100644 index 00000000000..4868587fb8a --- /dev/null +++ b/configs/sam/vit-base-p16_sam-headless_omniv2-consumer_goods.py @@ -0,0 +1,90 @@ +_base_ = [ + '../_base_/datasets/omnibenchmarkv2_bs32_pil_resize.py', + '../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../_base_/default_runtime.py' +] + +# dataset settings +data_preprocessor = dict( + # RGB format normalization parameters + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + # convert image from BGR to RGB + to_rgb=True, +) + +train_dataloader = dict(batch_size=2048, + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/consumer_goods/meta/train.txt', + data_prefix='data/consumer_goods/images/'), + drop_last=True) +val_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/consumer_goods/meta/val.txt', + data_prefix='data/consumer_goods/images/'), + drop_last=False) +test_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/consumer_goods/meta/test.txt', + data_prefix='data/consumer_goods/images/'), + drop_last=False) + +# model settings +model = dict( + type='ImageClassifier', + backbone=dict( + type='ViTSAM', + arch='base', + img_size=224, + patch_size=16, + out_channels=0, + use_abs_pos=True, + use_rel_pos=True, + window_size=14, + frozen_stages=12, + out_type='avg_featmap', + init_cfg=dict(type='Pretrained', 
checkpoint='/mnt/petrelfs/zhangyuanhan/weights/vit-base-p16_sam-pre_3rdparty_sa1b-1024px_20230411-2320f9cc.pth', prefix="backbone.")), + neck=dict(type='ClsBatchNormNeck', input_features=768), + head=dict( + type='VisionTransformerClsHead', + num_classes=190, + in_channels=768, + loss=dict(type='CrossEntropyLoss'), + init_cfg=[dict(type='TruncNormal', layer='Linear', std=0.01)])) + +# optimizer +optim_wrapper = dict( + _delete_=True, + type='AmpOptimWrapper', + optimizer=dict(type='LARS', lr=6.4, weight_decay=0.0, momentum=0.9)) + +# learning rate scheduler +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1e-4, + by_epoch=True, + begin=0, + end=10, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=80, + by_epoch=True, + begin=10, + end=90, + eta_min=0.0, + convert_to_iter_based=True) +] + +# runtime settings +train_cfg = dict(by_epoch=True, max_epochs=90,val_interval=10) + +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=-1), + logger=dict(type='LoggerHook', interval=10)) + +randomness = dict(seed=0, diff_rank_seed=True) diff --git a/configs/sam/vit-base-p16_sam-headless_omniv2-creation.py b/configs/sam/vit-base-p16_sam-headless_omniv2-creation.py new file mode 100644 index 00000000000..b7a4a3dcf49 --- /dev/null +++ b/configs/sam/vit-base-p16_sam-headless_omniv2-creation.py @@ -0,0 +1,90 @@ +_base_ = [ + '../_base_/datasets/omnibenchmarkv2_bs32_pil_resize.py', + '../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../_base_/default_runtime.py' +] + +# dataset settings +data_preprocessor = dict( + # RGB format normalization parameters + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + # convert image from BGR to RGB + to_rgb=True, +) + +train_dataloader = dict(batch_size=2048, + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/creation/meta/train.txt', + data_prefix='data/creation/images/'), + drop_last=True) +val_dataloader = dict( + dataset=dict( + 
data_root='data/omnibenchmarkv2/', + ann_file='annotation/creation/meta/val.txt', + data_prefix='data/creation/images/'), + drop_last=False) +test_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/creation/meta/test.txt', + data_prefix='data/creation/images/'), + drop_last=False) + +# model settings +model = dict( + type='ImageClassifier', + backbone=dict( + type='ViTSAM', + arch='base', + img_size=224, + patch_size=16, + out_channels=0, + use_abs_pos=True, + use_rel_pos=True, + window_size=14, + frozen_stages=12, + out_type='avg_featmap', + init_cfg=dict(type='Pretrained', checkpoint='/mnt/petrelfs/zhangyuanhan/weights/vit-base-p16_sam-pre_3rdparty_sa1b-1024px_20230411-2320f9cc.pth', prefix="backbone.")), + neck=dict(type='ClsBatchNormNeck', input_features=768), + head=dict( + type='VisionTransformerClsHead', + num_classes=105, + in_channels=768, + loss=dict(type='CrossEntropyLoss'), + init_cfg=[dict(type='TruncNormal', layer='Linear', std=0.01)])) + +# optimizer +optim_wrapper = dict( + _delete_=True, + type='AmpOptimWrapper', + optimizer=dict(type='LARS', lr=6.4, weight_decay=0.0, momentum=0.9)) + +# learning rate scheduler +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1e-4, + by_epoch=True, + begin=0, + end=10, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=80, + by_epoch=True, + begin=10, + end=90, + eta_min=0.0, + convert_to_iter_based=True) +] + +# runtime settings +train_cfg = dict(by_epoch=True, max_epochs=90,val_interval=10) + +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=-1), + logger=dict(type='LoggerHook', interval=10)) + +randomness = dict(seed=0, diff_rank_seed=True) diff --git a/configs/sam/vit-base-p16_sam-headless_omniv2-decoration.py b/configs/sam/vit-base-p16_sam-headless_omniv2-decoration.py new file mode 100644 index 00000000000..237ab1de335 --- /dev/null +++ b/configs/sam/vit-base-p16_sam-headless_omniv2-decoration.py @@ -0,0 
+1,90 @@ +_base_ = [ + '../_base_/datasets/omnibenchmarkv2_bs32_pil_resize.py', + '../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../_base_/default_runtime.py' +] + +# dataset settings +data_preprocessor = dict( + # RGB format normalization parameters + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + # convert image from BGR to RGB + to_rgb=True, +) + +train_dataloader = dict(batch_size=2048, + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/decoration/meta/train.txt', + data_prefix='data/decoration/images/'), + drop_last=True) +val_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/decoration/meta/val.txt', + data_prefix='data/decoration/images/'), + drop_last=False) +test_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/decoration/meta/test.txt', + data_prefix='data/decoration/images/'), + drop_last=False) + +# model settings +model = dict( + type='ImageClassifier', + backbone=dict( + type='ViTSAM', + arch='base', + img_size=224, + patch_size=16, + out_channels=0, + use_abs_pos=True, + use_rel_pos=True, + window_size=14, + frozen_stages=12, + out_type='avg_featmap', + init_cfg=dict(type='Pretrained', checkpoint='/mnt/petrelfs/zhangyuanhan/weights/vit-base-p16_sam-pre_3rdparty_sa1b-1024px_20230411-2320f9cc.pth', prefix="backbone.")), + neck=dict(type='ClsBatchNormNeck', input_features=768), + head=dict( + type='VisionTransformerClsHead', + num_classes=44, + in_channels=768, + loss=dict(type='CrossEntropyLoss'), + init_cfg=[dict(type='TruncNormal', layer='Linear', std=0.01)])) + +# optimizer +optim_wrapper = dict( + _delete_=True, + type='AmpOptimWrapper', + optimizer=dict(type='LARS', lr=6.4, weight_decay=0.0, momentum=0.9)) + +# learning rate scheduler +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1e-4, + by_epoch=True, + begin=0, + end=10, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + 
T_max=80, + by_epoch=True, + begin=10, + end=90, + eta_min=0.0, + convert_to_iter_based=True) +] + +# runtime settings +train_cfg = dict(by_epoch=True, max_epochs=90,val_interval=10) + +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=-1), + logger=dict(type='LoggerHook', interval=10)) + +randomness = dict(seed=0, diff_rank_seed=True) diff --git a/configs/sam/vit-base-p16_sam-headless_omniv2-device.py b/configs/sam/vit-base-p16_sam-headless_omniv2-device.py new file mode 100644 index 00000000000..04fe2904b69 --- /dev/null +++ b/configs/sam/vit-base-p16_sam-headless_omniv2-device.py @@ -0,0 +1,90 @@ +_base_ = [ + '../_base_/datasets/omnibenchmarkv2_bs32_pil_resize.py', + '../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../_base_/default_runtime.py' +] + +# dataset settings +data_preprocessor = dict( + # RGB format normalization parameters + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + # convert image from BGR to RGB + to_rgb=True, +) + +train_dataloader = dict(batch_size=2048, + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/device/meta/train.txt', + data_prefix='data/device/images/'), + drop_last=True) +val_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/device/meta/val.txt', + data_prefix='data/device/images/'), + drop_last=False) +test_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/device/meta/test.txt', + data_prefix='data/device/images/'), + drop_last=False) + +# model settings +model = dict( + type='ImageClassifier', + backbone=dict( + type='ViTSAM', + arch='base', + img_size=224, + patch_size=16, + out_channels=0, + use_abs_pos=True, + use_rel_pos=True, + window_size=14, + frozen_stages=12, + out_type='avg_featmap', + init_cfg=dict(type='Pretrained', checkpoint='/mnt/petrelfs/zhangyuanhan/weights/vit-base-p16_sam-pre_3rdparty_sa1b-1024px_20230411-2320f9cc.pth', prefix="backbone.")), + 
neck=dict(type='ClsBatchNormNeck', input_features=768), + head=dict( + type='VisionTransformerClsHead', + num_classes=228, + in_channels=768, + loss=dict(type='CrossEntropyLoss'), + init_cfg=[dict(type='TruncNormal', layer='Linear', std=0.01)])) + +# optimizer +optim_wrapper = dict( + _delete_=True, + type='AmpOptimWrapper', + optimizer=dict(type='LARS', lr=6.4, weight_decay=0.0, momentum=0.9)) + +# learning rate scheduler +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1e-4, + by_epoch=True, + begin=0, + end=10, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=80, + by_epoch=True, + begin=10, + end=90, + eta_min=0.0, + convert_to_iter_based=True) +] + +# runtime settings +train_cfg = dict(by_epoch=True, max_epochs=90,val_interval=10) + +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=-1), + logger=dict(type='LoggerHook', interval=10)) + +randomness = dict(seed=0, diff_rank_seed=True) diff --git a/configs/sam/vit-base-p16_sam-headless_omniv2-food.py b/configs/sam/vit-base-p16_sam-headless_omniv2-food.py new file mode 100644 index 00000000000..0e6adf67708 --- /dev/null +++ b/configs/sam/vit-base-p16_sam-headless_omniv2-food.py @@ -0,0 +1,90 @@ +_base_ = [ + '../_base_/datasets/omnibenchmarkv2_bs32_pil_resize.py', + '../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../_base_/default_runtime.py' +] + +# dataset settings +data_preprocessor = dict( + # RGB format normalization parameters + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + # convert image from BGR to RGB + to_rgb=True, +) + +train_dataloader = dict(batch_size=2048, + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/food/meta/train.txt', + data_prefix='data/food/images/'), + drop_last=True) +val_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/food/meta/val.txt', + data_prefix='data/food/images/'), + drop_last=False) +test_dataloader = dict( + 
dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/food/meta/test.txt', + data_prefix='data/food/images/'), + drop_last=False) + +# model settings +model = dict( + type='ImageClassifier', + backbone=dict( + type='ViTSAM', + arch='base', + img_size=224, + patch_size=16, + out_channels=0, + use_abs_pos=True, + use_rel_pos=True, + window_size=14, + frozen_stages=12, + out_type='avg_featmap', + init_cfg=dict(type='Pretrained', checkpoint='/mnt/petrelfs/zhangyuanhan/weights/vit-base-p16_sam-pre_3rdparty_sa1b-1024px_20230411-2320f9cc.pth', prefix="backbone.")), + neck=dict(type='ClsBatchNormNeck', input_features=768), + head=dict( + type='VisionTransformerClsHead', + num_classes=891, + in_channels=768, + loss=dict(type='CrossEntropyLoss'), + init_cfg=[dict(type='TruncNormal', layer='Linear', std=0.01)])) + +# optimizer +optim_wrapper = dict( + _delete_=True, + type='AmpOptimWrapper', + optimizer=dict(type='LARS', lr=6.4, weight_decay=0.0, momentum=0.9)) + +# learning rate scheduler +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1e-4, + by_epoch=True, + begin=0, + end=10, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=80, + by_epoch=True, + begin=10, + end=90, + eta_min=0.0, + convert_to_iter_based=True) +] + +# runtime settings +train_cfg = dict(by_epoch=True, max_epochs=90,val_interval=10) + +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=-1), + logger=dict(type='LoggerHook', interval=10)) + +randomness = dict(seed=0, diff_rank_seed=True) diff --git a/configs/sam/vit-base-p16_sam-headless_omniv2-geological_formation.py b/configs/sam/vit-base-p16_sam-headless_omniv2-geological_formation.py new file mode 100644 index 00000000000..0b3b0f97404 --- /dev/null +++ b/configs/sam/vit-base-p16_sam-headless_omniv2-geological_formation.py @@ -0,0 +1,90 @@ +_base_ = [ + '../_base_/datasets/omnibenchmarkv2_bs32_pil_resize.py', + '../_base_/schedules/imagenet_bs1024_adamw_swin.py', + 
'../_base_/default_runtime.py' +] + +# dataset settings +data_preprocessor = dict( + # RGB format normalization parameters + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + # convert image from BGR to RGB + to_rgb=True, +) + +train_dataloader = dict(batch_size=2048, + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/geological_formation/meta/train.txt', + data_prefix='data/geological_formation/images/'), + drop_last=True) +val_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/geological_formation/meta/val.txt', + data_prefix='data/geological_formation/images/'), + drop_last=False) +test_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/geological_formation/meta/test.txt', + data_prefix='data/geological_formation/images/'), + drop_last=False) + +# model settings +model = dict( + type='ImageClassifier', + backbone=dict( + type='ViTSAM', + arch='base', + img_size=224, + patch_size=16, + out_channels=0, + use_abs_pos=True, + use_rel_pos=True, + window_size=14, + frozen_stages=12, + out_type='avg_featmap', + init_cfg=dict(type='Pretrained', checkpoint='/mnt/petrelfs/zhangyuanhan/weights/vit-base-p16_sam-pre_3rdparty_sa1b-1024px_20230411-2320f9cc.pth', prefix="backbone.")), + neck=dict(type='ClsBatchNormNeck', input_features=768), + head=dict( + type='VisionTransformerClsHead', + num_classes=110, + in_channels=768, + loss=dict(type='CrossEntropyLoss'), + init_cfg=[dict(type='TruncNormal', layer='Linear', std=0.01)])) + +# optimizer +optim_wrapper = dict( + _delete_=True, + type='AmpOptimWrapper', + optimizer=dict(type='LARS', lr=6.4, weight_decay=0.0, momentum=0.9)) + +# learning rate scheduler +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1e-4, + by_epoch=True, + begin=0, + end=10, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=80, + by_epoch=True, + begin=10, + end=90, + eta_min=0.0, + 
convert_to_iter_based=True) +] + +# runtime settings +train_cfg = dict(by_epoch=True, max_epochs=90,val_interval=10) + +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=-1), + logger=dict(type='LoggerHook', interval=10)) + +randomness = dict(seed=0, diff_rank_seed=True) diff --git a/configs/sam/vit-base-p16_sam-headless_omniv2-instrumentality.py b/configs/sam/vit-base-p16_sam-headless_omniv2-instrumentality.py new file mode 100644 index 00000000000..6d14f2d02f3 --- /dev/null +++ b/configs/sam/vit-base-p16_sam-headless_omniv2-instrumentality.py @@ -0,0 +1,90 @@ +_base_ = [ + '../_base_/datasets/omnibenchmarkv2_bs32_pil_resize.py', + '../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../_base_/default_runtime.py' +] + +# dataset settings +data_preprocessor = dict( + # RGB format normalization parameters + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + # convert image from BGR to RGB + to_rgb=True, +) + +train_dataloader = dict(batch_size=2048, + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/instrumentality/meta/train.txt', + data_prefix='data/instrumentality/images/'), + drop_last=True) +val_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/instrumentality/meta/val.txt', + data_prefix='data/instrumentality/images/'), + drop_last=False) +test_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/instrumentality/meta/test.txt', + data_prefix='data/instrumentality/images/'), + drop_last=False) + +# model settings +model = dict( + type='ImageClassifier', + backbone=dict( + type='ViTSAM', + arch='base', + img_size=224, + patch_size=16, + out_channels=0, + use_abs_pos=True, + use_rel_pos=True, + window_size=14, + frozen_stages=12, + out_type='avg_featmap', + init_cfg=dict(type='Pretrained', checkpoint='/mnt/petrelfs/zhangyuanhan/weights/vit-base-p16_sam-pre_3rdparty_sa1b-1024px_20230411-2320f9cc.pth', 
prefix="backbone.")), + neck=dict(type='ClsBatchNormNeck', input_features=768), + head=dict( + type='VisionTransformerClsHead', + num_classes=112, + in_channels=768, + loss=dict(type='CrossEntropyLoss'), + init_cfg=[dict(type='TruncNormal', layer='Linear', std=0.01)])) + +# optimizer +optim_wrapper = dict( + _delete_=True, + type='AmpOptimWrapper', + optimizer=dict(type='LARS', lr=6.4, weight_decay=0.0, momentum=0.9)) + +# learning rate scheduler +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1e-4, + by_epoch=True, + begin=0, + end=10, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=80, + by_epoch=True, + begin=10, + end=90, + eta_min=0.0, + convert_to_iter_based=True) +] + +# runtime settings +train_cfg = dict(by_epoch=True, max_epochs=90,val_interval=10) + +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=-1), + logger=dict(type='LoggerHook', interval=10)) + +randomness = dict(seed=0, diff_rank_seed=True) diff --git a/configs/sam/vit-base-p16_sam-headless_omniv2-locomotive.py b/configs/sam/vit-base-p16_sam-headless_omniv2-locomotive.py new file mode 100644 index 00000000000..e303998a204 --- /dev/null +++ b/configs/sam/vit-base-p16_sam-headless_omniv2-locomotive.py @@ -0,0 +1,90 @@ +_base_ = [ + '../_base_/datasets/omnibenchmarkv2_bs32_pil_resize.py', + '../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../_base_/default_runtime.py' +] + +# dataset settings +data_preprocessor = dict( + # RGB format normalization parameters + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + # convert image from BGR to RGB + to_rgb=True, +) + +train_dataloader = dict(batch_size=2048, + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/locomotive/meta/train.txt', + data_prefix='data/locomotive/images/'), + drop_last=True) +val_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/locomotive/meta/val.txt', + 
data_prefix='data/locomotive/images/'), + drop_last=False) +test_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/locomotive/meta/test.txt', + data_prefix='data/locomotive/images/'), + drop_last=False) + +# model settings +model = dict( + type='ImageClassifier', + backbone=dict( + type='ViTSAM', + arch='base', + img_size=224, + patch_size=16, + out_channels=0, + use_abs_pos=True, + use_rel_pos=True, + window_size=14, + frozen_stages=12, + out_type='avg_featmap', + init_cfg=dict(type='Pretrained', checkpoint='/mnt/petrelfs/zhangyuanhan/weights/vit-base-p16_sam-pre_3rdparty_sa1b-1024px_20230411-2320f9cc.pth', prefix="backbone.")), + neck=dict(type='ClsBatchNormNeck', input_features=768), + head=dict( + type='VisionTransformerClsHead', + num_classes=44, + in_channels=768, + loss=dict(type='CrossEntropyLoss'), + init_cfg=[dict(type='TruncNormal', layer='Linear', std=0.01)])) + +# optimizer +optim_wrapper = dict( + _delete_=True, + type='AmpOptimWrapper', + optimizer=dict(type='LARS', lr=6.4, weight_decay=0.0, momentum=0.9)) + +# learning rate scheduler +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1e-4, + by_epoch=True, + begin=0, + end=10, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=80, + by_epoch=True, + begin=10, + end=90, + eta_min=0.0, + convert_to_iter_based=True) +] + +# runtime settings +train_cfg = dict(by_epoch=True, max_epochs=90,val_interval=10) + +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=-1), + logger=dict(type='LoggerHook', interval=10)) + +randomness = dict(seed=0, diff_rank_seed=True) diff --git a/configs/sam/vit-base-p16_sam-headless_omniv2-mammal.py b/configs/sam/vit-base-p16_sam-headless_omniv2-mammal.py new file mode 100644 index 00000000000..1e554718608 --- /dev/null +++ b/configs/sam/vit-base-p16_sam-headless_omniv2-mammal.py @@ -0,0 +1,90 @@ +_base_ = [ + '../_base_/datasets/omnibenchmarkv2_bs32_pil_resize.py', + 
'../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../_base_/default_runtime.py' +] + +# dataset settings +data_preprocessor = dict( + # RGB format normalization parameters + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + # convert image from BGR to RGB + to_rgb=True, +) + +train_dataloader = dict(batch_size=2048, + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/mammal/meta/train.txt', + data_prefix='data/mammal/images/'), + drop_last=True) +val_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/mammal/meta/val.txt', + data_prefix='data/mammal/images/'), + drop_last=False) +test_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/mammal/meta/test.txt', + data_prefix='data/mammal/images/'), + drop_last=False) + +# model settings +model = dict( + type='ImageClassifier', + backbone=dict( + type='ViTSAM', + arch='base', + img_size=224, + patch_size=16, + out_channels=0, + use_abs_pos=True, + use_rel_pos=True, + window_size=14, + frozen_stages=12, + out_type='avg_featmap', + init_cfg=dict(type='Pretrained', checkpoint='/mnt/petrelfs/zhangyuanhan/weights/vit-base-p16_sam-pre_3rdparty_sa1b-1024px_20230411-2320f9cc.pth', prefix="backbone.")), + neck=dict(type='ClsBatchNormNeck', input_features=768), + head=dict( + type='VisionTransformerClsHead', + num_classes=384, + in_channels=768, + loss=dict(type='CrossEntropyLoss'), + init_cfg=[dict(type='TruncNormal', layer='Linear', std=0.01)])) + +# optimizer +optim_wrapper = dict( + _delete_=True, + type='AmpOptimWrapper', + optimizer=dict(type='LARS', lr=6.4, weight_decay=0.0, momentum=0.9)) + +# learning rate scheduler +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1e-4, + by_epoch=True, + begin=0, + end=10, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=80, + by_epoch=True, + begin=10, + end=90, + eta_min=0.0, + convert_to_iter_based=True) +] + +# 
runtime settings +train_cfg = dict(by_epoch=True, max_epochs=90, val_interval=10) + +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=-1), + logger=dict(type='LoggerHook', interval=10)) + +randomness = dict(seed=0, diff_rank_seed=True) diff --git a/configs/sam/vit-base-p16_sam-headless_omniv2-material.py b/configs/sam/vit-base-p16_sam-headless_omniv2-material.py new file mode 100644 index 00000000000..44e067dba02 --- /dev/null +++ b/configs/sam/vit-base-p16_sam-headless_omniv2-material.py @@ -0,0 +1,90 @@ +_base_ = [ + '../_base_/datasets/omnibenchmarkv2_bs32_pil_resize.py', + '../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../_base_/default_runtime.py' +] + +# dataset settings +data_preprocessor = dict( + # RGB format normalization parameters + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + # convert image from BGR to RGB + to_rgb=True, +) + +train_dataloader = dict(batch_size=2048, + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/material/meta/train.txt', + data_prefix='data/material/images/'), + drop_last=True) +val_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/material/meta/val.txt', + data_prefix='data/material/images/'), + drop_last=False) +test_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/material/meta/test.txt', + data_prefix='data/material/images/'), + drop_last=False) + +# model settings +model = dict( + type='ImageClassifier', + backbone=dict( + type='ViTSAM', + arch='base', + img_size=224, + patch_size=16, + out_channels=0, + use_abs_pos=True, + use_rel_pos=True, + window_size=14, + frozen_stages=12, + out_type='avg_featmap', + init_cfg=dict(type='Pretrained', checkpoint='https://download.openmmlab.com/mmclassification/v1/vit_sam/vit-base-p16_sam-pre_3rdparty_sa1b-1024px_20230411-2320f9cc.pth', prefix='backbone.')), + neck=dict(type='ClsBatchNormNeck', input_features=768), + head=dict( + 
type='VisionTransformerClsHead', + num_classes=97, + in_channels=768, + loss=dict(type='CrossEntropyLoss'), + init_cfg=[dict(type='TruncNormal', layer='Linear', std=0.01)])) + +# optimizer +optim_wrapper = dict( + _delete_=True, + type='AmpOptimWrapper', + optimizer=dict(type='LARS', lr=6.4, weight_decay=0.0, momentum=0.9)) + +# learning rate scheduler +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1e-4, + by_epoch=True, + begin=0, + end=10, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=80, + by_epoch=True, + begin=10, + end=90, + eta_min=0.0, + convert_to_iter_based=True) +] + +# runtime settings +train_cfg = dict(by_epoch=True, max_epochs=90, val_interval=10) + +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=-1), + logger=dict(type='LoggerHook', interval=10)) + +randomness = dict(seed=0, diff_rank_seed=True) diff --git a/configs/sam/vit-base-p16_sam-headless_omniv2-military_vehicle.py b/configs/sam/vit-base-p16_sam-headless_omniv2-military_vehicle.py new file mode 100644 index 00000000000..e8d81537161 --- /dev/null +++ b/configs/sam/vit-base-p16_sam-headless_omniv2-military_vehicle.py @@ -0,0 +1,90 @@ +_base_ = [ + '../_base_/datasets/omnibenchmarkv2_bs32_pil_resize.py', + '../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../_base_/default_runtime.py' +] + +# dataset settings +data_preprocessor = dict( + # RGB format normalization parameters + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + # convert image from BGR to RGB + to_rgb=True, +) + +train_dataloader = dict(batch_size=2048, + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/military_vehicle/meta/train.txt', + data_prefix='data/military_vehicle/images/'), + drop_last=True) +val_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/military_vehicle/meta/val.txt', + data_prefix='data/military_vehicle/images/'), + drop_last=False) +test_dataloader = 
dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/military_vehicle/meta/test.txt', + data_prefix='data/military_vehicle/images/'), + drop_last=False) + +# model settings +model = dict( + type='ImageClassifier', + backbone=dict( + type='ViTSAM', + arch='base', + img_size=224, + patch_size=16, + out_channels=0, + use_abs_pos=True, + use_rel_pos=True, + window_size=14, + frozen_stages=12, + out_type='avg_featmap', + init_cfg=dict(type='Pretrained', checkpoint='https://download.openmmlab.com/mmclassification/v1/vit_sam/vit-base-p16_sam-pre_3rdparty_sa1b-1024px_20230411-2320f9cc.pth', prefix='backbone.')), + neck=dict(type='ClsBatchNormNeck', input_features=768), + head=dict( + type='VisionTransformerClsHead', + num_classes=72, + in_channels=768, + loss=dict(type='CrossEntropyLoss'), + init_cfg=[dict(type='TruncNormal', layer='Linear', std=0.01)])) + +# optimizer +optim_wrapper = dict( + _delete_=True, + type='AmpOptimWrapper', + optimizer=dict(type='LARS', lr=6.4, weight_decay=0.0, momentum=0.9)) + +# learning rate scheduler +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1e-4, + by_epoch=True, + begin=0, + end=10, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=80, + by_epoch=True, + begin=10, + end=90, + eta_min=0.0, + convert_to_iter_based=True) +] + +# runtime settings +train_cfg = dict(by_epoch=True, max_epochs=90, val_interval=10) + +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=-1), + logger=dict(type='LoggerHook', interval=10)) + +randomness = dict(seed=0, diff_rank_seed=True) diff --git a/configs/sam/vit-base-p16_sam-headless_omniv2-process.py b/configs/sam/vit-base-p16_sam-headless_omniv2-process.py new file mode 100644 index 00000000000..5240deed4db --- /dev/null +++ b/configs/sam/vit-base-p16_sam-headless_omniv2-process.py @@ -0,0 +1,90 @@ +_base_ = [ + '../_base_/datasets/omnibenchmarkv2_bs32_pil_resize.py', + '../_base_/schedules/imagenet_bs1024_adamw_swin.py', + 
'../_base_/default_runtime.py' +] + +# dataset settings +data_preprocessor = dict( + # RGB format normalization parameters + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + # convert image from BGR to RGB + to_rgb=True, +) + +train_dataloader = dict(batch_size=2048, + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/process/meta/train.txt', + data_prefix='data/process/images/'), + drop_last=True) +val_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/process/meta/val.txt', + data_prefix='data/process/images/'), + drop_last=False) +test_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/process/meta/test.txt', + data_prefix='data/process/images/'), + drop_last=False) + +# model settings +model = dict( + type='ImageClassifier', + backbone=dict( + type='ViTSAM', + arch='base', + img_size=224, + patch_size=16, + out_channels=0, + use_abs_pos=True, + use_rel_pos=True, + window_size=14, + frozen_stages=12, + out_type='avg_featmap', + init_cfg=dict(type='Pretrained', checkpoint='https://download.openmmlab.com/mmclassification/v1/vit_sam/vit-base-p16_sam-pre_3rdparty_sa1b-1024px_20230411-2320f9cc.pth', prefix='backbone.')), + neck=dict(type='ClsBatchNormNeck', input_features=768), + head=dict( + type='VisionTransformerClsHead', + num_classes=46, + in_channels=768, + loss=dict(type='CrossEntropyLoss'), + init_cfg=[dict(type='TruncNormal', layer='Linear', std=0.01)])) + +# optimizer +optim_wrapper = dict( + _delete_=True, + type='AmpOptimWrapper', + optimizer=dict(type='LARS', lr=6.4, weight_decay=0.0, momentum=0.9)) + +# learning rate scheduler +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1e-4, + by_epoch=True, + begin=0, + end=10, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=80, + by_epoch=True, + begin=10, + end=90, + eta_min=0.0, + convert_to_iter_based=True) +] + +# runtime settings +train_cfg = dict(by_epoch=True, 
max_epochs=90, val_interval=10) + +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=-1), + logger=dict(type='LoggerHook', interval=10)) + +randomness = dict(seed=0, diff_rank_seed=True) diff --git a/configs/sam/vit-base-p16_sam-headless_omniv2-region.py b/configs/sam/vit-base-p16_sam-headless_omniv2-region.py new file mode 100644 index 00000000000..3823aeced2e --- /dev/null +++ b/configs/sam/vit-base-p16_sam-headless_omniv2-region.py @@ -0,0 +1,90 @@ +_base_ = [ + '../_base_/datasets/omnibenchmarkv2_bs32_pil_resize.py', + '../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../_base_/default_runtime.py' +] + +# dataset settings +data_preprocessor = dict( + # RGB format normalization parameters + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + # convert image from BGR to RGB + to_rgb=True, +) + +train_dataloader = dict(batch_size=2048, + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/region/meta/train.txt', + data_prefix='data/region/images/'), + drop_last=True) +val_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/region/meta/val.txt', + data_prefix='data/region/images/'), + drop_last=False) +test_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/region/meta/test.txt', + data_prefix='data/region/images/'), + drop_last=False) + +# model settings +model = dict( + type='ImageClassifier', + backbone=dict( + type='ViTSAM', + arch='base', + img_size=224, + patch_size=16, + out_channels=0, + use_abs_pos=True, + use_rel_pos=True, + window_size=14, + frozen_stages=12, + out_type='avg_featmap', + init_cfg=dict(type='Pretrained', checkpoint='https://download.openmmlab.com/mmclassification/v1/vit_sam/vit-base-p16_sam-pre_3rdparty_sa1b-1024px_20230411-2320f9cc.pth', prefix='backbone.')), + neck=dict(type='ClsBatchNormNeck', input_features=768), + head=dict( + type='VisionTransformerClsHead', + num_classes=217, + in_channels=768, + 
loss=dict(type='CrossEntropyLoss'), + init_cfg=[dict(type='TruncNormal', layer='Linear', std=0.01)])) + +# optimizer +optim_wrapper = dict( + _delete_=True, + type='AmpOptimWrapper', + optimizer=dict(type='LARS', lr=6.4, weight_decay=0.0, momentum=0.9)) + +# learning rate scheduler +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1e-4, + by_epoch=True, + begin=0, + end=10, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=80, + by_epoch=True, + begin=10, + end=90, + eta_min=0.0, + convert_to_iter_based=True) +] + +# runtime settings +train_cfg = dict(by_epoch=True, max_epochs=90, val_interval=10) + +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=-1), + logger=dict(type='LoggerHook', interval=10)) + +randomness = dict(seed=0, diff_rank_seed=True) diff --git a/configs/sam/vit-base-p16_sam-headless_omniv2-structure.py b/configs/sam/vit-base-p16_sam-headless_omniv2-structure.py new file mode 100644 index 00000000000..448e8d9677d --- /dev/null +++ b/configs/sam/vit-base-p16_sam-headless_omniv2-structure.py @@ -0,0 +1,90 @@ +_base_ = [ + '../_base_/datasets/omnibenchmarkv2_bs32_pil_resize.py', + '../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../_base_/default_runtime.py' +] + +# dataset settings +data_preprocessor = dict( + # RGB format normalization parameters + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + # convert image from BGR to RGB + to_rgb=True, +) + +train_dataloader = dict(batch_size=2048, + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/structure/meta/train.txt', + data_prefix='data/structure/images/'), + drop_last=True) +val_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/structure/meta/val.txt', + data_prefix='data/structure/images/'), + drop_last=False) +test_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/structure/meta/test.txt', + 
data_prefix='data/structure/images/'), + drop_last=False) + +# model settings +model = dict( + type='ImageClassifier', + backbone=dict( + type='ViTSAM', + arch='base', + img_size=224, + patch_size=16, + out_channels=0, + use_abs_pos=True, + use_rel_pos=True, + window_size=14, + frozen_stages=12, + out_type='avg_featmap', + init_cfg=dict(type='Pretrained', checkpoint='https://download.openmmlab.com/mmclassification/v1/vit_sam/vit-base-p16_sam-pre_3rdparty_sa1b-1024px_20230411-2320f9cc.pth', prefix='backbone.')), + neck=dict(type='ClsBatchNormNeck', input_features=768), + head=dict( + type='VisionTransformerClsHead', + num_classes=346, + in_channels=768, + loss=dict(type='CrossEntropyLoss'), + init_cfg=[dict(type='TruncNormal', layer='Linear', std=0.01)])) + +# optimizer +optim_wrapper = dict( + _delete_=True, + type='AmpOptimWrapper', + optimizer=dict(type='LARS', lr=6.4, weight_decay=0.0, momentum=0.9)) + +# learning rate scheduler +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1e-4, + by_epoch=True, + begin=0, + end=10, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=80, + by_epoch=True, + begin=10, + end=90, + eta_min=0.0, + convert_to_iter_based=True) +] + +# runtime settings +train_cfg = dict(by_epoch=True, max_epochs=90, val_interval=10) + +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', interval=-1), + logger=dict(type='LoggerHook', interval=10)) + +randomness = dict(seed=0, diff_rank_seed=True) diff --git a/configs/sam/vit-base-p16_sam-headless_omniv2-vascular_plant.py b/configs/sam/vit-base-p16_sam-headless_omniv2-vascular_plant.py new file mode 100644 index 00000000000..0383ac9d9ba --- /dev/null +++ b/configs/sam/vit-base-p16_sam-headless_omniv2-vascular_plant.py @@ -0,0 +1,90 @@ +_base_ = [ + '../_base_/datasets/omnibenchmarkv2_bs32_pil_resize.py', + '../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../_base_/default_runtime.py' +] + +# dataset settings +data_preprocessor = dict( + # RGB format normalization 
parameters + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + # convert image from BGR to RGB + to_rgb=True, +) + +train_dataloader = dict(batch_size=2048, + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/vascular_plant/meta/train.txt', + data_prefix='data/vascular_plant/images/'), + drop_last=True) +val_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/vascular_plant/meta/val.txt', + data_prefix='data/vascular_plant/images/'), + drop_last=False) +test_dataloader = dict( + dataset=dict( + data_root='data/omnibenchmarkv2/', + ann_file='annotation/vascular_plant/meta/test.txt', + data_prefix='data/vascular_plant/images/'), + drop_last=False) + +# model settings +model = dict( + type='ImageClassifier', + backbone=dict( + type='ViTSAM', + arch='base', + img_size=224, + patch_size=16, + out_channels=0, + use_abs_pos=True, + use_rel_pos=True, + window_size=14, + frozen_stages=12, + out_type='avg_featmap', + init_cfg=dict(type='Pretrained', checkpoint='https://download.openmmlab.com/mmclassification/v1/vit_sam/vit-base-p16_sam-pre_3rdparty_sa1b-1024px_20230411-2320f9cc.pth', prefix='backbone.')), + neck=dict(type='ClsBatchNormNeck', input_features=768), + head=dict( + type='VisionTransformerClsHead', + num_classes=671, + in_channels=768, + loss=dict(type='CrossEntropyLoss'), + init_cfg=[dict(type='TruncNormal', layer='Linear', std=0.01)])) + +# optimizer +optim_wrapper = dict( + _delete_=True, + type='AmpOptimWrapper', + optimizer=dict(type='LARS', lr=6.4, weight_decay=0.0, momentum=0.9)) + +# learning rate scheduler +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1e-4, + by_epoch=True, + begin=0, + end=10, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=80, + by_epoch=True, + begin=10, + end=90, + eta_min=0.0, + convert_to_iter_based=True) +] + +# runtime settings +train_cfg = dict(by_epoch=True, max_epochs=90, val_interval=10) + +default_hooks = dict( + 
checkpoint=dict(type='CheckpointHook', interval=-1), + logger=dict(type='LoggerHook', interval=10)) + +randomness = dict(seed=0, diff_rank_seed=True)