Skip to content

Commit 7d4ada6

Browse files
committed
Update ViTamin model defs
1 parent cc8a03d commit 7d4ada6

File tree

1 file changed

+114
-34
lines changed

1 file changed

+114
-34
lines changed

timm/models/vitamin.py

Lines changed: 114 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -308,33 +308,35 @@ def _cfg(url='', **kwargs):
308308

309309

310310
default_cfgs = generate_default_cfgs({
311-
'vitamin_small.datacomp1b_clip_ltt': _cfg(
311+
'vitamin_small_224.datacomp1b_clip_ltt': _cfg(
312312
hf_hub_id='jienengchen/ViTamin-S-LTT', num_classes=384),
313-
'vitamin_small.datacomp1b_clip': _cfg(
313+
'vitamin_small_224.datacomp1b_clip': _cfg(
314314
hf_hub_id='jienengchen/ViTamin-S', num_classes=384),
315-
'vitamin_base.datacomp1b_clip_ltt': _cfg(
315+
'vitamin_base_224.datacomp1b_clip_ltt': _cfg(
316316
hf_hub_id='jienengchen/ViTamin-B-LTT', num_classes=768),
317-
'vitamin_base.datacomp1b_clip': _cfg(
317+
'vitamin_base_224.datacomp1b_clip': _cfg(
318318
hf_hub_id='jienengchen/ViTamin-B', num_classes=768),
319-
'vitamin_large.datacomp1b_clip': _cfg(
320-
hf_hub_id='jienengchen/ViTamin-L-224px', num_classes=1024),
321-
'vitamin_large_256.datacomp1b_clip_l2': _cfg(
322-
hf_hub_id='jienengchen/ViTamin-L2-256px', num_classes=1024,
323-
input_size=(3, 256, 256), crop_pct=1.0),
319+
'vitamin_large_224.datacomp1b_clip': _cfg(
320+
hf_hub_id='jienengchen/ViTamin-L-224px', num_classes=768),
324321
'vitamin_large_256.datacomp1b_clip': _cfg(
325-
hf_hub_id='jienengchen/ViTamin-L-256px', num_classes=1024,
322+
hf_hub_id='jienengchen/ViTamin-L-256px', num_classes=768,
326323
input_size=(3, 256, 256), crop_pct=1.0),
327-
'vitamin_large_336.datacomp1b_clip_l2': _cfg(
328-
hf_hub_id='jienengchen/ViTamin-L2-336px', num_classes=1024,
329-
input_size=(3, 336, 336), crop_pct=1.0),
330324
'vitamin_large_336.datacomp1b_clip': _cfg(
331-
hf_hub_id='jienengchen/ViTamin-L-336px', num_classes=1024,
325+
hf_hub_id='jienengchen/ViTamin-L-336px', num_classes=768,
332326
input_size=(3, 336, 336), crop_pct=1.0),
333-
'vitamin_large_384.datacomp1b_clip_l2': _cfg(
334-
hf_hub_id='jienengchen/ViTamin-L2-384px', num_classes=1024,
335-
input_size=(3, 384, 384), crop_pct=1.0),
336327
'vitamin_large_384.datacomp1b_clip': _cfg(
337-
hf_hub_id='jienengchen/ViTamin-L-384px', num_classes=1024,
328+
hf_hub_id='jienengchen/ViTamin-L-384px', num_classes=768,
329+
input_size=(3, 384, 384), crop_pct=1.0),
330+
'vitamin_large2_224.datacomp1b_clip': _cfg(
331+
hf_hub_id='jienengchen/ViTamin-L2-224px', num_classes=1024),
332+
'vitamin_large2_256.datacomp1b_clip': _cfg(
333+
hf_hub_id='jienengchen/ViTamin-L2-256px', num_classes=1024,
334+
input_size=(3, 256, 256), crop_pct=1.0),
335+
'vitamin_large2_336.datacomp1b_clip': _cfg(
336+
hf_hub_id='jienengchen/ViTamin-L2-336px', num_classes=1024,
337+
input_size=(3, 336, 336), crop_pct=1.0),
338+
'vitamin_large2_384.datacomp1b_clip': _cfg(
339+
hf_hub_id='jienengchen/ViTamin-L2-384px', num_classes=1024,
338340
input_size=(3, 384, 384), crop_pct=1.0),
339341
'vitamin_xlarge_256.datacomp1b_clip': _cfg(
340342
hf_hub_id='jienengchen/ViTamin-XL-256px', num_classes=1152,
@@ -349,12 +351,12 @@ def _cfg(url='', **kwargs):
349351

350352

351353
@register_model
352-
def vitamin_small(pretrained=False, **kwargs) -> VisionTransformer:
354+
def vitamin_small_224(pretrained=False, **kwargs) -> VisionTransformer:
353355
embed_cfg = VitCfg(
354356
embed_dim=(64, 128, 384),
355357
depths=(2, 4, 1),
356358
stem_width=64,
357-
conv_cfg = VitConvCfg(
359+
conv_cfg=VitConvCfg(
358360
norm_layer='layernorm2d',
359361
norm_eps=1e-6,
360362
),
@@ -364,17 +366,17 @@ def vitamin_small(pretrained=False, **kwargs) -> VisionTransformer:
364366
embed_dim=384, depth=14, num_heads=6, mlp_layer=GeGluMlp, mlp_ratio=2.,
365367
class_token=False, global_pool='avg', embed_cfg=embed_cfg
366368
)
367-
model = _create_vitamin('vitamin_small', pretrained=pretrained, **dict(model_args, **kwargs))
369+
model = _create_vitamin('vitamin_small_224', pretrained=pretrained, **dict(model_args, **kwargs))
368370
return model
369371

370372

371373
@register_model
372-
def vitamin_base(pretrained=False, **kwargs) -> VisionTransformer:
374+
def vitamin_base_224(pretrained=False, **kwargs) -> VisionTransformer:
373375
embed_cfg = VitCfg(
374376
embed_dim=(128, 256, 768),
375377
depths=(2, 4, 1),
376378
stem_width=128,
377-
conv_cfg = VitConvCfg(
379+
conv_cfg=VitConvCfg(
378380
norm_layer='layernorm2d',
379381
norm_eps=1e-6,
380382
),
@@ -383,17 +385,17 @@ def vitamin_base(pretrained=False, **kwargs) -> VisionTransformer:
383385
model_args = dict(
384386
embed_dim=768, depth=14, num_heads=12, mlp_layer=GeGluMlp, mlp_ratio=2.,
385387
class_token=False, global_pool='avg', embed_cfg=embed_cfg)
386-
model = _create_vitamin('vitamin_base', pretrained=pretrained, **dict(model_args, **kwargs))
388+
model = _create_vitamin('vitamin_base_224', pretrained=pretrained, **dict(model_args, **kwargs))
387389
return model
388390

389391

390392
@register_model
391-
def vitamin_large(pretrained=False, **kwargs) -> VisionTransformer:
393+
def vitamin_large_224(pretrained=False, **kwargs) -> VisionTransformer:
392394
embed_cfg = VitCfg(
393395
embed_dim=(160, 320, 1024),
394396
depths=(2, 4, 1),
395397
stem_width=160,
396-
conv_cfg = VitConvCfg(
398+
conv_cfg=VitConvCfg(
397399
norm_layer='layernorm2d',
398400
norm_eps=1e-6,
399401
),
@@ -403,7 +405,7 @@ def vitamin_large(pretrained=False, **kwargs) -> VisionTransformer:
403405
embed_dim=1024, depth=31, num_heads=16, mlp_layer=GeGluMlp, mlp_ratio=2.,
404406
class_token=False, global_pool='avg', embed_cfg=embed_cfg,
405407
)
406-
model = _create_vitamin('vitamin_large', pretrained=pretrained, **dict(model_args, **kwargs))
408+
model = _create_vitamin('vitamin_large_224', pretrained=pretrained, **dict(model_args, **kwargs))
407409
return model
408410

409411

@@ -413,7 +415,7 @@ def vitamin_large_256(pretrained=False, **kwargs) -> VisionTransformer:
413415
embed_dim=(160, 320, 1024),
414416
depths=(2, 4, 1),
415417
stem_width=160,
416-
conv_cfg = VitConvCfg(
418+
conv_cfg=VitConvCfg(
417419
norm_layer='layernorm2d',
418420
norm_eps=1e-6,
419421
),
@@ -432,7 +434,7 @@ def vitamin_large_336(pretrained=False, **kwargs) -> VisionTransformer:
432434
embed_dim=(160, 320, 1024),
433435
depths=(2, 4, 1),
434436
stem_width=160,
435-
conv_cfg = VitConvCfg(
437+
conv_cfg=VitConvCfg(
436438
norm_layer='layernorm2d',
437439
norm_eps=1e-6,
438440
),
@@ -452,7 +454,7 @@ def vitamin_large_384(pretrained=False, **kwargs) -> VisionTransformer:
452454
embed_dim=(160, 320, 1024),
453455
depths=(2, 4, 1),
454456
stem_width=160,
455-
conv_cfg = VitConvCfg(
457+
conv_cfg=VitConvCfg(
456458
norm_layer='layernorm2d',
457459
norm_eps=1e-6,
458460
),
@@ -465,13 +467,91 @@ def vitamin_large_384(pretrained=False, **kwargs) -> VisionTransformer:
465467
return model
466468

467469

470+
@register_model
471+
def vitamin_large2_224(pretrained=False, **kwargs) -> VisionTransformer:
472+
embed_cfg = VitCfg(
473+
embed_dim=(160, 320, 1024),
474+
depths=(2, 4, 1),
475+
stem_width=160,
476+
conv_cfg=VitConvCfg(
477+
norm_layer='layernorm2d',
478+
norm_eps=1e-6,
479+
),
480+
head_type='1d',
481+
)
482+
model_args = dict(
483+
embed_dim=1024, depth=31, num_heads=16, mlp_layer=GeGluMlp, mlp_ratio=2.,
484+
class_token=False, global_pool='avg', embed_cfg=embed_cfg,
485+
)
486+
model = _create_vitamin('vitamin_large2_224', pretrained=pretrained, **dict(model_args, **kwargs))
487+
return model
488+
489+
490+
@register_model
491+
def vitamin_large2_256(pretrained=False, **kwargs) -> VisionTransformer:
492+
embed_cfg = VitCfg(
493+
embed_dim=(160, 320, 1024),
494+
depths=(2, 4, 1),
495+
stem_width=160,
496+
conv_cfg=VitConvCfg(
497+
norm_layer='layernorm2d',
498+
norm_eps=1e-6,
499+
),
500+
head_type='1d',
501+
)
502+
model_args = dict(
503+
img_size=256, embed_dim=1024, depth=31, num_heads=16, mlp_layer=GeGluMlp, mlp_ratio=2.,
504+
class_token=False, global_pool='avg', embed_cfg=embed_cfg)
505+
model = _create_vitamin('vitamin_large2_256', pretrained=pretrained, **dict(model_args, **kwargs))
506+
return model
507+
508+
509+
@register_model
510+
def vitamin_large2_336(pretrained=False, **kwargs) -> VisionTransformer:
511+
embed_cfg = VitCfg(
512+
embed_dim=(160, 320, 1024),
513+
depths=(2, 4, 1),
514+
stem_width=160,
515+
conv_cfg=VitConvCfg(
516+
norm_layer='layernorm2d',
517+
norm_eps=1e-6,
518+
),
519+
head_type='1d',
520+
)
521+
model_args = dict(
522+
img_size=336, embed_dim=1024, depth=31, num_heads=16, mlp_layer=GeGluMlp, mlp_ratio=2.,
523+
class_token=False, global_pool='avg', embed_cfg=embed_cfg
524+
)
525+
model = _create_vitamin('vitamin_large2_336', pretrained=pretrained, **dict(model_args, **kwargs))
526+
return model
527+
528+
529+
@register_model
530+
def vitamin_large2_384(pretrained=False, **kwargs) -> VisionTransformer:
531+
embed_cfg = VitCfg(
532+
embed_dim=(160, 320, 1024),
533+
depths=(2, 4, 1),
534+
stem_width=160,
535+
conv_cfg=VitConvCfg(
536+
norm_layer='layernorm2d',
537+
norm_eps=1e-6,
538+
),
539+
head_type='1d',
540+
)
541+
model_args = dict(
542+
img_size=384, embed_dim=1024, depth=31, num_heads=16, mlp_layer=GeGluMlp, mlp_ratio=2.,
543+
class_token=False, global_pool='avg', embed_cfg=embed_cfg)
544+
model = _create_vitamin('vitamin_large2_384', pretrained=pretrained, **dict(model_args, **kwargs))
545+
return model
546+
547+
468548
@register_model
469549
def vitamin_xlarge_256(pretrained=False, **kwargs) -> VisionTransformer:
470550
embed_cfg=VitCfg(
471551
embed_dim=(192, 384, 1152),
472552
depths=(2, 4, 1),
473553
stem_width=192,
474-
conv_cfg = VitConvCfg(
554+
conv_cfg=VitConvCfg(
475555
norm_layer='layernorm2d',
476556
norm_eps=1e-6,
477557
),
@@ -491,7 +571,7 @@ def vitamin_xlarge_336(pretrained=False, **kwargs) -> VisionTransformer:
491571
embed_dim=(192, 384, 1152),
492572
depths=(2, 4, 1),
493573
stem_width=192,
494-
conv_cfg = VitConvCfg(
574+
conv_cfg=VitConvCfg(
495575
norm_layer='layernorm2d',
496576
norm_eps=1e-6,
497577
),
@@ -500,7 +580,7 @@ def vitamin_xlarge_336(pretrained=False, **kwargs) -> VisionTransformer:
500580
model_args = dict(
501581
img_size=336, embed_dim=1152, depth=32, num_heads=16, mlp_layer=GeGluMlp, mlp_ratio=2.,
502582
class_token=False, global_pool='avg', pos_embed='none', embed_cfg=embed_cfg)
503-
model = _create_vitamin('vitamin_xlarge_336', pretrained=pretrained, **dict(model_args, **kwargs))
583+
model = _create_vitamin('vitamin_xlarge_336', pretrained=pretrained, **dict(model_args, **kwargs))
504584
return model
505585

506586

@@ -510,7 +590,7 @@ def vitamin_xlarge_384(pretrained=False, **kwargs) -> VisionTransformer:
510590
embed_dim=(192, 384, 1152),
511591
depths=(2, 4, 1),
512592
stem_width=192,
513-
conv_cfg = VitConvCfg(
593+
conv_cfg=VitConvCfg(
514594
norm_layer='layernorm2d',
515595
norm_eps=1e-6,
516596
),

0 commit comments

Comments
 (0)