@@ -308,33 +308,35 @@ def _cfg(url='', **kwargs):
308
308
309
309
310
310
default_cfgs = generate_default_cfgs ({
311
- 'vitamin_small .datacomp1b_clip_ltt' : _cfg (
311
+ 'vitamin_small_224 .datacomp1b_clip_ltt' : _cfg (
312
312
hf_hub_id = 'jienengchen/ViTamin-S-LTT' , num_classes = 384 ),
313
- 'vitamin_small .datacomp1b_clip' : _cfg (
313
+ 'vitamin_small_224 .datacomp1b_clip' : _cfg (
314
314
hf_hub_id = 'jienengchen/ViTamin-S' , num_classes = 384 ),
315
- 'vitamin_base .datacomp1b_clip_ltt' : _cfg (
315
+ 'vitamin_base_224 .datacomp1b_clip_ltt' : _cfg (
316
316
hf_hub_id = 'jienengchen/ViTamin-B-LTT' , num_classes = 768 ),
317
- 'vitamin_base .datacomp1b_clip' : _cfg (
317
+ 'vitamin_base_224 .datacomp1b_clip' : _cfg (
318
318
hf_hub_id = 'jienengchen/ViTamin-B' , num_classes = 768 ),
319
- 'vitamin_large.datacomp1b_clip' : _cfg (
320
- hf_hub_id = 'jienengchen/ViTamin-L-224px' , num_classes = 1024 ),
321
- 'vitamin_large_256.datacomp1b_clip_l2' : _cfg (
322
- hf_hub_id = 'jienengchen/ViTamin-L2-256px' , num_classes = 1024 ,
323
- input_size = (3 , 256 , 256 ), crop_pct = 1.0 ),
319
+ 'vitamin_large_224.datacomp1b_clip' : _cfg (
320
+ hf_hub_id = 'jienengchen/ViTamin-L-224px' , num_classes = 768 ),
324
321
'vitamin_large_256.datacomp1b_clip' : _cfg (
325
- hf_hub_id = 'jienengchen/ViTamin-L-256px' , num_classes = 1024 ,
322
+ hf_hub_id = 'jienengchen/ViTamin-L-256px' , num_classes = 768 ,
326
323
input_size = (3 , 256 , 256 ), crop_pct = 1.0 ),
327
- 'vitamin_large_336.datacomp1b_clip_l2' : _cfg (
328
- hf_hub_id = 'jienengchen/ViTamin-L2-336px' , num_classes = 1024 ,
329
- input_size = (3 , 336 , 336 ), crop_pct = 1.0 ),
330
324
'vitamin_large_336.datacomp1b_clip' : _cfg (
331
- hf_hub_id = 'jienengchen/ViTamin-L-336px' , num_classes = 1024 ,
325
+ hf_hub_id = 'jienengchen/ViTamin-L-336px' , num_classes = 768 ,
332
326
input_size = (3 , 336 , 336 ), crop_pct = 1.0 ),
333
- 'vitamin_large_384.datacomp1b_clip_l2' : _cfg (
334
- hf_hub_id = 'jienengchen/ViTamin-L2-384px' , num_classes = 1024 ,
335
- input_size = (3 , 384 , 384 ), crop_pct = 1.0 ),
336
327
'vitamin_large_384.datacomp1b_clip' : _cfg (
337
- hf_hub_id = 'jienengchen/ViTamin-L-384px' , num_classes = 1024 ,
328
+ hf_hub_id = 'jienengchen/ViTamin-L-384px' , num_classes = 768 ,
329
+ input_size = (3 , 384 , 384 ), crop_pct = 1.0 ),
330
+ 'vitamin_large2_224.datacomp1b_clip' : _cfg (
331
+ hf_hub_id = 'jienengchen/ViTamin-L2-224px' , num_classes = 1024 ),
332
+ 'vitamin_large2_256.datacomp1b_clip' : _cfg (
333
+ hf_hub_id = 'jienengchen/ViTamin-L2-256px' , num_classes = 1024 ,
334
+ input_size = (3 , 256 , 256 ), crop_pct = 1.0 ),
335
+ 'vitamin_large2_336.datacomp1b_clip' : _cfg (
336
+ hf_hub_id = 'jienengchen/ViTamin-L2-336px' , num_classes = 1024 ,
337
+ input_size = (3 , 336 , 336 ), crop_pct = 1.0 ),
338
+ 'vitamin_large2_384.datacomp1b_clip' : _cfg (
339
+ hf_hub_id = 'jienengchen/ViTamin-L2-384px' , num_classes = 1024 ,
338
340
input_size = (3 , 384 , 384 ), crop_pct = 1.0 ),
339
341
'vitamin_xlarge_256.datacomp1b_clip' : _cfg (
340
342
hf_hub_id = 'jienengchen/ViTamin-XL-256px' , num_classes = 1152 ,
@@ -349,12 +351,12 @@ def _cfg(url='', **kwargs):
349
351
350
352
351
353
@register_model
352
- def vitamin_small (pretrained = False , ** kwargs ) -> VisionTransformer :
354
+ def vitamin_small_224 (pretrained = False , ** kwargs ) -> VisionTransformer :
353
355
embed_cfg = VitCfg (
354
356
embed_dim = (64 , 128 , 384 ),
355
357
depths = (2 , 4 , 1 ),
356
358
stem_width = 64 ,
357
- conv_cfg = VitConvCfg (
359
+ conv_cfg = VitConvCfg (
358
360
norm_layer = 'layernorm2d' ,
359
361
norm_eps = 1e-6 ,
360
362
),
@@ -364,17 +366,17 @@ def vitamin_small(pretrained=False, **kwargs) -> VisionTransformer:
364
366
embed_dim = 384 , depth = 14 , num_heads = 6 , mlp_layer = GeGluMlp , mlp_ratio = 2. ,
365
367
class_token = False , global_pool = 'avg' , embed_cfg = embed_cfg
366
368
)
367
- model = _create_vitamin ('vitamin_small ' , pretrained = pretrained , ** dict (model_args , ** kwargs ))
369
+ model = _create_vitamin ('vitamin_small_224 ' , pretrained = pretrained , ** dict (model_args , ** kwargs ))
368
370
return model
369
371
370
372
371
373
@register_model
372
- def vitamin_base (pretrained = False , ** kwargs ) -> VisionTransformer :
374
+ def vitamin_base_224 (pretrained = False , ** kwargs ) -> VisionTransformer :
373
375
embed_cfg = VitCfg (
374
376
embed_dim = (128 , 256 , 768 ),
375
377
depths = (2 , 4 , 1 ),
376
378
stem_width = 128 ,
377
- conv_cfg = VitConvCfg (
379
+ conv_cfg = VitConvCfg (
378
380
norm_layer = 'layernorm2d' ,
379
381
norm_eps = 1e-6 ,
380
382
),
@@ -383,17 +385,17 @@ def vitamin_base(pretrained=False, **kwargs) -> VisionTransformer:
383
385
model_args = dict (
384
386
embed_dim = 768 , depth = 14 , num_heads = 12 , mlp_layer = GeGluMlp , mlp_ratio = 2. ,
385
387
class_token = False , global_pool = 'avg' , embed_cfg = embed_cfg )
386
- model = _create_vitamin ('vitamin_base ' , pretrained = pretrained , ** dict (model_args , ** kwargs ))
388
+ model = _create_vitamin ('vitamin_base_224 ' , pretrained = pretrained , ** dict (model_args , ** kwargs ))
387
389
return model
388
390
389
391
390
392
@register_model
391
- def vitamin_large (pretrained = False , ** kwargs ) -> VisionTransformer :
393
+ def vitamin_large_224 (pretrained = False , ** kwargs ) -> VisionTransformer :
392
394
embed_cfg = VitCfg (
393
395
embed_dim = (160 , 320 , 1024 ),
394
396
depths = (2 , 4 , 1 ),
395
397
stem_width = 160 ,
396
- conv_cfg = VitConvCfg (
398
+ conv_cfg = VitConvCfg (
397
399
norm_layer = 'layernorm2d' ,
398
400
norm_eps = 1e-6 ,
399
401
),
@@ -403,7 +405,7 @@ def vitamin_large(pretrained=False, **kwargs) -> VisionTransformer:
403
405
embed_dim = 1024 , depth = 31 , num_heads = 16 , mlp_layer = GeGluMlp , mlp_ratio = 2. ,
404
406
class_token = False , global_pool = 'avg' , embed_cfg = embed_cfg ,
405
407
)
406
- model = _create_vitamin ('vitamin_large ' , pretrained = pretrained , ** dict (model_args , ** kwargs ))
408
+ model = _create_vitamin ('vitamin_large_224 ' , pretrained = pretrained , ** dict (model_args , ** kwargs ))
407
409
return model
408
410
409
411
@@ -413,7 +415,7 @@ def vitamin_large_256(pretrained=False, **kwargs) -> VisionTransformer:
413
415
embed_dim = (160 , 320 , 1024 ),
414
416
depths = (2 , 4 , 1 ),
415
417
stem_width = 160 ,
416
- conv_cfg = VitConvCfg (
418
+ conv_cfg = VitConvCfg (
417
419
norm_layer = 'layernorm2d' ,
418
420
norm_eps = 1e-6 ,
419
421
),
@@ -432,7 +434,7 @@ def vitamin_large_336(pretrained=False, **kwargs) -> VisionTransformer:
432
434
embed_dim = (160 , 320 , 1024 ),
433
435
depths = (2 , 4 , 1 ),
434
436
stem_width = 160 ,
435
- conv_cfg = VitConvCfg (
437
+ conv_cfg = VitConvCfg (
436
438
norm_layer = 'layernorm2d' ,
437
439
norm_eps = 1e-6 ,
438
440
),
@@ -452,7 +454,7 @@ def vitamin_large_384(pretrained=False, **kwargs) -> VisionTransformer:
452
454
embed_dim = (160 , 320 , 1024 ),
453
455
depths = (2 , 4 , 1 ),
454
456
stem_width = 160 ,
455
- conv_cfg = VitConvCfg (
457
+ conv_cfg = VitConvCfg (
456
458
norm_layer = 'layernorm2d' ,
457
459
norm_eps = 1e-6 ,
458
460
),
@@ -465,13 +467,91 @@ def vitamin_large_384(pretrained=False, **kwargs) -> VisionTransformer:
465
467
return model
466
468
467
469
470
@register_model
def vitamin_large2_224(pretrained=False, **kwargs) -> VisionTransformer:
    """Registered factory for the `vitamin_large2_224` variant.

    Assembles the embedding configuration (three-entry `embed_dim`/`depths`,
    'layernorm2d' conv norm, '1d' head type) and the transformer arguments,
    then delegates construction to `_create_vitamin`.

    Args:
        pretrained: Forwarded unchanged to `_create_vitamin`.
        **kwargs: Extra model arguments; any key given here overrides the
            default of the same name defined in this function.

    Returns:
        The model object returned by `_create_vitamin`.
    """
    conv_norm_cfg = VitConvCfg(norm_layer='layernorm2d', norm_eps=1e-6)
    stem_cfg = VitCfg(
        embed_dim=(160, 320, 1024),
        depths=(2, 4, 1),
        stem_width=160,
        conv_cfg=conv_norm_cfg,
        head_type='1d',
    )
    defaults = {
        'embed_dim': 1024,
        'depth': 31,
        'num_heads': 16,
        'mlp_layer': GeGluMlp,
        'mlp_ratio': 2.,
        'class_token': False,
        'global_pool': 'avg',
        'embed_cfg': stem_cfg,
    }
    # Caller-supplied kwargs win over the defaults above.
    merged_args = dict(defaults, **kwargs)
    return _create_vitamin('vitamin_large2_224', pretrained=pretrained, **merged_args)
488
+
489
+
490
@register_model
def vitamin_large2_256(pretrained=False, **kwargs) -> VisionTransformer:
    """Registered factory for the `vitamin_large2_256` variant.

    Assembles the embedding configuration (three-entry `embed_dim`/`depths`,
    'layernorm2d' conv norm, '1d' head type) and the transformer arguments
    with `img_size=256`, then delegates construction to `_create_vitamin`.

    Args:
        pretrained: Forwarded unchanged to `_create_vitamin`.
        **kwargs: Extra model arguments; any key given here overrides the
            default of the same name defined in this function.

    Returns:
        The model object returned by `_create_vitamin`.
    """
    conv_norm_cfg = VitConvCfg(norm_layer='layernorm2d', norm_eps=1e-6)
    stem_cfg = VitCfg(
        embed_dim=(160, 320, 1024),
        depths=(2, 4, 1),
        stem_width=160,
        conv_cfg=conv_norm_cfg,
        head_type='1d',
    )
    defaults = {
        'img_size': 256,
        'embed_dim': 1024,
        'depth': 31,
        'num_heads': 16,
        'mlp_layer': GeGluMlp,
        'mlp_ratio': 2.,
        'class_token': False,
        'global_pool': 'avg',
        'embed_cfg': stem_cfg,
    }
    # Caller-supplied kwargs win over the defaults above.
    merged_args = dict(defaults, **kwargs)
    return _create_vitamin('vitamin_large2_256', pretrained=pretrained, **merged_args)
507
+
508
+
509
@register_model
def vitamin_large2_336(pretrained=False, **kwargs) -> VisionTransformer:
    """Registered factory for the `vitamin_large2_336` variant.

    Assembles the embedding configuration (three-entry `embed_dim`/`depths`,
    'layernorm2d' conv norm, '1d' head type) and the transformer arguments
    with `img_size=336`, then delegates construction to `_create_vitamin`.

    Args:
        pretrained: Forwarded unchanged to `_create_vitamin`.
        **kwargs: Extra model arguments; any key given here overrides the
            default of the same name defined in this function.

    Returns:
        The model object returned by `_create_vitamin`.
    """
    conv_norm_cfg = VitConvCfg(norm_layer='layernorm2d', norm_eps=1e-6)
    stem_cfg = VitCfg(
        embed_dim=(160, 320, 1024),
        depths=(2, 4, 1),
        stem_width=160,
        conv_cfg=conv_norm_cfg,
        head_type='1d',
    )
    defaults = {
        'img_size': 336,
        'embed_dim': 1024,
        'depth': 31,
        'num_heads': 16,
        'mlp_layer': GeGluMlp,
        'mlp_ratio': 2.,
        'class_token': False,
        'global_pool': 'avg',
        'embed_cfg': stem_cfg,
    }
    # Caller-supplied kwargs win over the defaults above.
    merged_args = dict(defaults, **kwargs)
    return _create_vitamin('vitamin_large2_336', pretrained=pretrained, **merged_args)
527
+
528
+
529
@register_model
def vitamin_large2_384(pretrained=False, **kwargs) -> VisionTransformer:
    """Registered factory for the `vitamin_large2_384` variant.

    Assembles the embedding configuration (three-entry `embed_dim`/`depths`,
    'layernorm2d' conv norm, '1d' head type) and the transformer arguments
    with `img_size=384`, then delegates construction to `_create_vitamin`.

    Args:
        pretrained: Forwarded unchanged to `_create_vitamin`.
        **kwargs: Extra model arguments; any key given here overrides the
            default of the same name defined in this function.

    Returns:
        The model object returned by `_create_vitamin`.
    """
    conv_norm_cfg = VitConvCfg(norm_layer='layernorm2d', norm_eps=1e-6)
    stem_cfg = VitCfg(
        embed_dim=(160, 320, 1024),
        depths=(2, 4, 1),
        stem_width=160,
        conv_cfg=conv_norm_cfg,
        head_type='1d',
    )
    defaults = {
        'img_size': 384,
        'embed_dim': 1024,
        'depth': 31,
        'num_heads': 16,
        'mlp_layer': GeGluMlp,
        'mlp_ratio': 2.,
        'class_token': False,
        'global_pool': 'avg',
        'embed_cfg': stem_cfg,
    }
    # Caller-supplied kwargs win over the defaults above.
    merged_args = dict(defaults, **kwargs)
    return _create_vitamin('vitamin_large2_384', pretrained=pretrained, **merged_args)
546
+
547
+
468
548
@register_model
469
549
def vitamin_xlarge_256 (pretrained = False , ** kwargs ) -> VisionTransformer :
470
550
embed_cfg = VitCfg (
471
551
embed_dim = (192 , 384 , 1152 ),
472
552
depths = (2 , 4 , 1 ),
473
553
stem_width = 192 ,
474
- conv_cfg = VitConvCfg (
554
+ conv_cfg = VitConvCfg (
475
555
norm_layer = 'layernorm2d' ,
476
556
norm_eps = 1e-6 ,
477
557
),
@@ -491,7 +571,7 @@ def vitamin_xlarge_336(pretrained=False, **kwargs) -> VisionTransformer:
491
571
embed_dim = (192 , 384 , 1152 ),
492
572
depths = (2 , 4 , 1 ),
493
573
stem_width = 192 ,
494
- conv_cfg = VitConvCfg (
574
+ conv_cfg = VitConvCfg (
495
575
norm_layer = 'layernorm2d' ,
496
576
norm_eps = 1e-6 ,
497
577
),
@@ -500,7 +580,7 @@ def vitamin_xlarge_336(pretrained=False, **kwargs) -> VisionTransformer:
500
580
model_args = dict (
501
581
img_size = 336 , embed_dim = 1152 , depth = 32 , num_heads = 16 , mlp_layer = GeGluMlp , mlp_ratio = 2. ,
502
582
class_token = False , global_pool = 'avg' , pos_embed = 'none' , embed_cfg = embed_cfg )
503
- model = _create_vitamin ('vitamin_xlarge_336 ' , pretrained = pretrained , ** dict (model_args , ** kwargs ))
583
+ model = _create_vitamin ('vitamin_xlarge_256 ' , pretrained = pretrained , ** dict (model_args , ** kwargs ))
504
584
return model
505
585
506
586
@@ -510,7 +590,7 @@ def vitamin_xlarge_384(pretrained=False, **kwargs) -> VisionTransformer:
510
590
embed_dim = (192 , 384 , 1152 ),
511
591
depths = (2 , 4 , 1 ),
512
592
stem_width = 192 ,
513
- conv_cfg = VitConvCfg (
593
+ conv_cfg = VitConvCfg (
514
594
norm_layer = 'layernorm2d' ,
515
595
norm_eps = 1e-6 ,
516
596
),
0 commit comments