
Commit 7449985

Merge pull request #190 from theabhirath/refine
Expose a uniform API at the highest level for models
2 parents 7e4f9db + 59e1ef4 · commit 7449985

39 files changed: +678 additions, -723 deletions

.github/workflows/CI.yml

Lines changed: 1 addition & 1 deletion
@@ -34,8 +34,8 @@ jobs:
           - '"Inception"'
           - '"DenseNet"'
           - '["ConvNeXt", "ConvMixer"]'
-          - 'r"ViTs"'
           - 'r"Mixers"'
+          - 'r"ViTs"'
     steps:
       - uses: actions/checkout@v2
       - uses: julia-actions/setup-julia@v1

Project.toml

Lines changed: 7 additions & 3 deletions
@@ -20,9 +20,13 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 [compat]
 BSON = "0.3.2"
 Flux = "0.13"
-Functors = "0.2"
-MLUtils = "0.2.6"
-NNlib = "0.7.34, 0.8"
+Functors = "0.2, 0.3"
+CUDA = "3"
+ChainRulesCore = "1"
+PartialFunctions = "1"
+MLUtils = "0.2.10"
+NNlib = "0.8"
+NNlibCUDA = "0.2"
 julia = "1.6"

 [publish]
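
For orientation, Pkg treats each `[compat]` entry as a caret bound that caps at the next breaking release; a sketch of how the new bounds read (illustrative commentary, not part of the diff):

    # Flux = "0.13"          allows [0.13.0, 0.14.0)
    # Functors = "0.2, 0.3"  allows the union [0.2.0, 0.4.0)
    # NNlib = "0.8"          allows [0.8.0, 0.9.0), dropping the old 0.7.34 floor
    # julia = "1.6"          allows [1.6.0, 2.0.0)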

src/Metalhead.jl

Lines changed: 3 additions & 5 deletions
@@ -56,14 +56,12 @@ include("vit-based/vit.jl")
 include("pretrain.jl")

 export AlexNet, VGG, VGG11, VGG13, VGG16, VGG19,
-       ResNet, ResNet18, ResNet34, ResNet50, ResNet101, ResNet152, ResNeXt,
+       ResNet, ResNet18, ResNet34, ResNet50, ResNet101, ResNet152,
+       WideResNet, ResNeXt, SEResNet, SEResNeXt,
        DenseNet, DenseNet121, DenseNet161, DenseNet169, DenseNet201,
        GoogLeNet, Inception3, Inceptionv3, Inceptionv4, InceptionResNetv2, Xception,
        SqueezeNet, MobileNetv1, MobileNetv2, MobileNetv3, EfficientNet,
-       WideResNet, SEResNet, SEResNeXt,
-       MLPMixer, ResMLP, gMLP,
-       ViT,
-       ConvMixer, ConvNeXt
+       MLPMixer, ResMLP, gMLP, ViT, ConvMixer, ConvNeXt

 # use Flux._big_show to pretty print large models
 for T in (:AlexNet, :VGG, :ResNet, :ResNeXt, :DenseNet, :SEResNet, :SEResNeXt,
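
With the export list regrouped, every model family is reachable from the top level through the same constructor shape. A minimal sketch of the uniform API this PR exposes, using only signatures that appear in the file diffs below (assumes Metalhead is installed):

    using Metalhead

    model = AlexNet(; inchannels = 3, nclasses = 1000)   # keyword-only configuration
    mixer = ConvMixer(:base; nclasses = 1000)            # config symbol plus keywords
    convn = ConvNeXt(:tiny; nclasses = 1000)             # same shape of API across models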

src/convnets/alexnet.jl

Lines changed: 31 additions & 26 deletions
@@ -1,54 +1,59 @@
 """
-    alexnet(; nclasses = 1000)
+    alexnet(; inchannels::Integer = 3, nclasses::Integer = 1000)

 Create an AlexNet model
 ([reference](https://papers.nips.cc/paper/2012/file/c399862d3b9d6b76c8436e924a68c45b-Paper.pdf)).

 # Arguments

+  - `inchannels`: The number of input channels.
   - `nclasses`: the number of output classes
 """
-function alexnet(; nclasses = 1000)
-    layers = Chain(Chain(Conv((11, 11), 3 => 64, relu; stride = (4, 4), pad = (2, 2)),
-                         MaxPool((3, 3); stride = (2, 2)),
-                         Conv((5, 5), 64 => 192, relu; pad = (2, 2)),
-                         MaxPool((3, 3); stride = (2, 2)),
-                         Conv((3, 3), 192 => 384, relu; pad = (1, 1)),
-                         Conv((3, 3), 384 => 256, relu; pad = (1, 1)),
-                         Conv((3, 3), 256 => 256, relu; pad = (1, 1)),
-                         MaxPool((3, 3); stride = (2, 2)),
-                         AdaptiveMeanPool((6, 6))),
-                   Chain(MLUtils.flatten,
-                         Dropout(0.5),
-                         Dense(256 * 6 * 6, 4096, relu),
-                         Dropout(0.5),
-                         Dense(4096, 4096, relu),
-                         Dense(4096, nclasses)))
-    return layers
+function alexnet(; inchannels::Integer = 3, nclasses::Integer = 1000)
+    backbone = Chain(Conv((11, 11), inchannels => 64, relu; stride = 4, pad = 2),
+                     MaxPool((3, 3); stride = 2),
+                     Conv((5, 5), 64 => 192, relu; pad = 2),
+                     MaxPool((3, 3); stride = 2),
+                     Conv((3, 3), 192 => 384, relu; pad = 1),
+                     Conv((3, 3), 384 => 256, relu; pad = 1),
+                     Conv((3, 3), 256 => 256, relu; pad = 1),
+                     MaxPool((3, 3); stride = 2))
+    classifier = Chain(AdaptiveMeanPool((6, 6)), MLUtils.flatten,
+                       Dropout(0.5),
+                       Dense(256 * 6 * 6, 4096, relu),
+                       Dropout(0.5),
+                       Dense(4096, 4096, relu),
+                       Dense(4096, nclasses))
+    return Chain(backbone, classifier)
 end

 """
-    AlexNet(; pretrain = false, nclasses = 1000)
+    AlexNet(; pretrain::Bool = false, inchannels::Integer = 3,
+            nclasses::Integer = 1000)

 Create a `AlexNet`.
-See also [`alexnet`](#).
-
-!!! warning
-
-    `AlexNet` does not currently support pretrained weights.
+([reference](https://papers.nips.cc/paper/2012/file/c399862d3b9d6b76c8436e924a68c45b-Paper.pdf)).

 # Arguments

   - `pretrain`: set to `true` to load pre-trained weights for ImageNet
+  - `inchannels`: The number of input channels.
   - `nclasses`: the number of output classes
+
+!!! warning
+
+    `AlexNet` does not currently support pretrained weights.
+
+See also [`alexnet`](#).
 """
 struct AlexNet
     layers::Any
 end
 @functor AlexNet

-function AlexNet(; pretrain = false, nclasses = 1000)
-    layers = alexnet(; nclasses = nclasses)
+function AlexNet(; pretrain::Bool = false, inchannels::Integer = 3,
+                 nclasses::Integer = 1000)
+    layers = alexnet(; inchannels, nclasses)
     if pretrain
         loadpretrain!(layers, "AlexNet")
     end
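
The refactor splits the model into an explicit backbone and classifier stage. A hedged usage sketch (the 224×224 input size and the `model.layers` field access are assumptions based on the struct shown above, not part of the commit):

    using Metalhead

    model = AlexNet(; inchannels = 3, nclasses = 1000)
    x = rand(Float32, 224, 224, 3, 1)     # WHCN batch with one 224×224 RGB image
    backbone = model.layers[1]            # convolutional stage
    classifier = model.layers[2]          # pooling + dense stage
    size(classifier(backbone(x)))         # == (1000, 1)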

src/convnets/convmixer.jl

Lines changed: 23 additions & 27 deletions
@@ -1,6 +1,7 @@
 """
-    convmixer(planes, depth; inchannels = 3, kernel_size = (9, 9), patch_size::Dims{2} = 7,
-              activation = gelu, nclasses = 1000)
+    convmixer(planes::Integer, depth::Integer; kernel_size = (9, 9),
+              patch_size::Dims{2} = (7, 7), activation = gelu,
+              inchannels::Integer = 3, nclasses::Integer = 1000)

 Creates a ConvMixer model.
 ([reference](https://arxiv.org/abs/2201.09792))
@@ -9,61 +10,56 @@ Creates a ConvMixer model.

   - `planes`: number of planes in the output of each block
   - `depth`: number of layers
-  - `inchannels`: The number of channels in the input.
   - `kernel_size`: kernel size of the convolutional layers
   - `patch_size`: size of the patches
   - `activation`: activation function used after the convolutional layers
+  - `inchannels`: The number of channels in the input.
   - `nclasses`: number of classes in the output
 """
-function convmixer(planes, depth; inchannels = 3, kernel_size = (9, 9),
-                   patch_size::Dims{2} = (7, 7), activation = gelu, nclasses = 1000)
+function convmixer(planes::Integer, depth::Integer; kernel_size = (9, 9),
+                   patch_size::Dims{2} = (7, 7), activation = gelu,
+                   inchannels::Integer = 3, nclasses::Integer = 1000)
     stem = conv_norm(patch_size, inchannels, planes, activation; preact = true,
                      stride = patch_size[1])
     blocks = [Chain(SkipConnection(Chain(conv_norm(kernel_size, planes, planes, activation;
                                                    preact = true, groups = planes,
                                                    pad = SamePad())), +),
                     conv_norm((1, 1), planes, planes, activation; preact = true)...)
               for _ in 1:depth]
-    head = Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, Dense(planes, nclasses))
-    return Chain(Chain(stem..., Chain(blocks)), head)
+    return Chain(Chain(stem..., Chain(blocks...)), create_classifier(planes, nclasses))
 end

-const CONVMIXER_CONFIGS = Dict(:base => Dict(:planes => 1536, :depth => 20,
-                                             :kernel_size => (9, 9),
-                                             :patch_size => (7, 7)),
-                               :small => Dict(:planes => 768, :depth => 32,
-                                              :kernel_size => (7, 7),
-                                              :patch_size => (7, 7)),
-                               :large => Dict(:planes => 1024, :depth => 20,
-                                              :kernel_size => (9, 9),
-                                              :patch_size => (7, 7)))
+const CONVMIXER_CONFIGS = Dict(:base => ((1536, 20),
+                                         (kernel_size = (9, 9),
+                                          patch_size = (7, 7))),
+                               :small => ((768, 32),
+                                          (kernel_size = (7, 7),
+                                           patch_size = (7, 7))),
+                               :large => ((1024, 20),
+                                          (kernel_size = (9, 9),
+                                           patch_size = (7, 7))))

 """
-    ConvMixer(mode::Symbol = :base; inchannels = 3, activation = gelu, nclasses = 1000)
+    ConvMixer(config::Symbol; inchannels::Integer = 3, nclasses::Integer = 1000)

 Creates a ConvMixer model.
 ([reference](https://arxiv.org/abs/2201.09792))

 # Arguments

-  - `mode`: the mode of the model, either `:base`, `:small` or `:large`
+  - `config`: the size of the model, either `:base`, `:small` or `:large`
   - `inchannels`: The number of channels in the input.
-  - `activation`: activation function used after the convolutional layers
   - `nclasses`: number of classes in the output
 """
 struct ConvMixer
     layers::Any
 end
 @functor ConvMixer

-function ConvMixer(mode::Symbol = :base; inchannels = 3, activation = gelu, nclasses = 1000)
-    _checkconfig(mode, keys(CONVMIXER_CONFIGS))
-    planes = CONVMIXER_CONFIGS[mode][:planes]
-    depth = CONVMIXER_CONFIGS[mode][:depth]
-    kernel_size = CONVMIXER_CONFIGS[mode][:kernel_size]
-    patch_size = CONVMIXER_CONFIGS[mode][:patch_size]
-    layers = convmixer(planes, depth; inchannels, kernel_size, patch_size, activation,
-                       nclasses)
+function ConvMixer(config::Symbol; inchannels::Integer = 3, nclasses::Integer = 1000)
+    _checkconfig(config, keys(CONVMIXER_CONFIGS))
+    layers = convmixer(CONVMIXER_CONFIGS[config][1]...; CONVMIXER_CONFIGS[config][2]...,
+                       inchannels, nclasses)
    return ConvMixer(layers)
 end
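
The `Dict`-of-`Dict`s config table becomes an `(args, kwargs)` pair that splats directly into `convmixer`. A sketch of how the new lookup unpacks (these are internal names, so they are qualified with `Metalhead.`; illustrative only):

    using Metalhead

    args, kwargs = Metalhead.CONVMIXER_CONFIGS[:small]
    # args   == (768, 32)                                  (planes, depth)
    # kwargs == (kernel_size = (7, 7), patch_size = (7, 7))
    layers = Metalhead.convmixer(args...; kwargs..., inchannels = 3, nclasses = 1000)
    model = ConvMixer(:small)             # the equivalent high-level call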

src/convnets/convnext.jl

Lines changed: 34 additions & 33 deletions
@@ -1,5 +1,5 @@
 """
-    convnextblock(planes, drop_path_rate = 0., λ = 1f-6)
+    convnextblock(planes::Integer, drop_path_rate = 0.0, layerscale_init = 1.0f-6)

 Creates a single block of ConvNeXt.
 ([reference](https://arxiv.org/abs/2201.03545))
@@ -8,61 +8,64 @@ Creates a single block of ConvNeXt.

   - `planes`: number of input channels.
   - `drop_path_rate`: Stochastic depth rate.
-  - `λ`: Initial value for [`LayerScale`](#)
+  - `layerscale_init`: Initial value for [`LayerScale`](#)
 """
-function convnextblock(planes, drop_path_rate = 0.0, λ = 1.0f-6)
+function convnextblock(planes::Integer, drop_path_rate = 0.0, layerscale_init = 1.0f-6)
     layers = SkipConnection(Chain(DepthwiseConv((7, 7), planes => planes; pad = 3),
                                   swapdims((3, 1, 2, 4)),
                                   LayerNorm(planes; ϵ = 1.0f-6),
                                   mlp_block(planes, 4 * planes),
-                                  LayerScale(planes, λ),
+                                  LayerScale(planes, layerscale_init),
                                   swapdims((2, 3, 1, 4)),
                                   DropPath(drop_path_rate)), +)
     return layers
 end

 """
-    convnext(depths, planes; inchannels = 3, drop_path_rate = 0., λ = 1f-6, nclasses = 1000)
+    convnext(depths::AbstractVector{<:Integer}, planes::AbstractVector{<:Integer};
+             drop_path_rate = 0.0, layerscale_init = 1.0f-6, inchannels::Integer = 3,
+             nclasses::Integer = 1000)

 Creates the layers for a ConvNeXt model.
 ([reference](https://arxiv.org/abs/2201.03545))

 # Arguments

-  - `inchannels`: number of input channels.
   - `depths`: list with configuration for depth of each block
   - `planes`: list with configuration for number of output channels in each block
   - `drop_path_rate`: Stochastic depth rate.
-  - `λ`: Initial value for [`LayerScale`](#)
+  - `layerscale_init`: Initial value for [`LayerScale`](#)
     ([reference](https://arxiv.org/abs/2103.17239))
+  - `inchannels`: number of input channels.
   - `nclasses`: number of output classes
 """
-function convnext(depths, planes; inchannels = 3, drop_path_rate = 0.0, λ = 1.0f-6,
-                  nclasses = 1000)
+function convnext(depths::AbstractVector{<:Integer}, planes::AbstractVector{<:Integer};
+                  drop_path_rate = 0.0, layerscale_init = 1.0f-6, inchannels::Integer = 3,
+                  nclasses::Integer = 1000)
     @assert length(depths) == length(planes)
     "`planes` should have exactly one value for each block"
     downsample_layers = []
-    stem = Chain(Conv((4, 4), inchannels => planes[1]; stride = 4),
-                 ChannelLayerNorm(planes[1]))
-    push!(downsample_layers, stem)
+    push!(downsample_layers,
+          Chain(conv_norm((4, 4), inchannels => planes[1]; stride = 4,
+                          norm_layer = ChannelLayerNorm)...))
     for m in 1:(length(depths) - 1)
-        downsample_layer = Chain(ChannelLayerNorm(planes[m]),
-                                 Conv((2, 2), planes[m] => planes[m + 1]; stride = 2))
-        push!(downsample_layers, downsample_layer)
+        push!(downsample_layers,
+              Chain(conv_norm((2, 2), planes[m] => planes[m + 1]; stride = 2,
+                              norm_layer = ChannelLayerNorm, revnorm = true)...))
     end
     stages = []
     dp_rates = linear_scheduler(drop_path_rate; depth = sum(depths))
     cur = 0
     for i in eachindex(depths)
-        push!(stages, [convnextblock(planes[i], dp_rates[cur + j], λ) for j in 1:depths[i]])
+        push!(stages,
+              [convnextblock(planes[i], dp_rates[cur + j], layerscale_init)
+               for j in 1:depths[i]])
         cur += depths[i]
     end
     backbone = collect(Iterators.flatten(Iterators.flatten(zip(downsample_layers, stages))))
-    head = Chain(GlobalMeanPool(),
-                 MLUtils.flatten,
-                 LayerNorm(planes[end]),
-                 Dense(planes[end], nclasses))
-    return Chain(Chain(backbone), head)
+    classifier = Chain(GlobalMeanPool(), MLUtils.flatten,
+                       LayerNorm(planes[end]), Dense(planes[end], nclasses))
+    return Chain(Chain(backbone...), classifier)
 end
@@ -72,30 +75,28 @@ const CONVNEXT_CONFIGS = Dict(:tiny => ([3, 3, 9, 3], [96, 192, 384, 768]),
                               :large => ([3, 3, 27, 3], [192, 384, 768, 1536]),
                               :xlarge => ([3, 3, 27, 3], [256, 512, 1024, 2048]))

-struct ConvNeXt
-    layers::Any
-end
-@functor ConvNeXt
-
 """
-    ConvNeXt(mode::Symbol = :base; inchannels = 3, drop_path_rate = 0., λ = 1f-6, nclasses = 1000)
+    ConvNeXt(config::Symbol; inchannels::Integer = 3, nclasses::Integer = 1000)

 Creates a ConvNeXt model.
 ([reference](https://arxiv.org/abs/2201.03545))

 # Arguments

+  - `config`: The size of the model, one of `tiny`, `small`, `base`, `large` or `xlarge`.
   - `inchannels`: The number of channels in the input.
-  - `drop_path_rate`: Stochastic depth rate.
-  - `λ`: Init value for [LayerScale](https://arxiv.org/abs/2103.17239)
   - `nclasses`: number of output classes

 See also [`Metalhead.convnext`](#).
 """
-function ConvNeXt(mode::Symbol = :base; inchannels = 3, drop_path_rate = 0.0, λ = 1.0f-6,
-                  nclasses = 1000)
-    _checkconfig(mode, keys(CONVNEXT_CONFIGS))
-    layers = convnext(CONVNEXT_CONFIGS[mode]...; inchannels, drop_path_rate, λ, nclasses)
+struct ConvNeXt
+    layers::Any
+end
+@functor ConvNeXt
+
+function ConvNeXt(config::Symbol; inchannels::Integer = 3, nclasses::Integer = 1000)
+    _checkconfig(config, keys(CONVNEXT_CONFIGS))
+    layers = convnext(CONVNEXT_CONFIGS[config]...; inchannels, nclasses)
     return ConvNeXt(layers)
 end
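
After this change the top-level constructor takes only `config`, `inchannels` and `nclasses`; stochastic depth and the LayerScale init are reachable only through the lower-level `convnext` builder. A hedged sketch of both entry points, using the `:tiny` configuration values shown in the hunk header above (illustrative only):

    using Metalhead

    model = ConvNeXt(:tiny; inchannels = 3, nclasses = 1000)
    # for stochastic depth, drop to the unexported builder:
    layers = Metalhead.convnext([3, 3, 9, 3], [96, 192, 384, 768]; drop_path_rate = 0.1)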
