
Commit 5bdf443

don't test Optimise module
remove Flux.params from tests
broken deprecation in __old_to_new
pen2
remove params entirely
export Optimisers
cleanup
fix ambiguity
comment
1 parent 7525499 commit 5bdf443

18 files changed, +211 −420 lines

src/Flux.jl

Lines changed: 4 additions & 7 deletions
@@ -12,6 +12,8 @@ using MLUtils
 const stack = MLUtils.stack # now exported by Base
 import Optimisers: Optimisers, trainable, destructure # before v0.13, Flux owned these functions
 using Optimisers: freeze!, thaw!, adjust!, trainables
+@reexport using Optimisers
+
 using Random: default_rng
 using Zygote, ChainRulesCore
 using Zygote: Params, @adjoint, gradient, pullback
@@ -56,13 +58,8 @@ export Chain, Dense, Embedding, EmbeddingBag,
   ))
 
 include("optimise/Optimise.jl")
-using .Optimise
-export Descent, Adam, Momentum, Nesterov, RMSProp,
-  AdaGrad, AdaMax, AdaDelta, AMSGrad, NAdam, OAdam,
-  AdamW, RAdam, AdaBelief, InvDecay, ExpDecay,
-  WeightDecay, SignDecay, ClipValue, ClipNorm
-
-export ClipGrad, OptimiserChain # these are const defined in deprecations, for ClipValue, Optimiser
+using .Optimise: Optimise
+export ClipValue # this is const defined in deprecations, for ClipGrad
 
 include("train.jl")
 using .Train
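With `@reexport using Optimisers`, the rule types Flux used to define and export from `Flux.Optimise` now come directly from Optimisers.jl. A minimal sketch of what user code looks like after this change (the model here is a made-up example, not part of the commit):

    using Flux  # Adam, Descent, OptimiserChain, ClipNorm, ... are now the Optimisers.jl types

    model = Dense(2 => 1)
    rule = OptimiserChain(ClipNorm(1.0), Adam(1e-3))
    opt_state = Flux.setup(rule, model)   # explicit-style optimiser state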

src/deprecations.jl

Lines changed: 35 additions & 13 deletions
@@ -41,31 +41,40 @@ train!(loss, ps::Params, data, opt::Optimisers.AbstractRule; cb=nothing) = error
   """)
 
 train!(loss, model, data, opt::Optimise.AbstractOptimiser; cb=nothing) =
-  train!(loss, model, data, _old_to_new(opt); cb)
+  train!(loss, model, data, __old_to_new(opt); cb)
 
 # Next, to use the new `setup` with the still-exported old-style `Adam` etc:
 import .Train: setup
-setup(rule::Optimise.AbstractOptimiser, model) = setup(_old_to_new(rule), model)
+setup(rule::Optimise.AbstractOptimiser, model) = setup(__old_to_new(rule), model)
 # ... and allow accidental use of `Optimisers.setup` to do the same:
-Optimisers.setup(rule::Optimise.AbstractOptimiser, model) = setup(_old_to_new(rule), model)
+Optimisers.setup(rule::Optimise.AbstractOptimiser, model) = setup(__old_to_new(rule), model)
+
+
+function __old_to_new(rule)
+  Base.depwarn("""Optimisers from Flux.Optimise module are deprecated.
+                  Use optimisers from Optimisers.jl instead.""", :__old_to_new)
+  return _old_to_new(rule)
+end
 
 for T in [:Descent, :Adam, :Momentum, :Nesterov,
           :AdaGrad, :AdaMax, :AdaDelta, :AMSGrad, :NAdam, :RAdam, :OAdam, :AdaBelief,
           # :InvDecay, :ExpDecay,
           :SignDecay,
          ]
-  @eval function _old_to_new(rule::$T)
+  @eval function _old_to_new(rule::Optimise.$T)
     args = map(f -> getfield(rule, f), fieldnames(Optimisers.$T))
     Optimisers.$T(args...)
   end
 end
-_old_to_new(rule::Optimiser) = Optimisers.OptimiserChain(map(_old_to_new, rule.os)...)
-const OptimiserChain = Optimise.Optimiser # lets you use new name with implicit params too.
-_old_to_new(rule::WeightDecay) = Optimisers.WeightDecay(rule.wd) # called lambda now
-_old_to_new(rule::ClipNorm) = Optimisers.ClipNorm(rule.thresh) # called omega, and there are more fields
-_old_to_new(rule::ClipValue) = Optimisers.ClipGrad(rule.thresh) # called delta now, and struct name differs
-const ClipGrad = Optimise.ClipValue
-_old_to_new(rule::RMSProp) = Optimisers.RMSProp(rule.eta, rule.rho, rule.epsilon) # RMSProp has no field centred
+_old_to_new(rule::Optimise.Optimiser) = Optimisers.OptimiserChain(map(_old_to_new, rule.os)...)
+# const OptimiserChain = Optimise.Optimiser # lets you use new name with implicit params too.
+const Optimiser = Optimisers.OptimiserChain
+_old_to_new(rule::Optimise.WeightDecay) = Optimisers.WeightDecay(rule.wd) # called lambda now
+_old_to_new(rule::Optimise.ClipNorm) = Optimisers.ClipNorm(rule.thresh) # called omega, and there are more fields
+_old_to_new(rule::Optimise.ClipValue) = Optimisers.ClipGrad(rule.thresh) # called delta now, and struct name differs
+# const ClipGrad = Optimise.ClipValue
+const ClipValue = Optimisers.ClipGrad
+_old_to_new(rule::Optimise.RMSProp) = Optimisers.RMSProp(rule.eta, rule.rho, rule.epsilon) # RMSProp has no field centred
 
 _old_to_new(rule) = error("Flux.setup does not know how to translate this old-style implicit rule to a new-style Optimisers.jl explicit rule")
 
@@ -83,8 +92,21 @@ function update!(opt::Optimise.AbstractOptimiser, model, grad)
   # to accept only arrays. Remove if this causes problems!
   # update!(opt::Flux.Optimise.AbstractOptimiser, x::AbstractArray, x̄)
   error("""Invalid input to `update!`.
-    * For the implicit style, this needs `update(::AbstractOptimiser, ::Params, ::Grads)`
-    * For the explicit style, `update(state, model, grad)` needs `state = Flux.setup(opt, model)`.
+    * For the implicit style, this needs `update!(::AbstractOptimiser, ::Params, ::Grads)`
+    * For the explicit style, `update!(state, model, grad)` needs `state = Flux.setup(opt, model)`.
+    """)
+end
+
+# TODO this friendly error should go in Optimisers.jl.
+# remove after https://github.com/FluxML/Optimisers.jl/pull/181
+function update!(opt::Optimisers.AbstractRule, model, grad)
+  error("""Invalid input to `update!`.
+    `update!(state, model, grad)` needs `state = Flux.setup(opt, model)`.
+    """)
+end
+function update!(opt::Optimisers.AbstractRule, model::Chain, grad::Tuple)
+  error("""Invalid input to `update!`.
+    `update!(state, model, grad)` needs `state = Flux.setup(opt, model)`.
     """)
 end
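The new `__old_to_new` wrapper adds a `depwarn` on top of the existing `_old_to_new` translation, so old-style rules keep working through `Flux.setup` while nudging users toward the Optimisers.jl types. A rough sketch of the intended behaviour (hypothetical usage, not code from the diff):

    using Flux

    model = Dense(3 => 2)
    old_rule = Flux.Optimise.Adam(0.01)      # old implicit-style struct, still defined in Flux.Optimise
    opt_state = Flux.setup(old_rule, model)  # warns via __old_to_new, then acts like Optimisers.Adam(0.01)

    # non-deprecated equivalent, using the re-exported Optimisers.jl rule:
    opt_state = Flux.setup(Adam(0.01), model)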

src/layers/conv.jl

Lines changed: 2 additions & 2 deletions
@@ -145,7 +145,7 @@ Conv((3,), 4 => 5, σ) # 65 parameters
 julia> layer(randn(100, 4, 64)) |> size
 (98, 5, 64)
 
-julia> Flux.params(layer) |> length
+julia> Flux.trainables(layer) |> length
 2
 ```
 """
@@ -294,7 +294,7 @@ ConvTranspose((3,), 5 => 4, σ) # 64 parameters
 julia> layer(randn(100, 5, 64)) |> size # transposed convolution will increase the dimension size (upsampling)
 (102, 4, 64)
 
-julia> Flux.params(layer) |> length
+julia> Flux.trainables(layer) |> length
 2
 ```
 """

src/layers/show.jl

Lines changed: 5 additions & 5 deletions
@@ -104,15 +104,15 @@ function _layer_show(io::IO, layer, indent::Int=0, name=nothing)
   _str = isnothing(name) ? "" : "$name = "
   str = _str * _layer_string(io, layer)
   print(io, " "^indent, str, indent==0 ? "" : ",")
-  if !isempty(params(layer))
+  if !isempty(trainables(layer))
     print(io, " "^max(2, (indent==0 ? 20 : 39) - indent - length(str)))
-    printstyled(io, "# ", underscorise(sum(length, params(layer); init=0)), " parameters";
+    printstyled(io, "# ", underscorise(sum(length, trainables(layer); init=0)), " parameters";
       color=:light_black)
-    nonparam = _childarray_sum(length, layer) - sum(length, params(layer), init=0)
+    nonparam = _childarray_sum(length, layer) - sum(length, trainables(layer), init=0)
     if nonparam > 0
       printstyled(io, ", plus ", underscorise(nonparam), indent==0 ? " non-trainable" : ""; color=:light_black)
     end
-    _nan_show(io, params(layer))
+    _nan_show(io, trainables(layer))
   end
   indent==0 || println(io)
 end
@@ -127,7 +127,7 @@ function _layer_string(::IO, a::AbstractArray)
 end
 
 function _big_finale(io::IO, m)
-  ps = params(m)
+  ps = trainables(m)
   if length(ps) > 2
     pars = underscorise(sum(length, ps; init=0))
     bytes = Base.format_bytes(Base.summarysize(m))
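Since the layer summary only needs the sizes of the trainable arrays, swapping `params` for `trainables` keeps the printed counts the same. A quick sketch of the quantity `_layer_show` reports (the `Chain` here is a made-up example):

    using Flux

    m = Chain(Dense(2 => 3), Dense(3 => 1))
    sum(length, Flux.trainables(m); init=0)   # 13, the number the summary prints via underscorise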

src/outputsize.jl

Lines changed: 0 additions & 2 deletions
@@ -302,8 +302,6 @@ function ChainRulesCore.rrule(::typeof(striplazy), m)
   striplazy(m), _ -> error("striplazy should never be used within a gradient")
 end
 
-params!(p::Params, x::LazyLayer, seen = IdSet()) = error("LazyLayer should never be used within params(m). Call striplazy(m) first.")
-
 Functors.functor(::Type{<:LazyLayer}, x) = error("LazyLayer should not be walked with Functors.jl, as the arrays which Flux.gpu wants to move may not exist yet.")
 
 function Base.show(io::IO, l::LazyLayer)

test/data.jl

Lines changed: 6 additions & 4 deletions
@@ -80,18 +80,20 @@ using Random
     # test interaction with `train!`
     θ = ones(2)
     X = zeros(2, 10)
-    loss(x) = sum((x .- θ).^2)
+    loss(θ, x) = sum((x .- θ).^2)
     d = DataLoader(X)
-    Flux.train!(loss, Params([θ]), ncycle(d, 10), Descent(0.1))
+    opt_state = Flux.setup(Descent(0.1), θ)
+    Flux.train!(loss, θ, ncycle(d, 10), opt_state)
     @test norm(θ) < 1e-4
 
     # test interaction with `train!`
     θ = zeros(2)
     X = ones(2, 10)
     Y = fill(2, 10)
-    loss(x, y) = sum((y - x'*θ).^2)
+    loss(θ, x, y) = sum((y - x'*θ).^2)
     d = DataLoader((X, Y))
-    Flux.train!(loss, Params([θ]), ncycle(d, 10), Descent(0.1))
+    opt_state = Flux.setup(Descent(0.1), θ)
+    Flux.train!(loss, θ, ncycle(d, 10), opt_state)
     @test norm(θ .- 1) < 1e-10
 
     # specify the rng
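These tests now follow the explicit `train!` signature: the loss takes the model (here just a parameter vector) as its first argument, and the optimiser state comes from `Flux.setup`. A condensed sketch of that pattern with made-up data:

    using Flux
    using IterTools: ncycle   # used by the test above to repeat the data 10 times

    θ = ones(2)                              # the "model" is a bare parameter vector
    data = DataLoader(zeros(2, 10))          # yields one column per batch by default
    loss(θ, x) = sum((x .- θ) .^ 2)
    opt_state = Flux.setup(Descent(0.1), θ)
    Flux.train!(loss, θ, ncycle(data, 10), opt_state)   # drives θ towards the data (zeros)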

test/ext_cuda/cuda.jl

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,7 @@ CUDA.allowscalar(false)
   m = Chain(Dense(10, 5, tanh), Dense(5, 2), softmax)
   cm = gpu(m)
 
-  @test all(p isa CuArray for p in Flux.params(cm))
+  @test all(p isa CuArray for p in Flux.trainables(cm))
   @test cm(gpu(rand(10, 10))) isa CuArray{Float32,2}
 
   xs = rand(5, 5)

test/ext_cuda/curnn.jl

Lines changed: 55 additions & 0 deletions
@@ -0,0 +1,55 @@
+
+@testset "RNN" begin
+  @testset for R in [RNN, GRU, LSTM, GRUv3], batch_size in (1, 5)
+    rnn = R(10, 5)
+    curnn = rnn |> gpu
+
+    Flux.reset!(rnn)
+    Flux.reset!(curnn)
+    x = batch_size == 1 ?
+        rand(Float32, 10) :
+        rand(Float32, 10, batch_size)
+    cux = gpu(x)
+
+    y, back = pullback((r, x) -> r(x), rnn, x)
+    cuy, cuback = pullback((r, x) -> r(x), curnn, cux)
+
+    @test y ≈ collect(cuy)
+
+    ȳ = randn(size(y))
+    m̄, x̄ = back(ȳ)
+    cum̄, cux̄ = cuback(gpu(ȳ))
+
+    @test x̄ ≈ collect(cux̄)
+    @test m̄[].cell.Wi ≈ collect(cum̄[].cell.Wi)
+    @test m̄[].cell.Wh ≈ collect(cum̄[].cell.Wh)
+    @test m̄[].cell.b ≈ collect(cum̄[].cell.b)
+    if m̄[].state isa Tuple
+      for (x, cx) in zip(m̄[].state, cum̄[].state)
+        @test x ≈ collect(cx)
+      end
+    else
+      @test m̄[].state ≈ collect(cum̄[].state)
+    end
+
+    Flux.reset!(rnn)
+    Flux.reset!(curnn)
+    ohx = batch_size == 1 ?
+          Flux.onehot(rand(1:10), 1:10) :
+          Flux.onehotbatch(rand(1:10, batch_size), 1:10)
+    cuohx = gpu(ohx)
+    y = (rnn(ohx); rnn(ohx))
+
+    cuy = (curnn(cuohx); curnn(cuohx))
+    @test y ≈ collect(cuy)
+
+    Flux.reset!(rnn)
+    Flux.reset!(curnn)
+    fx = rand(Float32, 10, batch_size, 3)
+    cufx = gpu(fx)
+    fy = (rnn(fx); rnn(fx))
+
+    cufy = (curnn(cufx); curnn(cufx))
+    @test fy ≈ collect(cufy)
+  end
+end
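This new CUDA test compares a CPU and a GPU copy of each recurrent layer through `Zygote.pullback`, which returns the forward output together with a closure for the backward pass. A tiny CPU-only illustration of that mechanism, assuming the pre-0.15 recurrent API (`RNN(in, out)`, `Flux.reset!`) that this file targets:

    using Flux, Zygote

    rnn = RNN(10, 5)
    x = rand(Float32, 10)
    y, back = Zygote.pullback((r, x) -> r(x), rnn, x)   # forward pass plus pullback closure
    ȳ = randn(Float32, size(y))
    m̄, x̄ = back(ȳ)                # gradients with respect to the model and the input
    size(x̄) == size(x)            # true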

test/ext_cuda/layers.jl

Lines changed: 20 additions & 42 deletions
@@ -110,17 +110,17 @@ end
    l = cl((2,2), 1=>3, bias = false) |> gpu
    ip = zeros(Float32, 28,28,1,1) |> gpu
    @test sum(l(ip)) ≈ 0.f0
-    gs = gradient(() -> sum(l(ip)), Flux.params(l))
-    @test l.bias ∉ gs.params
+    gs = gradient(l -> sum(l(ip)), l)[1]
+    @test gs.bias === nothing
  end

  @testset "Dense without bias" begin
    l = Dense(ones(Float32, 4, 3), false) |> gpu
    ip = zeros(Float32, 3, 7) |> gpu

    @test sum(l(ip)) ≈ 0.f0
-    gs = gradient(() -> sum(l(ip)), Flux.params(l))
-    @test l.bias ∉ gs.params
+    gs = gradient(l -> sum(l(ip)), l)[1]
+    @test gs.bias === nothing
  end

  @testset "Extended BatchNorm" begin
@@ -133,13 +133,13 @@ end
    μ_cpu = copy(m_cpu.μ)
    m_cpu(x_cpu)
    @test m_cpu.μ ≈ μ_cpu
-    gradient(() -> sum(m_cpu(x_cpu)), Flux.params(m_cpu))
+    gradient(m_cpu -> sum(m_cpu(x_cpu)), m_cpu)
    @test !(m_cpu.μ ≈ μ_cpu)

    μ_gpu = copy(m_gpu.μ)
    m_gpu(x_gpu)
    @test m_gpu.μ ≈ μ_gpu
-    gradient(() -> sum(m_gpu(x_gpu)), Flux.params(m_gpu))
+    gradient(m_gpu -> sum(m_gpu(x_gpu)), m_gpu)
    @test !(m_gpu.μ ≈ μ_gpu)

    @test Array(m_gpu.μ) ≈ m_cpu.μ
@@ -149,14 +149,14 @@ end
    μ_cpu = copy(m_cpu.μ)
    m_cpu(x_cpu)
    @test m_cpu.μ ≈ μ_cpu
-    gradient(() -> sum(m_cpu(x_cpu)), Flux.params(m_cpu))
+    gradient(m_cpu -> sum(m_cpu(x_cpu)), m_cpu)
    @test m_cpu.μ ≈ μ_cpu

    testmode!(m_gpu)
    μ_gpu = copy(m_gpu.μ)
    m_gpu(x_gpu)
    @test m_gpu.μ ≈ μ_gpu
-    gradient(() -> sum(m_gpu(x_gpu)), Flux.params(m_gpu))
+    gradient(m_gpu -> sum(m_gpu(x_gpu)), m_gpu)
    @test m_gpu.μ ≈ μ_gpu

    ## In trainmode, always track statistics
@@ -165,52 +165,36 @@ end
    m_cpu(x_cpu)
    @test !(m_cpu.μ ≈ μ_cpu)
    μ_cpu = copy(m_cpu.μ)
-    gradient(() -> sum(m_cpu(x_cpu)), Flux.params(m_cpu))
+    gradient(m_cpu -> sum(m_cpu(x_cpu)), m_cpu)
    @test !(m_cpu.μ ≈ μ_cpu)

    trainmode!(m_gpu)
    μ_gpu = copy(m_gpu.μ)
    m_gpu(x_gpu)
    @test !(m_gpu.μ ≈ μ_gpu)
    μ_gpu = copy(m_gpu.μ)
-    gradient(() -> sum(m_gpu(x_gpu)), Flux.params(m_gpu))
+    gradient(m_gpu -> sum(m_gpu(x_gpu)), m_gpu)
    @test !(m_gpu.μ ≈ μ_gpu)
-
-    ## No errors if input type mistmatch
-    # x_cpu = rand(Float64, 3, 2, 2)
-    # x_gpu = x_cpu |> gpu
-    # m_cpu(x_cpu)
-    # gradient(() -> sum(m_cpu(x_cpu)), Flux.params(m_cpu))
-    # m_gpu(x_gpu)
-    # gradient(() -> sum(m_gpu(x_gpu)), Flux.params(m_gpu))
  end

  @testset "Two-streams Bilinear" begin
    x = zeros(Float32,10,9) |> gpu
    y = zeros(Float32,2,9) |> gpu
    b = Flux.Bilinear(10, 2, 3) |> gpu
-    @test size(b(x,y)) == (3,9)
-    @test sum(abs2, b(x,y)) ≈ 0f0
-    gs_gpu = gradient(() -> sum(abs2.(b(x, y))), params(b))
-    b_cpu, x_cpu, y_cpu = b |> cpu, x |> cpu, y |> cpu
-    gs_cpu = gradient(() -> sum(abs2.(b_cpu(x_cpu, y_cpu))), params(b_cpu))
-    for (pgpu, pcpu) in zip(params(b), params(b_cpu))
-      @test gs_cpu[pcpu] ≈ Array(gs_gpu[pgpu])
-    end
+    @test size(b(x, y)) == (3,9)
+    @test sum(abs2, b(x, y)) ≈ 0f0
+    test_gradients(b |> cpu, x |> cpu, y |> cpu,
+      test_gpu=true, compare_finite_diff=false, loss=o -> mean(abs2, o))
  end

  @testset "Two-streams Bilinear" begin
    x = zeros(Float32,10,9) |> gpu
    y = zeros(Float32,2,9) |> gpu
    b = Flux.Bilinear(10, 2, 3) |> gpu
-    @test size(b(x,y)) == (3,9)
-    @test sum(abs2, b(x,y)) ≈ 0f0
-    gs_gpu = gradient(() -> sum(abs2.(b(x, y))), params(b))
-    b_cpu, x_cpu, y_cpu = b |> cpu, x |> cpu, y |> cpu
-    gs_cpu = gradient(() -> sum(abs2.(b_cpu(x_cpu, y_cpu))), params(b_cpu))
-    for (pgpu, pcpu) in zip(params(b), params(b_cpu))
-      @test gs_cpu[pcpu] ≈ Array(gs_gpu[pgpu])
-    end
+    @test size(b(x, y)) == (3,9)
+    @test sum(abs2, b(x, y)) ≈ 0f0
+    test_gradients(b |> cpu, x |> cpu, y |> cpu,
+      test_gpu=true, compare_finite_diff=false, loss=o -> mean(abs2, o))
  end

  @testset "Parallel" begin
@@ -228,15 +212,9 @@ end
    end

    @testset "gradient" begin
-      input_cpu = randn(10, 10, 10, 10)
-      input_gpu = input_cpu |> gpu
      layer_cpu = Parallel(+, x -> zero(x), identity)
-      layer_gpu = layer_cpu |> gpu
-      gs_cpu = gradient(() -> sum(abs2.(layer_cpu(input_cpu))), params(layer_cpu))
-      gs_gpu = gradient(() -> sum(abs2.(layer_gpu(input_gpu))), params(layer_gpu))
-      for (pgpu, pcpu) in zip(params(layer_cpu), params(layer_gpu))
-        @test gs_cpu[pcpu] ≈ gs_gpu[pgpu]
-      end
+      test_gradients(layer_cpu, randn(5, 5, 5, 5),
+        test_gpu=true, compare_finite_diff=false, loss=o -> mean(abs2, o))
    end
  end
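The rewritten GPU layer tests take gradients with respect to the model itself instead of an implicit `Params` set, so the result is a structural gradient mirroring the layer, and a missing bias simply shows up as `nothing`. A minimal CPU-only sketch of that pattern (independent of the CUDA setup above):

    using Flux

    l = Dense(ones(Float32, 4, 3), false)   # Dense layer built without a bias
    ip = zeros(Float32, 3, 7)
    gs = gradient(l -> sum(l(ip)), l)[1]    # NamedTuple gradient, explicit style
    size(gs.weight)                         # (4, 3): one gradient entry per weight
    gs.bias === nothing                     # true: no bias to differentiate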
