
Commit be419fb: add keep keyword (1 parent: f7d5743)

File tree: 7 files changed, +272 −217 lines

docs/make.jl

Lines changed: 0 additions & 1 deletion
```diff
@@ -35,7 +35,6 @@ makedocs(
         "Gradients -- Zygote.jl" => "training/zygote.md",
         "Batching Data -- MLUtils.jl" => "data/mlutils.md",
         "OneHotArrays.jl" => "data/onehot.md",
-        "Saving and Loading" => "models/saving.md"
         "Low-level Operations -- NNlib.jl" => "models/nnlib.md",
         "Nested Structures -- Functors.jl" => "models/functors.md",
     ],
```

docs/src/destructure.md

Lines changed: 8 additions & 1 deletion
````diff
@@ -72,4 +72,11 @@ Another kind of flat view of a nested model is provided by the `modules` command
 
 ```@docs
 Flux.modules
-```
+```
+
+### Saving and Loading
+
+```@docs
+Flux.loadmodel!
+Flux.state
+```
````
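These two doc entries describe the save/load workflow that replaces the deleted saving.md page. Below is a minimal sketch of that round trip; the choice of JLD2 for serialization is this sketch's assumption, not something this commit prescribes.

```julia
using Flux, JLD2

model = Chain(Dense(10 => 5, relu), Dense(5 => 2))

# Flux.state reduces the model to plain containers (named tuples, tuples,
# arrays), so it can be serialized without the layer types themselves.
jldsave("model_state.jld2"; state = Flux.state(model))

# To restore: rebuild the same architecture in code, then load the saved
# state into it with loadmodel!.
model2 = Chain(Dense(10 => 5, relu), Dense(5 => 2))
Flux.loadmodel!(model2, JLD2.load("model_state.jld2", "state"))
```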

docs/src/models/saving.md

Lines changed: 0 additions & 6 deletions
This file was deleted.

src/loading.jl

Lines changed: 33 additions & 16 deletions
````diff
@@ -103,30 +103,47 @@ function loadmodel!(dst, src; filter = _ -> true, cache = Base.IdSet())
 end
 
 """
-    state(x; full=false)
+    state(x; keep = leaf -> !(leaf isa Function))
 
 Return an object with the same nested structure as `x`
-according to `Functors.children()`, but made only of
+according to `Functors.children`, but made only of
 basic containers (e.g. named tuples, tuples, arrays, and dictionaries).
 
-If `full` is `false` (default), then only arrays and scalar original leaves are used as leaf values in the return,
-with the other leaves being replaced by `nothing`.
+This method is particularly useful for saving and loading models,
+since it doesn't require the user to specify the model type.
+The state can be passed to `loadmodel!` to restore the model.
 
-This method is particularly useful for saving and loading models, since it doesn't
-require the user to specify the model type.
-The returned state, can be passed to `loadmodel!` to restore the model.
+The `keep` function is applied to the leaves of `x`.
+If `keep(leaf)` is `false`, the leaf is replaced by `nothing`,
+otherwise it is left as is. By default, all functions are excluded.
+
+# Examples
+
+```julia-repl
+julia> m1 = Chain(Dense(1, 2, tanh), Dense(2, 1));
+
+julia> m2 = Chain(Dense(1, 2, tanh), Dense(2, 1));
+
+julia> s = Flux.state(m1)
+(layers = ((weight = Float32[-0.56867087; 1.229064;;], bias = Float32[0.0, 0.0], σ = nothing), (weight = Float32[0.23323897 -0.5561147], bias = Float32[0.0], σ = nothing)),)
+
+julia> Flux.loadmodel!(m2, s);
+
+julia> m2[1].weight == m1[1].weight
+true
+```
 """
-function state(x; full=false)
+function state(x; keep = _state_keep)
   if Functors.isleaf(x)
-    if full
-      return x
-    else
-      return x isa Union{Number, AbstractArray} ? x : nothing
-    end
+    return keep(x) ? x : nothing
   else
-    return valuemap(c -> state(c; full), Functors.children(x))
+    return _valuemap(c -> state(c; keep), Functors.children(x))
   end
 end
 
-valuemap(f, x) = map(f, x)
-valuemap(f, x::Dict) = Dict(k => f(v) for (k, v) in x)
+_state_keep(x::Function) = false
+_state_keep(x) = true
+
+# map for tuples, namedtuples, and dicts
+_valuemap(f, x) = map(f, x)
+_valuemap(f, x::Dict) = Dict(k => f(v) for (k, v) in x)
````
test/loading.jl

Lines changed: 227 additions & 0 deletions
```julia
ls(dims...) = reshape(collect(Float32, 1:prod(dims)), dims...) # accepts dims in reverse order to Dense
dl(nin, nout, bias) = Dense(ls(nout, nin), bias(nout))
dm(bias) = Chain(
  dl(3, 5, bias),
  dl(5, 4, bias),
  dl(4, 3, bias)
)

nobias(n) = false
testdense(m, bt) = @testset "Check layer $i" for (i, (l1, l2)) in enumerate(zip(m, dm(bt)))
  @test l1.weight == l2.weight
  @test l1.bias == l2.bias
  @test_skip typeof(l1.bias) === typeof(l2.bias)
end

@testset "loadmodel!(dst, src)" begin
  m1 = Chain(Dense(10, 5), Dense(5, 2, relu))
  m2 = Chain(Dense(10, 5), Dense(5, 2))
  m3 = Chain(Conv((3, 3), 3 => 16), Dense(5, 2))
  m4 = Chain(Dense(10, 6), Dense(6, 2))
  m5 = Chain(Dense(10, 5), Parallel(+, Dense(Flux.ones32(2, 5), false), Dense(5, 2)))
  m6 = Chain(Dense(10, 5), Parallel(+, Dense(5, 2), Dense(5, 2)))

  loadmodel!(m1, m2)
  # trainable parameters copy over
  @test m1[1].weight == m2[1].weight
  @test m1[1].bias == m2[1].bias
  # non-array leaves are untouched
  @test m1[2].σ == relu

  loadmodel!(m5, m6)
  # more complex nested structures also work
  @test m5[1].weight == m6[1].weight
  @test m5[2][1].weight == m6[2][1].weight
  # false bias is not overwritten
  @test m5[2][1].bias == false

  # mismatched nodes throw an error
  @test_throws ArgumentError loadmodel!(m1, m3)
  @test_throws ArgumentError loadmodel!(m1, m5)
  # size mismatches throw an error
  @test_throws DimensionMismatch loadmodel!(m1, m4)

  # tests for BatchNorm and Dropout
  m1 = Chain(Conv((3, 3), 3 => 16), BatchNorm(16), Flux.flatten, Dropout(0.2))
  m2 = Chain(Conv((3, 3), 3 => 16), BatchNorm(16), x -> reshape(x, :, size(x)[end]), Dropout(0.1))
  m2[2].μ .= rand(Float32, size(m2[2].μ)...)
  loadmodel!(m1, m2)
  # non-trainable parameters are copied as well
  @test m1[2].μ == m2[2].μ
  # functions are not copied
  @test m1[3] == Flux.flatten
  # dropout rate is not copied
  @test m1[4].p == 0.2

  # from LegolasFlux (https://github.com/beacon-biosignals/LegolasFlux.jl/blob/80569ab63a8248a8a063c76e0bbf701f4ada9bd4/examples/digits.jl#L33)
  # tests Chain(...) vs Chain([...])
  # tests MaxPool
  # tests testmode!/trainmode! is not copied
  # tests Dense, Conv, BatchNorm, Dropout (like above) but in a bigger model
  chain1 = Chain(Dropout(0.2),
                 Conv((3, 3), 1 => 32, relu),
                 BatchNorm(32, relu),
                 MaxPool((2, 2)),
                 Dropout(0.2),
                 Conv((3, 3), 32 => 16, relu),
                 Dropout(0.2),
                 MaxPool((2, 2)),
                 Dropout(0.2),
                 Conv((3, 3), 16 => 10, relu),
                 Dropout(0.2),
                 x -> reshape(x, :, size(x, 4)),
                 Dropout(0.2),
                 Dense(90, 10),
                 softmax)
  chain2 = Chain([Dropout(0.1),
                  Conv((3, 3), 1 => 32, relu),
                  BatchNorm(32, relu),
                  MaxPool((3, 3)),
                  Dropout(0.1),
                  Conv((3, 3), 32 => 16, relu),
                  Dropout(0.1),
                  MaxPool((3, 3)),
                  Dropout(0.1),
                  Conv((3, 3), 16 => 10, relu),
                  Dropout(0.1),
                  x -> reshape(x, :, size(x, 4)),
                  Dropout(0.1),
                  Dense(90, 10),
                  softmax])
  chain2[3].μ .= 5f0
  chain2[3].σ² .= 2f0
  testmode!(chain2)
  loadmodel!(chain1, chain2)
  for (dst, src) in zip(chain1, chain2)
    if dst isa Dropout
      @test dst.p == 0.2
    elseif dst isa Union{Conv, Dense}
      @test dst.weight == src.weight
      @test dst.bias == src.bias
    elseif dst isa MaxPool
      @test dst.k == (2, 2)
    elseif dst isa BatchNorm
      @test dst.μ == src.μ
      @test dst.σ² == src.σ²
      @test isnothing(dst.active)
    end
  end

  # copy only a subset of the model
  chain1[end - 1].weight .= 1f0
  chain1[3].μ .= 3f0
  chain1[2].bias .= 5f0
  loadmodel!(chain2[end - 1], chain1[end - 1])
  loadmodel!(chain2[3], chain1[3])
  @test chain2[end - 1].weight == chain1[end - 1].weight
  @test chain2[3].μ == chain1[3].μ
  @test chain2[2].bias != chain1[2].bias

  # test shared weights
  shared_dst = Dense(10 => 10)
  shared_src = Dense(10 => 10)
  # matched weights are okay
  m1 = Chain(shared_dst, Dense(shared_dst.weight))
  m2 = Chain(shared_src, Dense(shared_src.weight))
  loadmodel!(m1, m2)
  @test m1[1].weight === m1[2].weight
  @test m1[1].weight == m2[2].weight
  # mismatched weights are an error
  m2 = Chain(Dense(10 => 10), Dense(10 => 10))
  @test_throws ErrorException loadmodel!(m1, m2)
  # loading into tied weights with absent parameter is okay when the dst == zero
  b = Flux.zeros32(5)
  m1 = Chain(Dense(10 => 5; bias = b), Dense(5 => 5; bias = b))
  m2 = Chain(Dense(10 => 5; bias = Flux.zeros32(5)), Dense(5 => 5; bias = false))
  loadmodel!(m1, m2)
  @test m1[1].bias === m1[2].bias
  @test iszero(m1[1].bias)
  # loading into tied weights with absent parameter is bad when the dst != zero
  m2[1].bias .= 1
  @test_throws ErrorException loadmodel!(m1, m2)

  @testset "loadmodel! & filter" begin
    m1 = Chain(Dense(10, 5), Dense(5, 2, relu))
    m2 = Chain(Dense(10, 5), Dropout(0.2), Dense(5, 2))
    m3 = Chain(Dense(10, 5), Dense(5, 2, relu))

    # this will not error because Dropout is skipped
    loadmodel!(m1, m2; filter = x -> !(x isa Dropout))
    @test m1[1].weight == m2[1].weight
    @test m1[2].weight == m2[3].weight

    # this will not error because Dropout is skipped
    loadmodel!(m2, m3; filter = x -> !(x isa Dropout))
    @test m3[1].weight == m2[1].weight
    @test m3[2].weight == m2[3].weight
  end

  @testset "loadmodel! & absent bias" begin
    m0 = Chain(Dense(2 => 3; bias = false, init = Flux.ones32), Dense(3 => 1))
    m1 = Chain(Dense(2 => 3; bias = Flux.randn32(3)), Dense(3 => 1))
    m2 = Chain(Dense(Float32[1 2; 3 4; 5 6], Float32[7, 8, 9]), Dense(3 => 1))

    Flux.loadmodel!(m1, m2)
    @test m1[1].bias == 7:9
    @test sum(m1[1].weight) == 21

    # load from a model without bias -- should ideally recognise the `false` but `Params` doesn't store it
    m1 = Flux.loadmodel!(m1, m0)
    @test iszero(m1[1].bias)
    @test sum(m1[1].weight) == 6 # written before error

    # load into a model without bias -- should it ignore the parameter which has no home, or error?
    m0 = Flux.loadmodel!(m0, m2)
    @test iszero(m0[1].bias) # obviously unchanged
    @test sum(m0[1].weight) == 21
  end
end

@testset "loadmodel!(dst, src) with BSON" begin
  m1 = Chain(Dense(Float32[1 2; 3 4; 5 6], Float32[7, 8, 9]), Dense(3 => 1))
  m2 = Chain(Dense(Float32[0 0; 0 0; 0 0], Float32[0, 0, 0]), Dense(3 => 1))
  @test m1[1].weight != m2[1].weight
  mktempdir() do dir
    BSON.@save joinpath(dir, "test.bson") m1
    m2 = Flux.loadmodel!(m2, BSON.load(joinpath(dir, "test.bson"))[:m1])
    @test m1[1].weight == m2[1].weight
  end
end

@testset "state" begin
  m1 = Chain(Dense(10, 5), Parallel(+, Dense(Flux.ones32(2, 5), false), Dense(5 => 2)))
  m2 = Chain(Dense(10, 5), Parallel(+, Dense(Flux.zeros32(2, 5), Flux.ones32(2)), Dense(5 => 2)))
  s = Flux.state(m1)
  @test s isa NamedTuple
  @test fieldnames(typeof(s)) == (:layers,)
  @test s.layers isa Tuple
  @test length(s.layers) == 2
  @test s.layers[1].weight === m1[1].weight
  @test s.layers[1].σ === nothing
  @test s.layers[2].layers[1].weight === m1[2].layers[1].weight

  Flux.loadmodel!(m2, s)
  @test m2[1].weight == m1[1].weight
  @test all(m2[2].layers[1].bias .== m1[2].layers[1].bias)

  @testset "track active state and batch norm params" begin
    m3 = Chain(Dense(10, 5), Dropout(0.2), Dense(5, 2), BatchNorm(2))
    trainmode!(m3)
    s = Flux.state(m3)
    @test s.layers[2].active == true
    @test s.layers[2].p == 0.2
    @test s.layers[4] == (λ = nothing, β = Float32[0.0, 0.0], γ = Float32[1.0, 1.0],
            μ = Float32[0.0, 0.0], σ² = Float32[1.0, 1.0], ϵ = 1.0f-5, momentum = 0.1f0, affine = true,
            track_stats = true, active = true, chs = 2)
  end

  @testset "keep" begin
    s = Flux.state(m1, keep = x -> x isa AbstractArray)
    @test s.layers[1].weight isa AbstractArray
    @test s.layers[1].σ === nothing
    @test s.layers[2].connection === nothing
    @test s.layers[2].layers[1].bias === nothing
  end
end
```

test/runtests.jl

Lines changed: 4 additions & 0 deletions
```diff
@@ -17,6 +17,10 @@ Random.seed!(0)
   include("utils.jl")
 end
 
+@testset "Loading" begin
+  include("loading.jl")
+end
+
 @testset "Optimise / Train" begin
   include("optimise.jl")
   include("train.jl")
```
