
Commit b314e91

Implement LogDensityFunctionWithGrad
1 parent 99d40c0 commit b314e91

10 files changed, +115 -101 lines changed

HISTORY.md

+9
@@ -49,6 +49,15 @@ This release removes the feature of `VarInfo` where it kept track of which varia
 
 This change also affects sampling in Turing.jl.
 
+**Other changes**
+
+LogDensityProblemsAD is now removed as a dependency.
+Instead of constructing a `LogDensityProblemsAD.ADgradient` object, we now directly use `DifferentiationInterface` to calculate the gradient of the log density with respect to model parameters.
+
+In practice, this means that if you want to calculate the gradient for a model, you can do:
+
+TODO(penelopeysm): Finish this
+
 ## 0.34.2
 
 - Fixed bugs in ValuesAsInModelContext as well as DebugContext where underlying PrefixContexts were not being applied.
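The HISTORY entry above still carries a TODO where the usage snippet should go. As a placeholder, here is a rough sketch of the new workflow using the API added in this commit (the demo model, data, and parameter values are purely illustrative and not part of the commit):

    using DynamicPPL, Distributions, LogDensityProblems
    using ADTypes: AutoForwardDiff
    using ForwardDiff  # the chosen AD backend package must be loaded

    @model function demo(y)
        m ~ Normal()
        y ~ Normal(m, 1)
    end

    model = demo(1.0)

    # 0th order: log density only
    ldf = DynamicPPL.LogDensityFunction(model)
    LogDensityProblems.logdensity(ldf, [0.5])

    # 1st order: pair the LogDensityFunction with an AD backend
    ldf_grad = DynamicPPL.LogDensityFunctionWithGrad(ldf, AutoForwardDiff())
    logp, grad = LogDensityProblems.logdensity_and_gradient(ldf_grad, [0.5])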

Project.toml

-3
@@ -29,7 +29,6 @@ ZygoteRules = "700de1a5-db45-46bc-99cf-38207098b444"
 [weakdeps]
 ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
 EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"
-ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
 JET = "c3a54625-cd67-489e-a8e7-0a5a0ff4e31b"
 MCMCChains = "c7f686f2-ff18-58e9-bc7b-31028e88f75d"
 Mooncake = "da2b9cff-9c12-43a0-ae48-6db2b0edb7d6"
@@ -38,7 +37,6 @@ ZygoteRules = "700de1a5-db45-46bc-99cf-38207098b444"
 [extensions]
 DynamicPPLChainRulesCoreExt = ["ChainRulesCore"]
 DynamicPPLEnzymeCoreExt = ["EnzymeCore"]
-DynamicPPLForwardDiffExt = ["ForwardDiff"]
 DynamicPPLJETExt = ["JET"]
 DynamicPPLMCMCChainsExt = ["MCMCChains"]
 DynamicPPLMooncakeExt = ["Mooncake"]
@@ -58,7 +56,6 @@ DifferentiationInterface = "0.6.39"
 Distributions = "0.25"
 DocStringExtensions = "0.9"
 EnzymeCore = "0.6 - 0.8"
-ForwardDiff = "0.10"
 JET = "0.9"
 KernelAbstractions = "0.9.33"
 LinearAlgebra = "1.6"

docs/src/api.md

+2 -1
@@ -54,10 +54,11 @@ logjoint
 
 ### LogDensityProblems.jl interface
 
-The [LogDensityProblems.jl](https://github.com/tpapp/LogDensityProblems.jl) interface is also supported by simply wrapping a [`Model`](@ref) in a `DynamicPPL.LogDensityFunction`:
+The [LogDensityProblems.jl](https://github.com/tpapp/LogDensityProblems.jl) interface is also supported by wrapping a [`Model`](@ref) in a `DynamicPPL.LogDensityFunction` or `DynamicPPL.LogDensityFunctionWithGrad`.
 
 ```@docs
 DynamicPPL.LogDensityFunction
+DynamicPPL.LogDensityFunctionWithGrad
 ```
 
 ## Condition and decondition
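(Not part of the diff, just for orientation on the interface documented above: the two wrappers advertise different orders via `LogDensityProblems.capabilities`, which is how downstream code discovers whether gradients are available. Assuming `model` is any DynamicPPL model and ForwardDiff is loaded:)

    ldf = DynamicPPL.LogDensityFunction(model)
    LogDensityProblems.capabilities(typeof(ldf))       # LogDensityOrder{0}(): density only
    ldf_grad = DynamicPPL.LogDensityFunctionWithGrad(ldf, AutoForwardDiff())
    LogDensityProblems.capabilities(typeof(ldf_grad))  # LogDensityOrder{1}(): density and gradient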

ext/DynamicPPLForwardDiffExt.jl

-27
This file was deleted.

src/logdensityfunction.jl

+55 -33
@@ -4,6 +4,10 @@ import DifferentiationInterface as DI
     LogDensityFunction
 
 A callable representing a log density function of a `model`.
+`DynamicPPL.LogDensityFunction` implements the LogDensityProblems.jl interface,
+but only to 0th-order, i.e. it is only possible to calculate the log density,
+and not its gradient. If you need to calculate the gradient as well, you have
+to construct a [`DynamicPPL.LogDensityFunctionWithGrad`](@ref) object.
 
 # Fields
 $(FIELDS)
@@ -55,16 +59,6 @@ struct LogDensityFunction{V,M,C}
     context::C
 end
 
-# TODO: Deprecate.
-function LogDensityFunction(
-    varinfo::AbstractVarInfo,
-    model::Model,
-    sampler::AbstractSampler,
-    context::AbstractContext,
-)
-    return LogDensityFunction(varinfo, model, SamplingContext(sampler, context))
-end
-
 function LogDensityFunction(
     model::Model,
     varinfo::AbstractVarInfo=VarInfo(model),
@@ -94,48 +88,76 @@ function setmodel(f::DynamicPPL.LogDensityFunction, model::DynamicPPL.Model)
     return Accessors.@set f.model = model
 end
 
-# HACK: heavy usage of `AbstractSampler` for, well, _everything_, is being phased out. In the mean time
-# we need to define these annoying methods to ensure that we stay compatible with everything.
-getsampler(f::LogDensityFunction) = getsampler(getcontext(f))
-hassampler(f::LogDensityFunction) = hassampler(getcontext(f))
-
 """
     getparams(f::LogDensityFunction)
 
 Return the parameters of the wrapped varinfo as a vector.
 """
 getparams(f::LogDensityFunction) = f.varinfo[:]
 
-# LogDensityProblems interface
-function LogDensityProblems.logdensity(f::LogDensityFunction, θ::AbstractVector)
+# LogDensityProblems interface: logp (0th order)
+function LogDensityProblems.logdensity(f::LogDensityFunction, x::AbstractVector)
     context = getcontext(f)
-    vi_new = unflatten(f.varinfo, θ)
+    vi_new = unflatten(f.varinfo, x)
     return getlogp(last(evaluate!!(f.model, vi_new, context)))
 end
+function _flipped_logdensity(x::AbstractVector, f::LogDensityFunction)
+    return LogDensityProblems.logdensity(f, x)
+end
 function LogDensityProblems.capabilities(::Type{<:LogDensityFunction})
     return LogDensityProblems.LogDensityOrder{0}()
 end
 # TODO: should we instead implement and call on `length(f.varinfo)` (at least in the cases where no sampler is involved)?
 LogDensityProblems.dimension(f::LogDensityFunction) = length(getparams(f))
 
-_flipped_logdensity(θ, f) = LogDensityProblems.logdensity(f, θ)
+# LogDensityProblems interface: gradient (1st order)
+"""
+    LogDensityFunctionWithGrad(ldf::DynamicPPL.LogDensityFunction, adtype::ADTypes.AbstractADType)
+
+A callable representing a log density function of a `model`.
+`DynamicPPL.LogDensityFunctionWithGrad` implements the LogDensityProblems.jl
+interface to 1st-order, meaning that you can both calculate the log density
+using
+
+    LogDensityProblems.logdensity(f, x)
+
+and its gradient using
+
+    LogDensityProblems.logdensity_and_gradient(f, x)
+
+where `f` is a `LogDensityFunctionWithGrad` object and `x` is a vector of parameters.
+
+# Fields
+$(FIELDS)
+"""
+struct LogDensityFunctionWithGrad{V,M,C,TAD<:ADTypes.AbstractADType}
+    ldf::LogDensityFunction{V,M,C}
+    adtype::TAD
+    prep::DI.GradientPrep
+
+    function LogDensityFunctionWithGrad(
+        ldf::LogDensityFunction{V,M,C}, adtype::TAD
+    ) where {V,M,C,TAD}
+        # Get a set of dummy params to use for prep and concretise type
+        x = map(identity, getparams(ldf))
+        prep = DI.prepare_gradient(_flipped_logdensity, adtype, x, DI.Constant(ldf))
+        # Store the prep with the struct
+        return new{V,M,C,TAD}(ldf, adtype, prep)
+    end
+end
+function LogDensityProblems.logdensity(f::LogDensityFunctionWithGrad, x::AbstractVector)
+    return LogDensityProblems.logdensity(f.ldf, x)
+end
+function LogDensityProblems.capabilities(::Type{<:LogDensityFunctionWithGrad})
+    return LogDensityProblems.LogDensityOrder{1}()
+end
 # By default, the AD backend to use is inferred from the context, which would
 # typically be a SamplingContext which contains a sampler.
 function LogDensityProblems.logdensity_and_gradient(
-    f::LogDensityFunction, θ::AbstractVector
-)
-    adtype = getadtype(getsampler(getcontext(f)))
-    return LogDensityProblems.logdensity_and_gradient(f, θ, adtype)
-end
-
-# Extra method allowing one to manually specify the AD backend to use, thus
-# overriding the default AD backend inferred from the sampler.
-function LogDensityProblems.logdensity_and_gradient(
-    f::LogDensityFunction, θ::AbstractVector, adtype::ADTypes.AbstractADType
+    f::LogDensityFunctionWithGrad, x::AbstractVector
 )
-    # Ensure we concretise the elements of the params.
-    θ = map(identity, θ) # TODO: Is this needed?
-    prep = DI.prepare_gradient(_flipped_logdensity, adtype, θ, DI.Constant(f))
-    return DI.value_and_gradient(_flipped_logdensity, prep, adtype, θ, DI.Constant(f))
+    x = map(identity, x) # Concretise type
+    return DI.value_and_gradient(
        _flipped_logdensity, f.prep, f.adtype, x, DI.Constant(f.ldf)
+    )
 end
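One design note on the diff above, with a sketch that is not part of the commit (the function `g`, backend, and values are made up for illustration): `DI.prepare_gradient` can be expensive (for example, building a ReverseDiff tape), so `LogDensityFunctionWithGrad` runs it once in its constructor and stores the result, rather than re-preparing inside every `logdensity_and_gradient` call as the removed code did. The underlying DifferentiationInterface pattern looks roughly like this:

    using DifferentiationInterface
    using ADTypes: AutoForwardDiff
    import ForwardDiff  # backend package must be loaded for AutoForwardDiff to work

    # The differentiated argument comes first; extra data is passed as a Constant context,
    # mirroring how `_flipped_logdensity(x, f)` takes the parameters first and the
    # LogDensityFunction as a constant.
    g(x, c) = c * sum(abs2, x)

    backend = AutoForwardDiff()
    x0 = [1.0, 2.0, 3.0]
    # Prepare once (potentially costly), then reuse the preparation for every call
    prep = prepare_gradient(g, backend, x0, Constant(2.0))
    val, grad = value_and_gradient(g, prep, backend, x0, Constant(2.0))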
src/sampler.jl

-3
@@ -54,9 +54,6 @@ Sampler(alg) = Sampler(alg, Selector())
 Sampler(alg, model::Model) = Sampler(alg, model, Selector())
 Sampler(alg, model::Model, s::Selector) = Sampler(alg, s)
 
-# Extract the AD type from the underlying algorithm
-getadtype(s::Sampler) = getadtype(s.alg)
-
 # AbstractMCMC interface for SampleFromUniform and SampleFromPrior
 function AbstractMCMC.step(
     rng::Random.AbstractRNG,

test/ad.jl

+40 -19
@@ -1,31 +1,52 @@
+using DynamicPPL: LogDensityFunction, LogDensityFunctionWithGrad
+
 @testset "AD: ForwardDiff, ReverseDiff, and Mooncake" begin
     @testset "$(m.f)" for m in DynamicPPL.TestUtils.DEMO_MODELS
        rand_param_values = DynamicPPL.TestUtils.rand_prior_true(m)
        vns = DynamicPPL.TestUtils.varnames(m)
        varinfos = DynamicPPL.TestUtils.setup_varinfos(m, rand_param_values, vns)
 
        @testset "$(short_varinfo_name(varinfo))" for varinfo in varinfos
-            f = DynamicPPL.LogDensityFunction(m, varinfo)
-            # convert to `Vector{Float64}` to avoid `ReverseDiff` initializing the gradients to Integer 0
-            # reference: https://github.com/TuringLang/DynamicPPL.jl/pull/571#issuecomment-1924304489
-            θ = convert(Vector{Float64}, varinfo[:])
+            f = LogDensityFunction(m, varinfo)
+            x = DynamicPPL.getparams(f)
             # Calculate reference logp + gradient of logp using ForwardDiff
             default_adtype = ADTypes.AutoForwardDiff()
+            ldf_with_grad = LogDensityFunctionWithGrad(f, default_adtype)
             ref_logp, ref_grad = LogDensityProblems.logdensity_and_gradient(
-                f, θ, default_adtype
+                ldf_with_grad, x
             )
 
             @testset "$adtype" for adtype in [
-                ADTypes.AutoReverseDiff(; compile=false),
-                ADTypes.AutoReverseDiff(; compile=true),
-                ADTypes.AutoMooncake(; config=nothing),
+                AutoReverseDiff(; compile=false),
+                AutoReverseDiff(; compile=true),
+                AutoMooncake(; config=nothing),
             ]
-                # Mooncake can't currently handle something that is going on in
-                # SimpleVarInfo{<:VarNamedVector}. Disable all SimpleVarInfo tests for now.
-                if adtype isa ADTypes.AutoMooncake && varinfo isa DynamicPPL.SimpleVarInfo
-                    @test_broken 1 == 0
+                @info "Testing AD on: $(m.f) - $(short_varinfo_name(varinfo)) - $adtype"
+
+                # Put predicates here to avoid long lines
+                is_mooncake = adtype isa AutoMooncake
+                is_1_10 = v"1.10" <= VERSION < v"1.11"
+                is_1_11 = v"1.11" <= VERSION < v"1.12"
+                is_svi_vnv = varinfo isa SimpleVarInfo{<:DynamicPPL.VarNamedVector}
+                is_svi_od = varinfo isa SimpleVarInfo{<:OrderedDict}
+
+                # Mooncake doesn't work with several combinations of SimpleVarInfo.
+                if is_mooncake && is_1_11 && is_svi_vnv
+                    # https://github.com/compintell/Mooncake.jl/issues/470
+                    @test_throws ArgumentError LogDensityFunctionWithGrad(f, adtype)
+                elseif is_mooncake && is_1_10 && is_svi_vnv
+                    # TODO: report upstream
+                    @test_throws UndefRefError LogDensityFunctionWithGrad(f, adtype)
+                elseif is_mooncake && is_1_10 && is_svi_od
+                    # TODO: report upstream
+                    @test_throws Mooncake.MooncakeRuleCompilationError LogDensityFunctionWithGrad(
+                        f, adtype
+                    )
                 else
-                    logp, grad = LogDensityProblems.logdensity_and_gradient(f, θ, adtype)
+                    ldf_with_grad = LogDensityFunctionWithGrad(f, adtype)
+                    logp, grad = LogDensityProblems.logdensity_and_gradient(
+                        ldf_with_grad, x
+                    )
                     @test grad ≈ ref_grad
                     @test logp ≈ ref_logp
                 end
@@ -62,15 +83,15 @@
        # of implementation
        struct MyEmptyAlg end
        DynamicPPL.getspace(::DynamicPPL.Sampler{MyEmptyAlg}) = ()
-        DynamicPPL.assume(rng, ::DynamicPPL.Sampler{MyEmptyAlg}, dist, vn, vi) =
-            DynamicPPL.assume(dist, vn, vi)
+        DynamicPPL.assume(
+            ::Random.AbstractRNG, ::DynamicPPL.Sampler{MyEmptyAlg}, dist, vn, vi
+        ) = DynamicPPL.assume(dist, vn, vi)
 
        # Compiling the ReverseDiff tape used to fail here
        spl = Sampler(MyEmptyAlg())
        vi = VarInfo(model)
-        ldf = DynamicPPL.LogDensityFunction(vi, model, SamplingContext(spl))
-        @test LogDensityProblems.logdensity_and_gradient(
-            ldf, vi[:], AutoReverseDiff(; compile=true)
-        ) isa Any
+        ldf = LogDensityFunction(vi, model, SamplingContext(spl))
+        ldf_grad = LogDensityFunctionWithGrad(ldf, AutoReverseDiff(; compile=true))
+        @test LogDensityProblems.logdensity_and_gradient(ldf_grad, vi[:]) isa Any
    end
 end

test/ext/DynamicPPLForwardDiffExt.jl

-14
This file was deleted.

test/runtests.jl

-1
@@ -75,7 +75,6 @@ include("test_util.jl")
        include("ext/DynamicPPLJETExt.jl")
    end
    @testset "ad" begin
-        include("ext/DynamicPPLForwardDiffExt.jl")
        include("ext/DynamicPPLMooncakeExt.jl")
        include("ad.jl")
    end

test/test_util.jl

+9
@@ -56,6 +56,15 @@ function short_varinfo_name(vi::TypedVarInfo)
 end
 short_varinfo_name(::UntypedVarInfo) = "UntypedVarInfo"
 short_varinfo_name(::DynamicPPL.VectorVarInfo) = "VectorVarInfo"
+function short_varinfo_name(::SimpleVarInfo{<:NamedTuple,<:Ref})
+    return "SimpleVarInfo{<:NamedTuple,<:Ref}"
+end
+function short_varinfo_name(::SimpleVarInfo{<:OrderedDict,<:Ref})
+    return "SimpleVarInfo{<:OrderedDict,<:Ref}"
+end
+function short_varinfo_name(::SimpleVarInfo{<:DynamicPPL.VarNamedVector,<:Ref})
+    return "SimpleVarInfo{<:VarNamedVector,<:Ref}"
+end
 short_varinfo_name(::SimpleVarInfo{<:NamedTuple}) = "SimpleVarInfo{<:NamedTuple}"
 short_varinfo_name(::SimpleVarInfo{<:OrderedDict}) = "SimpleVarInfo{<:OrderedDict}"
 function short_varinfo_name(::SimpleVarInfo{<:DynamicPPL.VarNamedVector})

0 commit comments
