From a2cbbaeb2ed655fe2c8940f67519a7ab12899d12 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Sun, 28 Dec 2025 22:20:31 -0600 Subject: [PATCH 01/23] write up on gradients --- experiments/.gitignore | 4 + experiments/GradientEvaluation.qmd | 383 ++++++++++++++++++++ experiments/Gradient_based_optimization.qmd | 197 ++++++++++ experiments/Project.toml | 10 + experiments/bibliography.bib | 36 ++ 5 files changed, 630 insertions(+) create mode 100644 experiments/.gitignore create mode 100644 experiments/GradientEvaluation.qmd create mode 100644 experiments/Gradient_based_optimization.qmd create mode 100644 experiments/Project.toml create mode 100644 experiments/bibliography.bib diff --git a/experiments/.gitignore b/experiments/.gitignore new file mode 100644 index 000000000..7d064f72a --- /dev/null +++ b/experiments/.gitignore @@ -0,0 +1,4 @@ +*.html +*\~ +*.swp + diff --git a/experiments/GradientEvaluation.qmd b/experiments/GradientEvaluation.qmd new file mode 100644 index 000000000..776221549 --- /dev/null +++ b/experiments/GradientEvaluation.qmd @@ -0,0 +1,383 @@ +--- +title: "Evaluation of the Gradient of the Profiled log-likelihood" +author: + - name: Douglas Bates + email: dmbates@gmail.com + orcid: 0000-0001-8316-9503 + affiliation: + - name: University of Wisconsin - Madison + city: Madison + state: WI + url: https://www.wisc.edu + department: Statistics + - name: Phillip Alday + email: me@phillipalday.com + orcid: 0000-0002-9984-5745 + affiliation: + - name: Beacon Biosignals + url: https://beacon.bio +date: last-modified +date-format: iso +toc: true +bibliography: bibliography.bib +number-sections: true +engine: julia +julia: + exeflags: + - -tauto + - --project=@. +format: + html: + toc: true + toc-location: right + embed-resources: true +--- + +## Introduction {#sec-intro} + +A comparison of algorithms for estimation of variance components given in the supplemental materials for @Zhou03042019 shows the Fisher scoring algorithm taking the fewest iterations to convergence compared to an EM algorithm and the minorization-maximization (MM) algorithm presented in that paper. +The model being simulated in @Zhou03042019, sec 3.2 is relatively simple, with random effects for two factors and their interaction in a balanced crossed design. + +The approach in [lme4](https://github.com/lme4/lme4) (@bates.maechler.etal:2015) and [MixedModels.jl](https://github.com/JuliaStats/MixedModels.jl) (@bates2025mixed) has been to use a profiled log-likelihood expression, with fewer free parameters than the log-likelihood, and to streamline the evaluation of the profiled log-likelihood. +The optimization itself is performed by a gradient-free optimizer, usually either BOBYQA or NEWUOA from Powell's collection of optimizers. + +Expressions for the gradient of the profiled log-likelihood were given in sec. 3.5 of @bates.maechler.etal:2015 but they haven't been implemented in either the `lme4` or the `MixedModels.jl` packages. + +The purpose of this note is to explore whether these expressions can be implemented effectively, even if just for the variance components model, which, for our purposes, is a model in which all the random effects terms are simple, scalar terms. 
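In the `@formula` notation used in the examples below, a variance components model is one whose random-effects terms are all of the form `(1 | g)`, i.e. scalar random intercepts, as in this (unevaluated) sketch:

```julia
# a variance-components specification: scalar random-effects terms
# for two crossed grouping factors
@formula(diameter ~ 1 + (1 | plate) + (1 | sample))
```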
+ +### Expressions for the gradient terms + +The linear mixed-effects models we consider are defined by the unconditional distribution of the $q$-dimensional random-effects vector, $\mathbfcal{B}$, and the conditional distribution of the $n$-dimensional response vector, $\mathbfcal{Y}$, given $\mathbfcal{B}=\mathbf{b}$, as + +$$ +\begin{aligned} + (\mathbfcal{Y}|\mathbfcal{B}=\mathbf{b})& + \sim\mathbfcal{N}\left(\mathbf{X}\boldsymbol{\beta}+\mathbf{Z}\mathbf{b},\sigma^2\mathbf{I}\right)\\ + \mathbfcal{B}& + \sim\mathbfcal{N}(\mathbf{0}, \boldsymbol{\Sigma}) +\end{aligned} +$$ {#eq-dists} + +where $\mathbf{X}$ is an $n\times p$ model matrix for the fixed-effects parameter vector, $\boldsymbol{\beta}$, and $\mathbf{Z}$ is an $n\times q$ model matrix for the random effects, $\mathbf{b}$. +Furthermore, $\boldsymbol{\Sigma}$, the covariance of $\mathbfcal{B}$, is positive semi-definite. +We express it as + +$$ +\boldsymbol{\Sigma} = \sigma^2 +\boldsymbol{\Lambda_{\theta}}\boldsymbol{\Lambda^\top_{\theta}} +$$ {#eq-Sigma} + +for a lower-triangular *relative covariance factor*, $\boldsymbol{\Lambda_\theta}$, that depends on a *relative covariance parameter vector*, $\boldsymbol{\theta}$. + +In `MixedModels.jl` the profiled log-likelihood, a function of $\boldsymbol{\theta}$ only, is evaluated from the blocked lower Cholesky factor, $\mathbf{L}_\theta$, of + +$$ +\boldsymbol{\Omega_\theta} = +\begin{bmatrix} +\boldsymbol{\Lambda_\theta}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda_\theta}+\mathbf{I}& +\boldsymbol{\Lambda_\theta}^\top\mathbf{Z^\top X} & +\boldsymbol{\Lambda_\theta}^\top\mathbf{Z^\top y}\\ +\mathbf{X^\top Z}\boldsymbol{\Lambda_\theta} & +\mathbf{X^\top X} & +\mathbf{X^\top y}\\ +\mathbf{y^\top Z}\boldsymbol{\Lambda_\theta} & +\mathbf{y^\top X} & +\mathbf{y^\top y}\\ +\end{bmatrix} +$$ {#eq-blockedOmega} + +where $\mathbf{L}_\theta$ has a similar blocked structure + +$$ +\mathbf{L}_\boldsymbol{\theta} = +\begin{bmatrix} +\mathbf{L_{ZZ}} & \mathbf{0} & \mathbf{0} \\ +\mathbf{L_{XZ}} & \mathbf{L_{XX}} & \mathbf{0} \\ +\mathbf{l_{yZ}} & \mathbf{l_{yX}} & \ell_{\mathbf{yy}} +\end{bmatrix} +$$ {#eq-blockedL} + +(In the actual computational methods the blocked Cholesky factor has a slightly different pattern of blocks in which the "X rows" and the "y row" are amalgamated into dense blocks and the column associated with $\mathbf{Z}$ is split into one or more columns according to the grouping factors determining the random effects, as shown in the examples below.) + +The objective to be optimized is negative twice the profiled log-likelihood, + +$$ +-2\mathcal{L}(\boldsymbol{\theta}|\mathbf{y}) = +\log\left|\mathbf{L_{ZZ}}\right|^2 + n \left[1 + \log\left(\frac{2\pi\ell^2_{\mathbf{yy}}}{n}\right)\right] +$$ {#eq-objective} + +which is on the scale of the deviance (if we were able to define a deviance for these models). 
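Before deriving the gradient it may help to see @eq-blockedOmega and @eq-objective in action.
The following sketch builds $\boldsymbol{\Omega_\theta}$ directly for a toy model with a single scalar random-effects term and evaluates the objective with plain `LinearAlgebra` calls; the sizes, the response values, and the value of $\theta$ are all made up for illustration.

```{julia}
#| label: toy_objective
using LinearAlgebra

q, n = 3, 6                            # 3 groups of 2 observations each (made up)
Z = repeat(Matrix(1.0I, q, q); inner=(2, 1))  # indicator matrix for the grouping factor
X = ones(n, 1)                         # intercept-only fixed-effects model matrix
y = [9.8, 10.2, 11.1, 10.9, 9.0, 9.4]  # invented response vector
θ = 0.7                                # an arbitrary relative covariance parameter
K = hcat(Z * (θ * I), X, y)            # the concatenated matrix [ZΛ X y]
Ω = K'K + Diagonal([ones(q); zeros(2)])  # the blocked matrix of @eq-blockedOmega
L = cholesky(Symmetric(Ω, :L)).L       # its blocked lower Cholesky factor
2sum(log, diag(L[1:q, 1:q])) + n * (1 + log(2π * abs2(L[end, end]) / n))  # @eq-objective
```

For a fitted `LinearMixedModel` the same quantity is returned by the `objective` function, as shown in the examples below.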
As shown in @bates.maechler.etal:2015, sec 3.5, the gradient of the first summand in @eq-objective is

$$
\begin{aligned}
\nabla\log\left|\mathbf{L_{ZZ}}\right|^2 &= \nabla\log\left(\left|\mathbf{L_{ZZ}L_{ZZ}}^\top\right|\right)\\
&=\nabla\log\left(\left|\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda}+\mathbf{I}\right|\right)\\
&=\operatorname{tr}\left[\nabla\left(\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda}\right)
\left(\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda}+\mathbf{I}\right)^{-1}\right]\\
&=\operatorname{tr}\left[\mathbf{L_{ZZ}}^{-1}
\nabla\left(\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda}\right)
\mathbf{L_{ZZ}}^{-\top}
\right]\\
&=\operatorname{tr}\left[\mathbf{L_{ZZ}}^{-1}
\left(\nabla\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda}+
\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\nabla\boldsymbol{\Lambda}\right)
\mathbf{L_{ZZ}}^{-\top}
\right]
\end{aligned}
$$ {#eq-delterm1}

For the models that we wish to consider, the partial derivatives of $\boldsymbol{\Lambda_\theta}$ with respect to the components of $\boldsymbol{\theta}$ are particularly simple in that they are block diagonal with a single non-zero diagonal block, which is an identity matrix.

## Examples

To aid in understanding the structure of these equations we consider the structure of the various matrices and their blocks in some simple examples.

Load the packages to be used

```{julia}
#| label: load_packages
#| warning: false
#| output: false
using FiniteDiff
using LinearAlgebra
using MixedModels
using MixedModelsDatasets: dataset
using TypedTables: Table
```

### Penicillin - two completely crossed scalar random-effects terms

The `penicillin` dataset in `MixedModelsDatasets.jl` contains 144 measurements of the `diameter` of the cleared area for each of six `sample`s of penicillin on each of 24 `plate`s.

```{julia}
#| label: penicillin_data
const penicillin = Table(dataset(:penicillin))
```

We construct a `LinearMixedModel` struct with a single fixed-effect parameter, representing the average diameter in the balanced design, and random effects for each `plate` and each `sample`,

```{julia}
#| label: m02
#| output: false
#| warning: false
m02 = LinearMixedModel(@formula(diameter ~ 1 + (1|plate) + (1|sample)), penicillin)
```

for which the concatenated matrix $\left[\mathbf{ZXy}\right]$ is

```{julia}
#| label: m02ZXy
Int.(hcat(collect(first(m02.reterms)), collect(last(m02.reterms)), m02.X, m02.y))
```

in which the first 24 columns are the indicators for `plate`, the next 6 columns are the indicators for `sample`, the second-to-last column is the single column of the fixed-effects model matrix, $\mathbf{X}$, and the last column is $\mathbf{y}$.
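As a check on the blocked storage, the cross-product blocks held in `m02.A` can be compared with the Gram matrix of these columns.
This is a sketch relying on the packed block ordering used in the computations below, in which `m02.A[2]` holds the $[2,1]$ block, $\mathbf{Z_2^\top Z_1}$:

```{julia}
#| label: m02_gram_check
ZXy = hcat(collect(first(m02.reterms)), collect(last(m02.reterms)), m02.X, m02.y)
# Z₁ᵀZ₁ (plate) and Z₂ᵀZ₁ (sample by plate) should match the stored blocks
ZXy[:, 1:24]'ZXy[:, 1:24] ≈ first(m02.A) && ZXy[:, 25:30]'ZXy[:, 1:24] ≈ m02.A[2]
```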
+ +The Cholesky factor, $\mathbf{L}$, at the initial value $\boldsymbol\theta=\left[1,1\right]^\top$, can be expressed as a lower-triangular sparse matrix as + +```{julia} +#| label: m02L +Lsparse = LowerTriangular(sparseL(updateL!(m02); full=true)) +``` + +In practice, the full $\mathbf{L}$ matrix is stored in a blocked form + +```{julia} +#| label: m02_blocks +BlockDescription(m02) +``` + +from which the profiled objective (negative twice the log-likelihood) can be evaluated as + +```{julia} +#| label: m02L_initial_objective +objective(m02) +``` + +#### Evaluating terms in the gradient + +For illustration of the gradient evaluation we create the lower-triangular sparse submatrix $\mathbf{L_{ZZ}}$ as + +```{julia} +#| label: m02LZZ +LZZsparse = LowerTriangular(sparseL(m02)) +``` + +from which $\log\left|\mathbf{L_{ZZ}}\right|^2$ can be evaluated as + +```{julia} +#| label: logdet_m02 +2. * sum(log, diag(LZZsparse)) +``` + +In practice we use the `logdet` function + +```{julia} +#| label: logdet__m02 +logdet(m02) +``` + +which evaluates this quantity from the blocked representation of $\mathbf{L}$. + +A finite-difference approximation to the gradient of the `logdet` at this value of $\boldsymbol{\theta}$ is + +```{julia} +ldfun(x::Vector{Float64}) = logdet(updateL!(setθ!(m02, x))) +FiniteDiff.finite_difference_gradient(ldfun, [1., 1.]) +``` + +The matrix $\mathbf{A_{ZZ}}=\mathbf{Z}^\top\mathbf{Z}$ for this model, as a dense matrix, is + +```{julia} +#| label: denseA +A = Int.(hvcat(2, first(m02.A), m02.A[2]', m02.A[2], m02.A[3])) +``` + +and the first face of $\nabla{\boldsymbol{\Lambda}}$ is + +```{julia} +#| label: nabla_Lambda +nabla1 = Int.(Diagonal(vcat(ones(Int, 24), zeros(Int, 6)))) +``` + +With $\boldsymbol{\Lambda(\theta)}$ being + +```{julia} +Λ(θ) = Diagonal(vcat(fill(first(θ), 24), fill(last(θ), 6))) +θ = ones(2) # initial parameter vector +Int.(Λ(θ)) # initial value of Λ +``` + +the first face of $\left(\nabla\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda}+ +\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\nabla\boldsymbol{\Lambda}\right)$ is + +```{julia} +#| label: symprod +symprod = nabla1 * A * Λ(θ) + Λ(θ) * A * nabla1 +Int.(symprod) +``` + +producing the matrix whose trace is desired as + +```{julia} +rdiv!(ldiv!(LZZsparse, symprod), LZZsparse') # overwrites the value of symprod +``` + +yielding the trace as + +```{julia} +sum(diag(symprod)) +``` + +One point to notice here is that the $[1,1]$ block of this matrix is diagonal, with elements of + +```{julia} +((2 * first(θ)) .* first(m02.A).diag) ./ abs2.(first(m02.L).diag) +``` + +which can be used to simplify the evaluation of the first gradient term. +In particular, the gradient of a model with a single, scalar random-effects term is, unsurprisingly, straightforward. + +For the second element of the gradient we define + +```{julia} +nabla2 = Diagonal(vcat(zeros(24), ones(6))) +Int.(nabla2) +``` + +and + +```{julia} +symprod = nabla2 * A * Λ(ones(2)) + Λ(ones(2)) * A * nabla2 +``` + +The matrix whose trace is required is + +```{julia} +rdiv!(ldiv!(LZZsparse, symprod), LZZsparse') +``` + +producing the second element of the gradient of `ldfun` as + +```{julia} +sum(diag(symprod)) +``` + +Notice that the entire $[1,1]$ block of this matrix is zero and will not need to be evaluated explicitly. 
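The two trace computations above can be collected into a single helper so that the analytic gradient of `ldfun` can be compared with the finite-difference value at any $\boldsymbol{\theta}$.
This is only a sketch (the function name `ld_logdet_grad` is ours, and it reuses `A`, `Λ`, `nabla1`, and `nabla2` from above; the dense triangular solves are written for clarity rather than efficiency):

```{julia}
#| label: ld_logdet_grad
function ld_logdet_grad(θ::Vector{Float64})
    updateL!(setθ!(m02, θ))               # refactor the model at this θ
    Lzz = LowerTriangular(sparseL(m02))
    return map([nabla1, nabla2]) do nabla
        S = nabla * A * Λ(θ) + Λ(θ) * A * nabla
        sum(diag(rdiv!(ldiv!(Lzz, S), Lzz')))  # tr(L⁻¹ S L⁻ᵀ)
    end
end
ld_logdet_grad([1.0, 1.0])
```

which agrees with the finite-difference gradient

```{julia}
FiniteDiff.finite_difference_gradient(ldfun, [1.0, 1.0])
```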
+ +We evaluate $\boldsymbol{\hat{\theta}}$ using a derivative-free optimizer as + +```{julia} +θ = refit!(m02).θ +``` + +after which the first face of `symprod` becomes + +```{julia} +symprod = nabla1 * A * Λ(θ) + Λ(θ) * A * nabla1 +``` + +`LZZsparse` becomes + +```{julia} +LZZsparse = LowerTriangular(sparseL(m02)) +``` + +and the matrix whose trace is required is + +```{julia} +rdiv!(ldiv!(LZZsparse, symprod), LZZsparse') +``` + +yielding the gradient term + +```{julia} +sum(diag(symprod)) +``` + +which can be compared to the finite difference value + +```{julia} +FiniteDiff.finite_difference_gradient(ldfun, θ) +``` + +(Note, this is the first element of the gradient of the `logdet` term only, not the gradient of the objective which is near zero + +```{julia} +FiniteDiff.finite_difference_gradient(objective!(m02), θ) +``` + +as it should be at the optimum.) + +For the second element of the gradient of `ldfun` we have + +```{julia} +symprod = nabla2 * A * Λ(θ) + Λ(θ) * A * nabla2 +``` + +After pre- and post-division by `LZZsparse`, this becomes + +```{julia} +rdiv!(ldiv!(LZZsparse, symprod), LZZsparse') +``` + +yielding the second element of the gradient of `ldfun` as + +```{julia} +sum(diag(symprod)) +``` + +#### Factoring the symmetric matrix + +The matrix $\left(\nabla\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda}+ +\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\nabla\boldsymbol{\Lambda}\right)$ is symmetric and has the same sparsity structure as $\mathbf{Z^\top Z}$, which is positive semi-definite. +However, it is not clear that the non-zero blocks in $\left(\nabla\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda}+ +\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\nabla\boldsymbol{\Lambda}\right)$ will be positive semi-definite in the general case. +In the case of a single variance component it will be positive definite when $\theta_1>0$ because it is $2\theta_1\mathbf{A}$. + + +### References {.unnumbered} + +::: {#refs} +::: diff --git a/experiments/Gradient_based_optimization.qmd b/experiments/Gradient_based_optimization.qmd new file mode 100644 index 000000000..ae2d0a47b --- /dev/null +++ b/experiments/Gradient_based_optimization.qmd @@ -0,0 +1,197 @@ +--- +title: "Gradient-based Optimization of the Profiled log-likelihood" +author: + - name: Douglas Bates + email: dmbates@gmail.com + orcid: 0000-0001-8316-9503 + affiliation: + - name: University of Wisconsin - Madison + city: Madison + state: WI + url: https://www.wisc.edu + department: Statistics + - name: Phillip Alday + email: me@phillipalday.com + orcid: 0000-0002-9984-5745 + affiliation: + - name: Beacon Biosignals + url: https://beacon.bio +date: last-modified +date-format: iso +toc: true +bibliography: bibliography.bib +number-sections: true +engine: julia +julia: + exeflags: + - -tauto + - --project=@. +format: + html: + toc: true + toc-location: right + embed-resources: true +--- + +## Introduction {#sec-intro} + +Before devoting too much effort to efficient evaluation of the gradient of the profiled log-likelihood, we should check if using gradient-based optimization requires sufficiently fewer evaluations of the objective, and the gradient, than does derivative-free optimization. 
+ +Here we fit a few models using automatic differentiation from [ForwardDiff.jl](https://github.com/JuliaDiff/ForwardDiff.jl) and the `ForwardDiff` extension to [MixedModels.jl](https://github.com/JuliaStats/MixedModels.jl) to optimize the profiled log-likelihood with the `LD_LBFGS` optimizer from [NLopt.jl](https://github.com/jump-dev/NLopt.jl), instead of the default `LN_NEWUOA` which does not use gradients. + +The results are more-or-less a toss-up when using `ForwardDiff` to evaluate the gradient. +A more efficient evaluation of the gradient, taking advantage of the sparse-blocked structure of the Cholesky factorization to evaluate the profiled log-likelihood, may tip the balance in favor of gradient-based methods. + +## Preliminaries {#sec-prelim} + +Load the packages to be used + +```{julia} +#| label: load_packages +using BenchmarkTools +using ForwardDiff +using MixedModels +using MixedModelsDatasets: dataset +using NLopt +using Tables: table +using TypedTables: Table +``` + +## Examples {#sec-examples} + +### Penicillin data {#sec-penicillin} + +Load the `penicillin` dataset +```{julia} +penicillin = Table(dataset(:penicillin)) +``` + +and define a model + +```{julia} +#| label: const_defs +#| output: false +m = LinearMixedModel(@formula(diameter ~ 1 + (1|plate) + (1|sample)), penicillin) +θ = copy(m.θ) +k = length(θ) +const fitlog = sizehint!(Vector{Float64}(undef, 0), 200) +``` + +with an NLopt-compatible objective function + +```{julia} +#| label: obj_def +function obj(θ::Vector{Float64}, grad::Vector{Float64}) + val = objective(updateL!(setθ!(m, θ))) + push!(fitlog, val) + append!(fitlog, θ) + if !isempty(grad) + copyto!(grad, ForwardDiff.gradient(m, θ)) + append!(fitlog, grad) + end + return val +end +``` + +A benchmark of evaluating the objective only + +```{julia} +#| label: benchmark_obj +@benchmark objective(updateL!(setθ!($m, $θ))) seconds=1 +``` + +compared to evaluating both the objective and its gradient + +```{julia} +#| label: benchmark_obj_grad +let gr = Vector{Float64}(undef, k) + @benchmark obj($θ, $gr) seconds=1 +end +``` + +Notice that evaluating both the objective and the gradient, using `ForwardDiff.jl`, results in considerably more allocation of storage than does evaluating the objective alone. +(In part this is a reflection of the fact that considerable work has gone into tuning the objective evaluation so that it doesn't allocate a lot of memory.) 
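Some of this overhead can be amortized by constructing the `ForwardDiff` working storage once and reusing it across evaluations.
A sketch (here `fd_deviance` is the internal `MixedModels` function that produces the differentiable objective used by the extension; the names `devfun`, `cfg`, and `g` are ours):

```{julia}
#| label: reuse_gradient_config
using MixedModels: fd_deviance
devfun = fd_deviance(m)                      # differentiable closure over the model
cfg = ForwardDiff.GradientConfig(devfun, θ)  # allocate the dual-number buffers once
g = similar(θ)
ForwardDiff.gradient!(g, devfun, θ, cfg)     # reuses cfg, avoiding per-call setup
```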
+ +Fitting the model using a derivative-free optimizer gives + +```{julia} +m.optsum.ftol_rel = 1.e-8 # for fair comparison with the gradient-based method +print(refit!(m)) +``` + +for which the optimization summary is + +```{julia} +m.optsum +``` + +The objective at the estimate is + +```{julia} +m.optsum.fmin +``` + +We set up and run the gradient-based optimizer L-BFGS as + +```{julia} +opt = NLopt.Opt(:LD_LBFGS, 2) +NLopt.ftol_rel!(opt, 1.e-8) +NLopt.min_objective!(opt, obj) +empty!(fitlog) +min_f, min_x, ret = NLopt.optimize(opt, copy(θ)) +``` + +where the fitlog is + +```{julia} +header = [:obj, :θ₁, :θ₂, :g₁, :g₂] +fltbl = Table(table(transpose(reshape(fitlog, length(header), :)); header)) +``` + +```{julia} +last(fltbl, 3) +``` + +### Sleepstudy {#sec-sleepstudy} + +```{julia} +sleepstudy = Table(dataset(:sleepstudy)) +``` + +```{julia} +m = LinearMixedModel(@formula(reaction ~ 1 + days + (1 + days|subj)), sleepstudy) +θ = copy(m.θ) +@benchmark objective(updateL!(setθ!($m, $(m.θ)))) seconds=1 +``` + + +```{julia} +@benchmark obj($θ, $(similar(θ))) seconds=1 +``` + +```{julia} +print(refit!(m)) +``` + +```{julia} +m.optsum +``` + +```{julia} +opt = NLopt.Opt(:LD_LBFGS, 3) +NLopt.ftol_rel!(opt, 1.e-8) +NLopt.min_objective!(opt, obj) +empty!(fitlog) +min_f, min_x, ret = NLopt.optimize(opt, copy(θ)) +``` + + +```{julia} +header = [:obj, :θ₁, :θ₂, :θ₃, :g₁, :g₂, :g₃] +fltbl = Table(table(transpose(reshape(fitlog, length(header), :)); header)) +``` + + +```{julia} +last(fltbl, 3) +``` \ No newline at end of file diff --git a/experiments/Project.toml b/experiments/Project.toml new file mode 100644 index 000000000..45e93db75 --- /dev/null +++ b/experiments/Project.toml @@ -0,0 +1,10 @@ +[deps] +BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" +FiniteDiff = "6a86dc24-6348-571c-b903-95158fe2bd41" +ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +MixedModels = "ff71e718-51f3-5ec2-a782-8ffcbfa3c316" +MixedModelsDatasets = "7e9fb7ac-9f67-43bf-b2c8-96ba0796cbb6" +NLopt = "76087f3c-5699-56af-9a33-bf431cd00edd" +Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" +TypedTables = "9d95f2ec-7b3d-5a63-8d20-e2491e220bb9" diff --git a/experiments/bibliography.bib b/experiments/bibliography.bib new file mode 100644 index 000000000..2397e3095 --- /dev/null +++ b/experiments/bibliography.bib @@ -0,0 +1,36 @@ +@article{Zhou03042019, + author = {Hua Zhou and Liuyi Hu and Jin Zhou and Kenneth Lange}, + title = {MM Algorithms for Variance Components Models}, + journal = {Journal of Computational and Graphical Statistics}, + volume = {28}, + number = {2}, + pages = {350--361}, + year = {2019}, + publisher = {ASA Website}, + doi = {10.1080/10618600.2018.1529601}, + note ={PMID: 31592195}, + URL = {https://doi.org/10.1080/10618600.2018.1529601}, + eprint = {https://doi.org/10.1080/10618600.2018.1529601} +} + +@Article{bates.maechler.etal:2015, + author = {Bates, Douglas and Maechler, Martin and Bolker, Benjamin M. 
and Walker, Steven}, + title = {Fitting Linear Mixed-Effects Models using lme4}, + doi = {10.18637/jss.v067.i01}, + number = {1}, + pages = {1--48}, + volume = {67}, + date-added = {2020-03-24}, + date-modified = {2016-02-12 06:52:06 +0000}, + file = {:2015/bates.maechler.etal_2015 - Fitting Linear Mixed.pdf:PDF}, + journal = {Journal of Statistical Software}, + year = {2015}, +} + +@article{bates2025mixed, + title={Mixed-model Log-likelihood Evaluation Via a Blocked Cholesky Factorization}, + author={Bates, Douglas and Alday, Phillip M and Kokandakar, Ajinkya H}, + journal={arXiv preprint arXiv:2505.11674}, + year={2025} +} + From b55d6222d71a9c70dca015867a24dd289b0140e1 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Sun, 28 Dec 2025 23:19:21 -0600 Subject: [PATCH 02/23] slight optimization of gradient computation --- ext/MixedModelsForwardDiffExt.jl | 21 ++++++++++++++++--- {experiments => gradients}/.gitignore | 0 .../GradientEvaluation.qmd | 0 .../Gradient_based_optimization.qmd | 9 +++++--- {experiments => gradients}/Project.toml | 3 +++ {experiments => gradients}/bibliography.bib | 0 6 files changed, 27 insertions(+), 6 deletions(-) rename {experiments => gradients}/.gitignore (100%) rename {experiments => gradients}/GradientEvaluation.qmd (100%) rename {experiments => gradients}/Gradient_based_optimization.qmd (94%) rename {experiments => gradients}/Project.toml (92%) rename {experiments => gradients}/bibliography.bib (100%) diff --git a/ext/MixedModelsForwardDiffExt.jl b/ext/MixedModelsForwardDiffExt.jl index c3263e620..757c4b7d2 100644 --- a/ext/MixedModelsForwardDiffExt.jl +++ b/ext/MixedModelsForwardDiffExt.jl @@ -30,7 +30,7 @@ using LinearAlgebra: LinearAlgebra, using SparseArrays: SparseArrays, nzrange # Stuff we're defining in this file -using ForwardDiff: ForwardDiff +using ForwardDiff: ForwardDiff, DiffResult, GradientConfig, Chunk using MixedModels: fd_cholUnblocked!, fd_deviance, fd_logdet, @@ -68,9 +68,24 @@ values. 
$(FORWARDDIFF) """ function ForwardDiff.gradient( - model::LinearMixedModel{T}, θ::Vector{T}=model.θ + model::LinearMixedModel{T}, θ::Vector{T}=model.θ, + cfg::GradientConfig=GradientConfig(model, x) ) where {T} - return ForwardDiff.gradient(fd_deviance(model), θ) + return ForwardDiff.gradient(model, θ, cfg, check) +end + +# gradient!(::Union{AbstractArray, DiffResults.DiffResult}, ::F, ::Vector{T}, +# ::ForwardDiff.GradientConfig{T}, ::Val{CHK}) where {T, T, CHK, F<:LinearMixedModel{T}} + +function ForwardDiff.gradient!(result::Union{AbstractArray,DiffResult}, + model::LinearMixedModel{T}, θ::Vector{T}=model.θ, + cfg::GradientConfig=GradientConfig(model, x) +) where {T} + return ForwardDiff.gradient!(result, fd_deviance(model), θ, cfg) +end + +function ForwardDiff.GradientConfig(model::LinearMixedModel, x::AbstractArray, chunk::Chunk = Chunk(x)) + return GradientConfig(fd_deviance(model), x, chunk) end """ diff --git a/experiments/.gitignore b/gradients/.gitignore similarity index 100% rename from experiments/.gitignore rename to gradients/.gitignore diff --git a/experiments/GradientEvaluation.qmd b/gradients/GradientEvaluation.qmd similarity index 100% rename from experiments/GradientEvaluation.qmd rename to gradients/GradientEvaluation.qmd diff --git a/experiments/Gradient_based_optimization.qmd b/gradients/Gradient_based_optimization.qmd similarity index 94% rename from experiments/Gradient_based_optimization.qmd rename to gradients/Gradient_based_optimization.qmd index ae2d0a47b..140ec6fbf 100644 --- a/experiments/Gradient_based_optimization.qmd +++ b/gradients/Gradient_based_optimization.qmd @@ -51,6 +51,7 @@ Load the packages to be used using BenchmarkTools using ForwardDiff using MixedModels +using MixedModels: fd_deviance using MixedModelsDatasets: dataset using NLopt using Tables: table @@ -81,12 +82,13 @@ with an NLopt-compatible objective function ```{julia} #| label: obj_def +grad_config = ForwardDiff.GradientConfig(fd_deviance(m), θ) function obj(θ::Vector{Float64}, grad::Vector{Float64}) val = objective(updateL!(setθ!(m, θ))) push!(fitlog, val) append!(fitlog, θ) if !isempty(grad) - copyto!(grad, ForwardDiff.gradient(m, θ)) + copyto!(grad, ForwardDiff.gradient!(grad, m, θ, grad_config)) append!(fitlog, grad) end return val @@ -161,6 +163,7 @@ sleepstudy = Table(dataset(:sleepstudy)) ```{julia} m = LinearMixedModel(@formula(reaction ~ 1 + days + (1 + days|subj)), sleepstudy) θ = copy(m.θ) +grad_config = ForwardDiff.GradientConfig(fd_deviance(m), θ) @benchmark objective(updateL!(setθ!($m, $(m.θ)))) seconds=1 ``` @@ -178,7 +181,7 @@ m.optsum ``` ```{julia} -opt = NLopt.Opt(:LD_LBFGS, 3) +opt = NLopt.Opt(:LD_LBFGS, length(θ)) NLopt.ftol_rel!(opt, 1.e-8) NLopt.min_objective!(opt, obj) empty!(fitlog) @@ -194,4 +197,4 @@ fltbl = Table(table(transpose(reshape(fitlog, length(header), :)); header)) ```{julia} last(fltbl, 3) -``` \ No newline at end of file +``` diff --git a/experiments/Project.toml b/gradients/Project.toml similarity index 92% rename from experiments/Project.toml rename to gradients/Project.toml index 45e93db75..220557789 100644 --- a/experiments/Project.toml +++ b/gradients/Project.toml @@ -8,3 +8,6 @@ MixedModelsDatasets = "7e9fb7ac-9f67-43bf-b2c8-96ba0796cbb6" NLopt = "76087f3c-5699-56af-9a33-bf431cd00edd" Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" TypedTables = "9d95f2ec-7b3d-5a63-8d20-e2491e220bb9" + +[sources] +MixedModels = {path = ".."} diff --git a/experiments/bibliography.bib b/gradients/bibliography.bib similarity index 100% rename from 
experiments/bibliography.bib
rename to gradients/bibliography.bib

From 4aa750e26512da743c52cc1c9b58fed0dcf32e0b Mon Sep 17 00:00:00 2001
From: Phillip Alday
Date: Sun, 28 Dec 2025 23:21:36 -0600
Subject: [PATCH 03/23] kb07

---
 gradients/Gradient_based_optimization.qmd | 43 ++++++++++++++++++++++-
 1 file changed, 42 insertions(+), 1 deletion(-)

diff --git a/gradients/Gradient_based_optimization.qmd b/gradients/Gradient_based_optimization.qmd
index 140ec6fbf..53a96eac2 100644
--- a/gradients/Gradient_based_optimization.qmd
+++ b/gradients/Gradient_based_optimization.qmd
@@ -88,7 +88,7 @@ function obj(θ::Vector{Float64}, grad::Vector{Float64})
     push!(fitlog, val)
     append!(fitlog, θ)
     if !isempty(grad)
-        copyto!(grad, ForwardDiff.gradient!(grad, m, θ, grad_config))
+        ForwardDiff.gradient!(grad, m, θ, grad_config)
         append!(fitlog, grad)
     end
     return val
@@ -198,3 +198,44 @@ fltbl = Table(table(transpose(reshape(fitlog, length(header), :)); header))
 ```

 ```{julia}
 last(fltbl, 3)
 ```
+
+### Kronmüller & Barr (2007) {#sec-kb07}
+
+```{julia}
+kb07 = Table(dataset(:kb07))
+```
+
+```{julia}
+# this model is very overparameterized, but it's a test example
+m = LinearMixedModel(@formula(rt_trunc ~ 1 + spkr * prec * load + (1 + spkr * prec * load | subj) + (1 + spkr * prec * load | item)), kb07)
+θ = copy(m.θ)
+grad_config = ForwardDiff.GradientConfig(fd_deviance(m), θ)
+@benchmark objective(updateL!(setθ!($m, $(m.θ)))) seconds=1
+```
+
+```{julia}
+@benchmark obj($θ, $(similar(θ))) seconds=5
+```
+
+```{julia}
+print(refit!(m))
+```
+
+```{julia}
+m.optsum
+```
+
+```{julia}
+opt = NLopt.Opt(:LD_LBFGS, length(θ))
+NLopt.ftol_rel!(opt, 1.e-8)
+NLopt.min_objective!(opt, obj)
+empty!(fitlog)
+min_f, min_x, ret = NLopt.optimize(opt, copy(θ))
+```
+
+```{julia}
+# a fixed header of 7 names assumes length(θ) == 3; θ is much longer here,
+# so build the header to match the number of logged columns
+k = length(θ)
+header = vcat([:obj], [Symbol('θ', i) for i in 1:k], [Symbol('g', i) for i in 1:k])
+fltbl = Table(table(transpose(reshape(fitlog, length(header), :)); header))
+```

From 2765eef6cd698979ed03921959ca141e132045bf Mon Sep 17 00:00:00 2001
From: Douglas Bates
Date: Tue, 30 Dec 2025 09:06:03 -0600
Subject: [PATCH 04/23] Spelling mistakes?

---
 ext/MixedModelsForwardDiffExt.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ext/MixedModelsForwardDiffExt.jl b/ext/MixedModelsForwardDiffExt.jl
index 757c4b7d2..ef4d4421f 100644
--- a/ext/MixedModelsForwardDiffExt.jl
+++ b/ext/MixedModelsForwardDiffExt.jl
@@ -69,7 +69,7 @@ $(FORWARDDIFF)
 """
 function ForwardDiff.gradient(
     model::LinearMixedModel{T}, θ::Vector{T}=model.θ,
-    cfg::GradientConfig=GradientConfig(model, x)
+    cfg::GradientConfig=GradientConfig(model, θ)
 ) where {T}
     return ForwardDiff.gradient(model, θ, cfg, check)
 end
@@ -79,7 +79,7 @@ end
 function ForwardDiff.gradient!(result::Union{AbstractArray,DiffResult},
     model::LinearMixedModel{T}, θ::Vector{T}=model.θ,
-    cfg::GradientConfig=GradientConfig(model, x)
+    cfg::GradientConfig=GradientConfig(model, θ)
 ) where {T}
     return ForwardDiff.gradient!(result, fd_deviance(model), θ, cfg)
 end
--- ext/MixedModelsForwardDiffExt.jl | 2 +- gradients/Gradient_based_optimization.qmd | 167 ++++++++-------------- 2 files changed, 57 insertions(+), 112 deletions(-) diff --git a/ext/MixedModelsForwardDiffExt.jl b/ext/MixedModelsForwardDiffExt.jl index ef4d4421f..324186065 100644 --- a/ext/MixedModelsForwardDiffExt.jl +++ b/ext/MixedModelsForwardDiffExt.jl @@ -71,7 +71,7 @@ function ForwardDiff.gradient( model::LinearMixedModel{T}, θ::Vector{T}=model.θ, cfg::GradientConfig=GradientConfig(model, θ) ) where {T} - return ForwardDiff.gradient(model, θ, cfg, check) + return ForwardDiff.gradient(model, θ, cfg)#, check) end # gradient!(::Union{AbstractArray, DiffResults.DiffResult}, ::F, ::Vector{T}, diff --git a/gradients/Gradient_based_optimization.qmd b/gradients/Gradient_based_optimization.qmd index 53a96eac2..06b27c2f9 100644 --- a/gradients/Gradient_based_optimization.qmd +++ b/gradients/Gradient_based_optimization.qmd @@ -48,7 +48,6 @@ Load the packages to be used ```{julia} #| label: load_packages -using BenchmarkTools using ForwardDiff using MixedModels using MixedModels: fd_deviance @@ -56,186 +55,132 @@ using MixedModelsDatasets: dataset using NLopt using Tables: table using TypedTables: Table -``` - -## Examples {#sec-examples} -### Penicillin data {#sec-penicillin} - -Load the `penicillin` dataset -```{julia} -penicillin = Table(dataset(:penicillin)) +const progress = false ``` -and define a model - -```{julia} -#| label: const_defs -#| output: false -m = LinearMixedModel(@formula(diameter ~ 1 + (1|plate) + (1|sample)), penicillin) -θ = copy(m.θ) -k = length(θ) -const fitlog = sizehint!(Vector{Float64}(undef, 0), 200) -``` +## Examples {#sec-examples} -with an NLopt-compatible objective function +We create a function to take a `LinearMixedModel` that has been fit and refit it using the `:LD_LBFGS` optimizer applied to an objective function that evaluates the gradient using `ForwardDiff`. 
```{julia} -#| label: obj_def -grad_config = ForwardDiff.GradientConfig(fd_deviance(m), θ) -function obj(θ::Vector{Float64}, grad::Vector{Float64}) +addinds(ch::Char, n::Integer) = Symbol.(lpad.(string.(ch, Base.OneTo(n)), ndigits(n), '0')) +function gr_refit!(m::LinearMixedModel{T}) where {T} + θ = copy(m.optsum.initial) + k = length(θ) + fitlog = sizehint!(T[], 50 * k) + grad_config = ForwardDiff.GradientConfig(fd_deviance(m), θ) + function obj(θ::Vector{Float64}, grad::Vector{Float64}) val = objective(updateL!(setθ!(m, θ))) push!(fitlog, val) append!(fitlog, θ) if !isempty(grad) - ForwardDiff.gradient!(grad, m, θ, grad_config) - append!(fitlog, grad) + ForwardDiff.gradient!(grad, m, θ, grad_config) + append!(fitlog, grad) + else + append!(fitlog, fill(NaN, k)) # never called with empty grad but just in case end return val + end + opt = NLopt.Opt(:LD_LBFGS, k) + NLopt.ftol_rel!(opt, 1.e-12) + NLopt.ftol_abs!(opt, 1.e-8) + NLopt.min_objective!(opt, obj) + min_f, min_x, ret = NLopt.optimize(opt, θ) + header = vcat([:obj], addinds('θ', k), addinds('g', k)) + return Table(table(transpose(reshape(fitlog, 2k + 1, :)); header)) end ``` -A benchmark of evaluating the objective only - -```{julia} -#| label: benchmark_obj -@benchmark objective(updateL!(setθ!($m, $θ))) seconds=1 -``` - -compared to evaluating both the objective and its gradient - -```{julia} -#| label: benchmark_obj_grad -let gr = Vector{Float64}(undef, k) - @benchmark obj($θ, $gr) seconds=1 -end -``` - -Notice that evaluating both the objective and the gradient, using `ForwardDiff.jl`, results in considerably more allocation of storage than does evaluating the objective alone. -(In part this is a reflection of the fact that considerable work has gone into tuning the objective evaluation so that it doesn't allocate a lot of memory.) +### Penicillin data {#sec-penicillin} -Fitting the model using a derivative-free optimizer gives +Define a model for the `penicillin` data ```{julia} -m.optsum.ftol_rel = 1.e-8 # for fair comparison with the gradient-based method -print(refit!(m)) +#| label: const_defs +#| output: false +m1 = fit(MixedModel, @formula(diameter ~ 1 + (1|plate) + (1|sample)), dataset(:penicillin); progress) +print(m1) ``` for which the optimization summary is ```{julia} -m.optsum +m1.optsum ``` -The objective at the estimate is +and refit the model using ForwardDiff gradient evaluations. 
```{julia} -m.optsum.fmin +fitlog = gr_refit!(m1) ``` -We set up and run the gradient-based optimizer L-BFGS as +The objective at convergence is ```{julia} -opt = NLopt.Opt(:LD_LBFGS, 2) -NLopt.ftol_rel!(opt, 1.e-8) -NLopt.min_objective!(opt, obj) -empty!(fitlog) -min_f, min_x, ret = NLopt.optimize(opt, copy(θ)) +last(fitlog.obj) ``` -where the fitlog is - -```{julia} -header = [:obj, :θ₁, :θ₂, :g₁, :g₂] -fltbl = Table(table(transpose(reshape(fitlog, length(header), :)); header)) -``` +and the last few evaluations are ```{julia} -last(fltbl, 3) +last(fitlog, 5) ``` ### Sleepstudy {#sec-sleepstudy} ```{julia} -sleepstudy = Table(dataset(:sleepstudy)) -``` - -```{julia} -m = LinearMixedModel(@formula(reaction ~ 1 + days + (1 + days|subj)), sleepstudy) -θ = copy(m.θ) -grad_config = ForwardDiff.GradientConfig(fd_deviance(m), θ) -@benchmark objective(updateL!(setθ!($m, $(m.θ)))) seconds=1 -``` - - -```{julia} -@benchmark obj($θ, $(similar(θ))) seconds=1 +m2 = fit(MixedModel, @formula(reaction ~ 1 + days + (1 + days|subj)), dataset(:sleepstudy); progress) +print(m2) ``` ```{julia} -print(refit!(m)) +m2.optsum ``` ```{julia} -m.optsum +fitlog = gr_refit!(m2) ``` ```{julia} -opt = NLopt.Opt(:LD_LBFGS, length(θ)) -NLopt.ftol_rel!(opt, 1.e-8) -NLopt.min_objective!(opt, obj) -empty!(fitlog) -min_f, min_x, ret = NLopt.optimize(opt, copy(θ)) +last(fitlog.obj) ``` - ```{julia} -header = [:obj, :θ₁, :θ₂, :θ₃, :g₁, :g₂, :g₃] -fltbl = Table(table(transpose(reshape(fitlog, length(header), :)); header)) -``` - - -```{julia} -last(fltbl, 3) +last(fitlog, 5) ``` ### Kronmueller-Barr 2007 {#sec-kb07} -```{julia} -kb07 = Table(dataset(:kb07)) -``` - ```{julia} # this model is very overparameterized, but it's a test example -m = LinearMixedModel(@formula(rt_trunc ~ 1 + spkr * prec * load + (1 + spkr * prec * load | subj) + (1 + spkr * prec * load | item)), kb07) -θ = copy(m.θ) -grad_config = ForwardDiff.GradientConfig(fd_deviance(m), θ) -@benchmark objective(updateL!(setθ!($m, $(m.θ)))) seconds=1 +m3 = fit( + MixedModel, + @formula(rt_trunc ~ 1 + spkr * prec * load + (1 + spkr * prec * load | subj) + (1 + spkr * prec * load | item)), + dataset(:kb07); + progress, +) +print(m3) ``` - ```{julia} -@benchmark obj($θ, $(similar(θ))) seconds=5 +m3.optsum ``` +Several of the parameters on the diagonal of $\boldsymbol{\Lambda}$ are close to zero at convergence and are replaced by zero in the returned parameter vector + ```{julia} -print(refit!(m)) +findall(iszero, m3.θ) ``` ```{julia} -m.optsum +fitlog = gr_refit!(m3) ``` ```{julia} -opt = NLopt.Opt(:LD_LBFGS, length(θ)) -NLopt.ftol_rel!(opt, 1.e-8) -NLopt.min_objective!(opt, obj) -empty!(fitlog) -min_f, min_x, ret = NLopt.optimize(opt, copy(θ)) +last(fitlog.obj) ``` - ```{julia} -header = [:obj, :θ₁, :θ₂, :θ₃, :g₁, :g₂, :g₃] -fltbl = Table(table(transpose(reshape(fitlog, length(header), :)); header)) -``` +last(fitlog, 5) +``` \ No newline at end of file From 16cbd8e6ce2dfabe79691159297ab9b8fd2320db Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Tue, 30 Dec 2025 12:53:44 -0600 Subject: [PATCH 06/23] test fix --- ext/MixedModelsForwardDiffExt.jl | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/ext/MixedModelsForwardDiffExt.jl b/ext/MixedModelsForwardDiffExt.jl index 324186065..a65f6c1df 100644 --- a/ext/MixedModelsForwardDiffExt.jl +++ b/ext/MixedModelsForwardDiffExt.jl @@ -30,7 +30,10 @@ using LinearAlgebra: LinearAlgebra, using SparseArrays: SparseArrays, nzrange # Stuff we're defining in this file -using 
ForwardDiff: ForwardDiff, DiffResult, GradientConfig, Chunk +using ForwardDiff: ForwardDiff, + Chunk, + DiffResult, + GradientConfig using MixedModels: fd_cholUnblocked!, fd_deviance, fd_logdet, @@ -59,6 +62,12 @@ const FORWARDDIFF = """ should be included is currently still being decided. """ +function ForwardDiff.GradientConfig( + model::LinearMixedModel, x::AbstractArray=model.θ, chunk::Chunk=Chunk(x) +) + return GradientConfig(fd_deviance(model), x, chunk) +end + """ ForwardDiff.gradient(model::LinearMixedModel) @@ -69,23 +78,18 @@ $(FORWARDDIFF) """ function ForwardDiff.gradient( model::LinearMixedModel{T}, θ::Vector{T}=model.θ, - cfg::GradientConfig=GradientConfig(model, θ) -) where {T} - return ForwardDiff.gradient(model, θ, cfg)#, check) + cfg::GradientConfig=GradientConfig(model, θ), + check::Val{CHK}=Val(true), +) where {T, CHK} + return ForwardDiff.gradient!(similar(model.θ), fd_deviance(model), θ, cfg, check) end -# gradient!(::Union{AbstractArray, DiffResults.DiffResult}, ::F, ::Vector{T}, -# ::ForwardDiff.GradientConfig{T}, ::Val{CHK}) where {T, T, CHK, F<:LinearMixedModel{T}} - function ForwardDiff.gradient!(result::Union{AbstractArray,DiffResult}, model::LinearMixedModel{T}, θ::Vector{T}=model.θ, - cfg::GradientConfig=GradientConfig(model, θ) -) where {T} - return ForwardDiff.gradient!(result, fd_deviance(model), θ, cfg) -end - -function ForwardDiff.GradientConfig(model::LinearMixedModel, x::AbstractArray, chunk::Chunk = Chunk(x)) - return GradientConfig(fd_deviance(model), x, chunk) + cfg::GradientConfig=GradientConfig(model, θ), + check::Val{CHK}=Val(true), +) where {T, CHK} + return ForwardDiff.gradient!(result, fd_deviance(model), θ, cfg, check) end """ From f184c8834f5e5b0fe352f752968b520466f13d9e Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Tue, 30 Dec 2025 13:04:24 -0600 Subject: [PATCH 07/23] methods for HessianConfig and hessian! --- ext/MixedModelsForwardDiffExt.jl | 42 +++++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 9 deletions(-) diff --git a/ext/MixedModelsForwardDiffExt.jl b/ext/MixedModelsForwardDiffExt.jl index a65f6c1df..de2d80c54 100644 --- a/ext/MixedModelsForwardDiffExt.jl +++ b/ext/MixedModelsForwardDiffExt.jl @@ -32,8 +32,8 @@ using SparseArrays: SparseArrays, nzrange # Stuff we're defining in this file using ForwardDiff: ForwardDiff, Chunk, - DiffResult, - GradientConfig + GradientConfig, + HessianConfig using MixedModels: fd_cholUnblocked!, fd_deviance, fd_logdet, @@ -62,9 +62,13 @@ const FORWARDDIFF = """ should be included is currently still being decided. 
""" +##### +##### Gradients +##### + function ForwardDiff.GradientConfig( - model::LinearMixedModel, x::AbstractArray=model.θ, chunk::Chunk=Chunk(x) -) + model::LinearMixedModel{T}, x::AbstractVector{T}=model.θ, chunk::Chunk=Chunk(x) +) where {T} return GradientConfig(fd_deviance(model), x, chunk) end @@ -81,10 +85,10 @@ function ForwardDiff.gradient( cfg::GradientConfig=GradientConfig(model, θ), check::Val{CHK}=Val(true), ) where {T, CHK} - return ForwardDiff.gradient!(similar(model.θ), fd_deviance(model), θ, cfg, check) + return ForwardDiff.gradient!(similar(model.θ), model, θ, cfg, check) end -function ForwardDiff.gradient!(result::Union{AbstractArray,DiffResult}, +function ForwardDiff.gradient!(result::AbstractArray, model::LinearMixedModel{T}, θ::Vector{T}=model.θ, cfg::GradientConfig=GradientConfig(model, θ), check::Val{CHK}=Val(true), @@ -92,6 +96,16 @@ function ForwardDiff.gradient!(result::Union{AbstractArray,DiffResult}, return ForwardDiff.gradient!(result, fd_deviance(model), θ, cfg, check) end +##### +##### Hessians +##### + +function ForwardDiff.HessianConfig( + model::LinearMixedModel{T}, x::AbstractVector{T}=model.θ, chunk::Chunk=Chunk(x) +) where {T} + return HessianConfig(fd_deviance(model), x, chunk) +end + """ ForwardDiff.hessian(model::LinearMixedModel) @@ -101,9 +115,19 @@ values. $(FORWARDDIFF) """ function ForwardDiff.hessian( - model::LinearMixedModel{T}, θ::Vector{T}=model.θ -) where {T} - return ForwardDiff.hessian(fd_deviance(model), θ) + model::LinearMixedModel{T}, θ::Vector{T}=model.θ, + cfg::HessianConfig=HessianConfig(model, θ), + check::Val{CHK}=Val(true), +) where {T, CHK} + return ForwardDiff.hessian!(similar(model.θ), model, θ, cfg, check) +end + +function ForwardDiff.hessian!(result::AbstractArray, + model::LinearMixedModel{T}, θ::Vector{T}=model.θ, + cfg::HessianConfig=HessianConfig(model, θ), + check::Val{CHK}=Val(true), +) where {T, CHK} + return ForwardDiff.hessian!(result, fd_deviance(model), θ, cfg, check) end ##### From 8e8ee8532ad850bd56f3e67f8e1397def5b665a0 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Tue, 30 Dec 2025 13:04:52 -0600 Subject: [PATCH 08/23] format --- ext/MixedModelsForwardDiffExt.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ext/MixedModelsForwardDiffExt.jl b/ext/MixedModelsForwardDiffExt.jl index de2d80c54..15f850a86 100644 --- a/ext/MixedModelsForwardDiffExt.jl +++ b/ext/MixedModelsForwardDiffExt.jl @@ -84,7 +84,7 @@ function ForwardDiff.gradient( model::LinearMixedModel{T}, θ::Vector{T}=model.θ, cfg::GradientConfig=GradientConfig(model, θ), check::Val{CHK}=Val(true), -) where {T, CHK} +) where {T,CHK} return ForwardDiff.gradient!(similar(model.θ), model, θ, cfg, check) end @@ -92,7 +92,7 @@ function ForwardDiff.gradient!(result::AbstractArray, model::LinearMixedModel{T}, θ::Vector{T}=model.θ, cfg::GradientConfig=GradientConfig(model, θ), check::Val{CHK}=Val(true), -) where {T, CHK} +) where {T,CHK} return ForwardDiff.gradient!(result, fd_deviance(model), θ, cfg, check) end @@ -118,7 +118,7 @@ function ForwardDiff.hessian( model::LinearMixedModel{T}, θ::Vector{T}=model.θ, cfg::HessianConfig=HessianConfig(model, θ), check::Val{CHK}=Val(true), -) where {T, CHK} +) where {T,CHK} return ForwardDiff.hessian!(similar(model.θ), model, θ, cfg, check) end @@ -126,7 +126,7 @@ function ForwardDiff.hessian!(result::AbstractArray, model::LinearMixedModel{T}, θ::Vector{T}=model.θ, cfg::HessianConfig=HessianConfig(model, θ), check::Val{CHK}=Val(true), -) where {T, CHK} +) where {T,CHK} return 
ForwardDiff.hessian!(result, fd_deviance(model), θ, cfg, check) end From 072bd5d25de0c0d41d67d7e3506e25b7bb658e32 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Tue, 30 Dec 2025 13:07:11 -0600 Subject: [PATCH 09/23] NEWS --- NEWS.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/NEWS.md b/NEWS.md index 0a63d124f..eb00af618 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,5 @@ +- Additional methods for pre-allocated result arrays and `*Config` instances have been added to the ForwardDiff extension. [#871]. + MixedModels v5.1.0 Release Notes ============================== - Nesting checks for the likelihoodratio test have been slightly tweaked to be more robust, at the cost of being slightly slower. In particular, the comparison of models with pre-centered variables with those with variables centered via StandardizedPredictors.jl was previously incorrectly rejected as non-nested, but should be correctly accepted as nested now. Additionally, some further logging messages are emitted when a nesting check fails. [#867] @@ -710,3 +712,4 @@ Package dependencies [#864]: https://github.com/JuliaStats/MixedModels.jl/issues/864 [#865]: https://github.com/JuliaStats/MixedModels.jl/issues/865 [#867]: https://github.com/JuliaStats/MixedModels.jl/issues/867 +[#871]: https://github.com/JuliaStats/MixedModels.jl/issues/871 From 03009d15ce84e73685ee40776e4bd7cf64dfc8c3 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Tue, 30 Dec 2025 13:29:37 -0600 Subject: [PATCH 10/23] oops --- ext/MixedModelsForwardDiffExt.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ext/MixedModelsForwardDiffExt.jl b/ext/MixedModelsForwardDiffExt.jl index 15f850a86..149bfe355 100644 --- a/ext/MixedModelsForwardDiffExt.jl +++ b/ext/MixedModelsForwardDiffExt.jl @@ -119,7 +119,8 @@ function ForwardDiff.hessian( cfg::HessianConfig=HessianConfig(model, θ), check::Val{CHK}=Val(true), ) where {T,CHK} - return ForwardDiff.hessian!(similar(model.θ), model, θ, cfg, check) + n = length(θ) + return ForwardDiff.hessian!(Matrix{T}(undef, n, n), model, θ, cfg, check) end function ForwardDiff.hessian!(result::AbstractArray, From 0bfd467925118c03289fcff0cc5766ee4166da4e Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Tue, 30 Dec 2025 13:42:19 -0600 Subject: [PATCH 11/23] docs fix: AoG update --- docs/Project.toml | 1 + docs/src/ecosystem.md | 14 +++++++------- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/docs/Project.toml b/docs/Project.toml index c396b21bc..4857b499c 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -26,6 +26,7 @@ StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" StatsModels = "3eaba693-59b7-5ba5-a881-562e759f1c8d" [compat] +AlgebraOfGraphics = "0.12" BenchmarkTools = "1" DataFrames = "1" Documenter = "1.3" diff --git a/docs/src/ecosystem.md b/docs/src/ecosystem.md index 53d0ba028..82aa5fe45 100644 --- a/docs/src/ecosystem.md +++ b/docs/src/ecosystem.md @@ -8,7 +8,7 @@ Several packages extend the functionality of MixedModels.jl, both in ways specif ```@example Ecosystem using MixedModels -progress = false +progress = isinteractive() ``` ```@example Ecosystem @@ -182,16 +182,16 @@ Effects are particularly nice for visualizing the model fit and its predictions. 
using AlgebraOfGraphics # like ggplot2, but an algebra instead of a grammar using CairoMakie -plt1 = data(eff_logit) * - mapping(:age, :use; color=:anych) * - (visual(Lines) + mapping(; lower=:lower, upper=:upper) * visual(LinesFill)) +plt1 = data(eff_logit) * mapping(:age; color=:anych) * + (mapping(:use) * visual(Lines) + + mapping(:lower, :upper) * visual(Band; alpha=0.3)) draw(plt1) ``` ```@example Ecosystem -plt2 = data(eff_prob) * - mapping(:age, :use; color=:anych => "children") * - (visual(Lines) + mapping(; lower=:lower, upper=:upper) * visual(LinesFill)) +plt2 = data(eff_prob) * mapping(:age; color=:anych) * + (mapping(:use) * visual(Lines) + + mapping(:lower, :upper) * visual(Band; alpha=0.3)) draw(plt2) ``` From 463a7d66bfebd62aaa119bc26d5e0357c1a3842c Mon Sep 17 00:00:00 2001 From: Douglas Bates Date: Sat, 3 Jan 2026 12:32:21 -0600 Subject: [PATCH 12/23] Add information on gradient evaluation --- gradients/GradientEvaluation.qmd | 301 ++++++++++++++++++++-- gradients/Gradient_based_optimization.qmd | 10 +- gradients/Project.toml | 1 + gradients/bibliography.bib | 6 + 4 files changed, 296 insertions(+), 22 deletions(-) diff --git a/gradients/GradientEvaluation.qmd b/gradients/GradientEvaluation.qmd index 776221549..a3ee8ae03 100644 --- a/gradients/GradientEvaluation.qmd +++ b/gradients/GradientEvaluation.qmd @@ -1,5 +1,5 @@ --- -title: "Evaluation of the Gradient of the Profiled log-likelihood" +title: "Gradient of the Profiled log-likelihood" author: - name: Douglas Bates email: dmbates@gmail.com @@ -38,8 +38,8 @@ format: A comparison of algorithms for estimation of variance components given in the supplemental materials for @Zhou03042019 shows the Fisher scoring algorithm taking the fewest iterations to convergence compared to an EM algorithm and the minorization-maximization (MM) algorithm presented in that paper. The model being simulated in @Zhou03042019, sec 3.2 is relatively simple, with random effects for two factors and their interaction in a balanced crossed design. -The approach in [lme4](https://github.com/lme4/lme4) (@bates.maechler.etal:2015) and [MixedModels.jl](https://github.com/JuliaStats/MixedModels.jl) (@bates2025mixed) has been to use a profiled log-likelihood expression, with fewer free parameters than the log-likelihood, and to streamline the evaluation of the profiled log-likelihood. -The optimization itself is performed by a gradient-free optimizer, usually either BOBYQA or NEWUOA from Powell's collection of optimizers. +The approach in [lme4](https://github.com/lme4/lme4) (@bates.maechler.etal:2015) and in [MixedModels.jl](https://github.com/JuliaStats/MixedModels.jl) (@bates2025mixed) has been to use a profiled log-likelihood expression, with fewer free parameters than the log-likelihood, and to streamline the evaluation of the profiled log-likelihood. +The optimization itself is performed by a derivative-free optimizer, usually either BOBYQA or NEWUOA from Powell's collection of optimizers. Expressions for the gradient of the profiled log-likelihood were given in sec. 3.5 of @bates.maechler.etal:2015 but they haven't been implemented in either the `lme4` or the `MixedModels.jl` packages. @@ -69,10 +69,11 @@ $$ {#eq-Sigma} for a lower-triangular *relative covariance factor*, $\boldsymbol{\Lambda_\theta}$, that depends on a *relative covariance parameter vector*, $\boldsymbol{\theta}$. 
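To make the structure of $\boldsymbol{\Lambda_\theta}$ concrete, consider a single grouping factor whose levels each have two correlated random effects.
Then $\boldsymbol{\Lambda_\theta}$ is block diagonal with identical $2\times2$ lower-triangular blocks whose elements are filled from $\boldsymbol{\theta}$ in column-major order.
A sketch with made-up sizes and parameter values:

```{julia}
#| label: lambda_sketch
using LinearAlgebra
θ = [1.2, 0.3, 0.8]                      # lower triangle of one block, column-major
block = LowerTriangular([θ[1] 0.0; θ[2] θ[3]])
Λ = kron(Matrix(1.0I, 3, 3), block)      # Λθ for 3 levels of the grouping factor
```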
In `MixedModels.jl` the profiled log-likelihood, a function of $\boldsymbol{\theta}$ only, is evaluated from the blocked lower Cholesky factor, $\mathbf{L}_\theta$, defined from the relationship

$$
\begin{aligned}
\boldsymbol{\Omega_\theta}&=
\begin{bmatrix}
\boldsymbol{\Lambda_\theta}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda_\theta}+\mathbf{I}&
\boldsymbol{\Lambda_\theta}^\top\mathbf{Z^\top X} &
\boldsymbol{\Lambda_\theta}^\top\mathbf{Z^\top y}\\
\mathbf{X^\top Z}\boldsymbol{\Lambda_\theta} &
\mathbf{X^\top X} &
\mathbf{X^\top y}\\
\mathbf{y^\top Z}\boldsymbol{\Lambda_\theta} &
\mathbf{y^\top X} &
\mathbf{y^\top y}\\
\end{bmatrix}\\
&=\mathbf{L}_\boldsymbol{\theta} \mathbf{L}^\top_\boldsymbol{\theta}\\
&=
\begin{bmatrix}
\mathbf{L_{ZZ}} & \mathbf{0} & \mathbf{0} \\
\mathbf{L_{XZ}} & \mathbf{L_{XX}} & \mathbf{0} \\
\mathbf{l_{yZ}} & \mathbf{l_{yX}} & \ell_{\mathbf{yy}}
\end{bmatrix}
\begin{bmatrix}
\mathbf{L_{ZZ}} & \mathbf{0} & \mathbf{0} \\
\mathbf{L_{XZ}} & \mathbf{L_{XX}} & \mathbf{0} \\
\mathbf{l_{yZ}} & \mathbf{l_{yX}} & \ell_{\mathbf{yy}}
\end{bmatrix}^\top
\end{aligned}
$$ {#eq-blockedOmega}

where the diagonal elements of $\mathbf{L}_\theta$ are positive.

(In the `MixedModels.jl` implementation the blocked Cholesky factor has a slightly different pattern of blocks in which the "X rows" and the "y row" are amalgamated into dense blocks and the column associated with $\mathbf{Z}$ is split into one or more columns according to the grouping factors determining the random effects, as shown in the examples below.)

The objective to be optimized, on the scale of the deviance, is negative twice the profiled log-likelihood,

$$
\begin{aligned}
-2\mathcal{L}(\boldsymbol{\theta}|\mathbf{y})&=
\log\left|\mathbf{L_{ZZ}}\right|^2 + n \left[1 + \log\left(\frac{2\pi\ell^2_{\mathbf{yy}}}{n}\right)\right]\\
&=\log\left|\mathbf{L_{ZZ}}\right|^2 + n\log\ell^2_{\mathbf{yy}} + c
\end{aligned}
$$ {#eq-objective}

where $c=n\left[1+\log\left(2\pi/n\right)\right]$ does not depend on $\boldsymbol{\theta}$.

As shown in @bates.maechler.etal:2015, sec 3.5, the gradient of the first summand in @eq-objective is

$$
\begin{aligned}
\nabla\log\left|\mathbf{L_{ZZ}}\right|^2 &= \nabla\log\left(\left|\mathbf{L_{ZZ}L_{ZZ}}^\top\right|\right)\\
&=\nabla\log\left(\left|\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda}+\mathbf{I}\right|\right)\\
&=\operatorname{tr}\left[\nabla\left(\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda}\right)
\left(\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda}+\mathbf{I}\right)^{-1}\right]\\
&=\operatorname{tr}\left[\mathbf{L_{ZZ}}^{-1}
\nabla\left(\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda}\right)
\mathbf{L_{ZZ}}^{-\top}
\right]\\
&=\operatorname{tr}\left[\mathbf{L_{ZZ}}^{-1}
\left(\nabla\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda}+
\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\nabla\boldsymbol{\Lambda}\right)
\mathbf{L_{ZZ}}^{-\top}
\right]
\end{aligned}
$$ {#eq-delterm1}

For the models that we wish to consider, the partial derivatives of $\boldsymbol{\Lambda_\theta}$ with respect to the components of $\boldsymbol{\theta}$ are particularly simple.
The partial derivatives are zero except for a single diagonal block, which is an identity matrix.

### General expressions for differentiating a Cholesky factor

@murray2016differentiation, section 3.1, provides a general approach to differentiating the Cholesky factor by differentiating both sides of @eq-blockedOmega.

Repeating his derivation, with minor changes in notation, we express the relationship between the infinitesimals $d\boldsymbol{\Omega}$ and $d\mathbf{L}$ as

$$
d\boldsymbol{\Omega}=d\mathbf{L}\mathbf{L}^\top + \mathbf{L} d\mathbf{L}^\top
$$ {#eq-infinitesimal}

Pre-multiplying @eq-infinitesimal by $\mathbf{L}^{-1}$ and post-multiplying by $\mathbf{L}^{-\top}$ gives

$$
\mathbf{L}^{-1}d\boldsymbol{\Omega}\mathbf{L}^{-\top}=\mathbf{L}^{-1}d\mathbf{L} + d\mathbf{L}^\top\mathbf{L}^{-\top}
$$ {#eq-LOmegaLT}

The first addend on the right-hand side of @eq-LOmegaLT is lower triangular and the second addend is the transpose of the first.
We wish to isolate the first addend, $\mathbf{L}^{-1}d\mathbf{L}$, which we do with the $\Phi$ transformation: applied to a symmetric matrix, $\Phi$ preserves the strict lower triangle, halves the diagonal elements, and zeros out the strict upper triangle.
Applied to the right-hand side of @eq-LOmegaLT, the $\Phi$ transformation isolates the first addend, providing

$$
\Phi\left(\mathbf{L}^{-1}d\boldsymbol{\Omega}\mathbf{L}^{-\top}\right)=\mathbf{L}^{-1}d\mathbf{L}
$$ {#eq-Phi_dOmega}

or

$$
d\mathbf{L}=\mathbf{L}\Phi\left(\mathbf{L}^{-1}d\boldsymbol{\Omega}\mathbf{L}^{-\top}\right)
$$ {#eq-dL}

As we shall see, because we only need the derivatives of the logarithms of the diagonal elements of $\mathbf{L}$ to obtain the gradient of the profiled log-likelihood, and because $\mathbf{L}$ is lower triangular, we can stop at @eq-LOmegaLT.
Furthermore, several of the blocks in $\mathbf{L}^{-1}d\boldsymbol{\Omega}\mathbf{L}^{-\top}$ are either zero or exactly equal to the corresponding block of $\boldsymbol{\Omega_\theta}$.

## Examples

To aid in understanding the structure of these equations we consider the structure of the various matrices and their blocks in some simple examples.

Load the packages to be used

```{julia}
#| label: load_packages
#| warning: false
#| output: false
using CairoMakie
using FiniteDiff
using LinearAlgebra
using MixedModels
using MixedModelsDatasets: dataset
using SparseArrays # for the `sparse` constructor used below
using TypedTables: Table

const progress = isinteractive() # to suppress progress bars in non-interactive sessions
CairoMakie.activate!(type="svg") # use svg graphics output
```

### Dyestuff - a single, scalar random-effects term

The `dyestuff` data set provides the yield of dyestuff in each of 5 samples from each of 6 batches of an intermediate product in the process of producing a dye.

```{julia}
#| label: dyestuff_data
dyestuff = Table(dataset(:dyestuff))
```

A mixed-effects model for these data includes an overall "intercept" term (whose estimate will be the sample mean because of the balanced design) and random effects for each level of `batch`.

```{julia}
#| label: dyestuff_model
m01 = fit(MixedModel, @formula(yield ~ 1 + (1|batch)), dyestuff; progress)
θ = m01.θ # retain a copy of the estimate of θ
print(m01)
```

#### The objective as a function of $\theta_1$

@fig-obj_graph shows the objective (negative twice the log-likelihood) of this model as a function of $\theta_1$, the relative covariance parameter.
+ +```{julia} +#| label: fig-obj_graph +#| fig-cap: "Graph of the objective for model m01 as a function of θ₁" +#| code-fold: true +let f = Figure() + ax = Axis(f[1,1], xlabel="θ₁", ylabel="objective") + lines!(ax, -0.25..1.5, objective!(m01)) + f +end +``` + +Notice that the objective is well-defined for negative values of $\theta_1$ and that it is an even function, in the sense that $f(-\theta_1)=f(\theta_1)\,\forall\theta_1$. + +This means that $\theta_1=0$ will always be a critical value (have a derivative of zero) for this function. + +The maximum likelihood estimate (i.e. the minimizer of the objective) of $\boldsymbol{\theta}$ for this model is + +```{julia} +#| label: dyestuff_theta +updateL!(setθ!(m01, θ)) # restore the estimate of θ and the Cholesky factor +θ +``` + +#### Evaluating the gradient terms + +Both $\mathbf{A}$ and $\mathbf{L}$ are stored as blocked matrices with blocks in the pattern + +```{julia} +BlockDescription(m01) +``` + +In @eq-Sigma the matrix $\boldsymbol{\Lambda_\theta}$ is defined as a $6\times6$ diagonal matrix. +Here, for convenience, we extend it to an $8\times8$ matrix with a trailing diagonal block that is the identity, so that multiplication by $\boldsymbol{\Lambda_\theta}$ applies to the full matrix $\mathbf{A}$. + +```{julia} +Λ(θ) = Diagonal(vcat(fill(only(θ), 6), ones(eltype(θ), 2))) +Λ(θ) +``` + +The derivative of $\Lambda$ with respect to the first (and only) element of $\boldsymbol\theta$ is + +```{julia} +dΛdθ1 = Diagonal(vcat(ones(6), zeros(2))) +``` + +The matrix $\mathbf{A}$ from which $\boldsymbol{\Omega_\theta}$ is generated is stored in blocks. +We assemble these into a sparse matrix as + +```{julia} +A = sparse(hvcat(2, first(m01.A), m01.A[2]', m01.A[2], last(m01.A))) +``` + +which could be generated as the [Gram matrix](https://en.wikipedia.org/wiki/Gram_matrix) (i.e. the matrix of the form $\mathbf{X}^\top\mathbf{X}$ for any $\mathbf{X}$) of the columns of + +```{julia} +ZXy = hcat(collect(only(m01.reterms)), m01.X, m01.y) +Int.(ZXy) # Int for more concise printing +``` + +::: {.callout-note collapse=true} +#### Printing Int. of a matrix with integer entries + +When we know that all the entries in a floating point matrix, `X`, will be integers, we convert it to integer elements as `Int.(X)` to save space in the output. +::: + +```{julia} +A == ZXy'ZXy +``` + +The matrix $\boldsymbol{\Omega_\theta}$ is + +```{julia} +Ω(θ) = Λ(θ) * A * Λ(θ)' + Diagonal(vcat(ones(6), zeros(2))) +Ω(θ) +``` + +The lower Cholesky factor, $\mathbf{L}_\boldsymbol{\theta}$, which is stored in three blocks as described above, can be extracted as a sparse matrix with + +```{julia} +L = LowerTriangular(sparseL(m01; full=true)) +``` + +We can check that it is indeed the lower Cholesky factor + +```{julia} +L * L' ≈ Ω(θ) +``` + +The derivative of $\boldsymbol{\Omega}$ with respect to $\theta$ is + +```{julia} +dΩdθ1(θ) = dΛdθ1 * A * Λ(θ)' + Λ(θ) * A * dΛdθ1 +dΩdθ1(θ) +``` + +Notice that this matrix, like $\mathbf{A}$ is symmetric and has the same block structure as $\mathbf{A}$. +In fact, the $[2,1]$ block of this matrix is the same as the $[2,1]$ block of $\mathbf{A}$. 
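+
+As a quick numerical check (a sketch only; the central-difference step size `1e-6` is an ad hoc choice, not part of the package API), we can compare this analytic derivative to a finite difference of `Ω`:
+
+```{julia}
+let h = 1e-6  # compare the analytic derivative of Ω to a central difference
+    (Ω(θ .+ h) - Ω(θ .- h)) ./ 2h ≈ dΩdθ1(θ)
+end
+```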
+ +Premultiplying by $\mathbf{L}^{-1}$ and postmultiplying by $\mathbf{L}^{-\top}$ is equivalent to + +```{julia} +prePhi = rdiv!(ldiv!(L, dΩdθ1(θ)), L') +``` + +We note that @eq-delterm1 is the sum of the first 6 diagonal elements of `prePhi`, + +```{julia} +sum(prePhi[i,i] for i in 1:6) +``` + +which should equal + +```{julia} +ldfun(x::Float64) = logdet(updateL!(setθ!(m01, [x]))) +FiniteDiff.finite_difference_derivative(ldfun, only(θ)) +``` + +Similarly the gradient of the other non-constant term in @eq-logdet is + +```{julia} +size(m01.y, 1) * last(prePhi) ``` +compared to + +```{julia} +n_log_lyy_sq(x::Float64) = length(m01.y) * log(abs2(last(last(updateL!(setθ!(m01, [x])).L)))) +n_log_lyy_sq(only(θ)) +``` + +```{julia} +FiniteDiff.finite_difference_derivative(n_log_lyy_sq, only(θ)) +``` + +```{julia} +#| output: false +updateL!(setθ!(m01, θ)) # reset the value of θ in the model +``` + +If we wish to continue the evaluation of all of the elements of $d\mathbf{L}/d\theta_1$ we would use +the $\Phi$ transformation and premultiplication by $\mathbf{L}$ + +```{julia} +function Φ(S) + val = tril(S) # extract the lower triangle + for i in diagind(val) + val[i] *= 0.5 # halve the diagonal elements + end + return LowerTriangular(val) +end +dLdθ1 = L * Φ(prePhi) +``` + +We can check these results against results from finite difference methods. +First we check the $[1,1]$ entry of $\frac{d\mathbf{L}}{d\theta}$ and its derivative + +```{julia} +l11(θ::T) where {T<:Real} = first(first(updateL!(setθ!(m01, [θ])).L)) +l11(only(θ)) +``` + +and its finite-difference derivative + +```{julia} +FiniteDiff.finite_difference_derivative(l11, only(θ)) +``` + +which matches the leading diagonal elements of `dLdθ1`. + +Next we check the last diagonal element of $\mathbf{L}$, which is the square root of the penalized residual sum-of-squares. +```{julia} +rtprss(θ::T) where {T<:Real} = last(last(updateL!(setθ!(m01, [θ])).L)) +rtprss(only(θ)) +``` + +```{julia} +FiniteDiff.finite_difference_derivative(rtprss, only(θ)) +``` + +These calculations verify the values of diagonal elements of $\frac{d\mathbf{L}}{d\theta}$ that will be used to evaluate the gradient of the profiled log-likelihood. + +The derivative of the objective is more easily obtained by rearranging the terms in the objective to isolate constants. +First, because the determinant of a triangular matrix is the product of its diagonal elements, and hence the log of the determinant is the sum of the logs of its diagonal elements, + +$$ +\begin{aligned} +\frac{\partial\log\left|\mathbf{L_{ZZ}}\right|^2}{\partial\theta_i}&= +2\frac{\partial\sum_{j=1}^q \log\left(L_{jj}\right)}{\partial\theta_i}\\ +&=2\sum_{j=1}^q\frac{1}{L_{jj}}\frac{\partial{L_{jj}}}{\partial\theta_i} +\end{aligned} +$$ {#eq-logdet} + +Again, because $\mathbf{L}$ is lower triangular, the division by $L_{jj}$ in the summands of @eq-logdet exactly cancels the effect of multiplication on the left by $\mathbf{L}$ in @eq-dL in these diagonal terms. +Also the multiplier of $2$ in @eq-logdet cancels the halving of the diagonal in @eq-Phi_dOmega, leaving only @eq-delterm1. + ### Penicillin - two completely crossed scalar random-effects terms The `penicillin` dataset in `MixedModelsDatasets.jl` contains 144 measurements of the `diameter` of the cleared area for each of six `sample`s of penicillin on each of 24 `plate`s. 
diff --git a/gradients/Gradient_based_optimization.qmd b/gradients/Gradient_based_optimization.qmd index 06b27c2f9..d3167307f 100644 --- a/gradients/Gradient_based_optimization.qmd +++ b/gradients/Gradient_based_optimization.qmd @@ -97,9 +97,13 @@ end Define a model for the `penicillin` data ```{julia} -#| label: const_defs -#| output: false -m1 = fit(MixedModel, @formula(diameter ~ 1 + (1|plate) + (1|sample)), dataset(:penicillin); progress) +#| label: m1 +m1 = fit( + MixedModel, + @formula(diameter ~ 1 + (1|plate) + (1|sample)), + dataset(:penicillin); + progress, +) print(m1) ``` diff --git a/gradients/Project.toml b/gradients/Project.toml index 220557789..a05f9e152 100644 --- a/gradients/Project.toml +++ b/gradients/Project.toml @@ -1,5 +1,6 @@ [deps] BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" +CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0" FiniteDiff = "6a86dc24-6348-571c-b903-95158fe2bd41" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" diff --git a/gradients/bibliography.bib b/gradients/bibliography.bib index 2397e3095..25109d8e4 100644 --- a/gradients/bibliography.bib +++ b/gradients/bibliography.bib @@ -34,3 +34,9 @@ @article{bates2025mixed year={2025} } +@article{murray2016differentiation, + title={Differentiation of the Cholesky decomposition}, + author={Murray, Iain}, + journal={arXiv preprint arXiv:1602.07527}, + year={2016} +} From ab0a7cb9b865d731baffd71cb01edcac4d25376f Mon Sep 17 00:00:00 2001 From: Douglas Bates Date: Mon, 5 Jan 2026 18:54:25 -0600 Subject: [PATCH 13/23] Short-cut method of gradient evaluation --- gradients/GradientEvaluation.qmd | 383 +++++++++---------------------- gradients/bibliography.bib | 6 +- 2 files changed, 113 insertions(+), 276 deletions(-) diff --git a/gradients/GradientEvaluation.qmd b/gradients/GradientEvaluation.qmd index a3ee8ae03..ac652277d 100644 --- a/gradients/GradientEvaluation.qmd +++ b/gradients/GradientEvaluation.qmd @@ -43,18 +43,17 @@ The optimization itself is performed by a derivative-free optimizer, usually eit Expressions for the gradient of the profiled log-likelihood were given in sec. 3.5 of @bates.maechler.etal:2015 but they haven't been implemented in either the `lme4` or the `MixedModels.jl` packages. -The purpose of this note is to explore whether these expressions can be implemented effectively, even if just for the variance components model, which, for our purposes, is a model in which all the random effects terms are simple, scalar terms. +The purpose of this note is to provide an alternative derivation for the gradient of the profiled log-likelihood and of the REML criterion for linear mixed-effects models, along with concise algorithms for evaluation of these gradients. 
-### Expressions for the gradient terms +### Model definition and evaluation of the objective and gradient The linear mixed-effects models we consider are defined by the unconditional distribution of the $q$-dimensional random-effects vector, $\mathbfcal{B}$, and the conditional distribution of the $n$-dimensional response vector, $\mathbfcal{Y}$, given $\mathbfcal{B}=\mathbf{b}$, as $$ \begin{aligned} + \mathbfcal{B}&\sim\mathbfcal{N}(\mathbf{0}, \boldsymbol{\Sigma})\\ (\mathbfcal{Y}|\mathbfcal{B}=\mathbf{b})& - \sim\mathbfcal{N}\left(\mathbf{X}\boldsymbol{\beta}+\mathbf{Z}\mathbf{b},\sigma^2\mathbf{I}\right)\\ - \mathbfcal{B}& - \sim\mathbfcal{N}(\mathbf{0}, \boldsymbol{\Sigma}) + \sim\mathbfcal{N}\left(\mathbf{X}\boldsymbol{\beta}+\mathbf{Z}\mathbf{b},\sigma^2\mathbf{I}\right) \end{aligned} $$ {#eq-dists} @@ -69,7 +68,7 @@ $$ {#eq-Sigma} for a lower-triangular *relative covariance factor*, $\boldsymbol{\Lambda_\theta}$, that depends on a *relative covariance parameter vector*, $\boldsymbol{\theta}$. -In `MixedModels.jl` the profiled log-likelihood, a function of $\boldsymbol{\theta}$ only, is evaluated from the blocked lower Cholesky factor, $\mathbf{L}_\theta$, defined from the relationship +In `MixedModels.jl` the profiled log-likelihood, a function of $\boldsymbol{\theta}$ only, is evaluated from the blocked lower-triangular Cholesky factor, $\mathbf{L}_\theta$, defined from the relationship $$ \begin{aligned} @@ -100,48 +99,43 @@ $$ \end{aligned} $$ {#eq-blockedOmega} -where the diagonal elements of $\mathbf{L}_\theta$ are positive. +where the diagonal elements of $\mathbf{L}_\theta$ are chosen to be positive. +(It is assumed that $\mathbf{X}$ has full column rank and that $\mathbf{y}$ is not in the column span of $\mathbf{X}$.) -(In the `MixedModels.jl` implementation the blocked Cholesky factor has a slightly different pattern of blocks in which the "X rows" and the "y row" are amalgamated into dense blocks and the column associated with $\mathbf{Z}$ is split into one or more columns according to the grouping factors determining the random effects, as shown in the examples below.) +(In `MixedModels.jl` the blocked Cholesky factor has a slightly different pattern of blocks in which the "X rows" and the "y row" are amalgamated into dense blocks and the column associated with $\mathbf{Z}$ is split into one or more columns according to the grouping factors determining the random effects, as shown in the examples below.) -The objective to be optimized, on the scale of the deviance, is negative twice the profiled log-likelihood, +As shown in @bates2025mixed, the objective to be optimized, on the scale of the deviance, which is negative twice the profiled log-likelihood, can be expressed as $$ \begin{aligned} -2\mathcal{L}(\boldsymbol{\theta}|\mathbf{y})&= \log\left|\mathbf{L_{ZZ}}\right|^2 + n \left[1 + \log\left(\frac{2\pi\ell^2_{\mathbf{yy}}}{n}\right)\right]\\ -&=\log\left|\mathbf{L_{ZZ}}\right|^2 + n\log\ell^2_{\mathbf{yy}} + c +&=\log\left|\mathbf{L_{ZZ}}\right|^2 + n\log\ell^2_{\mathbf{yy}} + c_\ell\\ +&=2\sum_{j=1}^q\log L_{j,j} + 2n \log L_{q+p+1,q+p+1} + c_\ell \end{aligned} $$ {#eq-objective} -where $c$ is a constant. +where $c_\ell$ is a constant. +That is, the objective is an affine function (a linear function plus a constant) of the logarithms of the diagonal elements of $\mathbf{L}_\boldsymbol{\theta}$. 
+It happens that the gradient of the objective, as a function of $\boldsymbol{\theta}$, expressed in this form is straightforward to evaluate, as shown in @sec-Cholesky_derivative.
 
-As shown in @bates.maechler.etal:2015, sec 3.5 the gradient of the first summand in @eq-objective is
+As shown in @bates.maechler.etal:2015, sec. 3.4, the REML criterion, which some prefer for parameter estimation, can be written as
 
 $$
 \begin{aligned}
-\nabla\log\left|\mathbf{L_ZZ}\right|^2 &= \nabla\log\left(\left|\mathbf{L_{ZZ}L_{ZZ}}^\top\right|\right)\\
-&=\nabla\log\left(\left|\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda}+\mathbf{I}\right|\right)\\
-&=\operatorname{tr}\left[\nabla\left(\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda}\right)
-\left(\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda}+\mathbf{I}\right)^{-1}\right]\\
-&=\operatorname{tr}\left[\mathbf{L_{ZZ}}^{-1}
-\nabla\left(\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda}\right)
-\mathbf{L_{ZZ}}^{-\top}
-\right]\\
-&=\operatorname{tr}\left[\mathbf{L_{ZZ}}^{-1}
-\left(\nabla\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda}+
-\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\nabla\boldsymbol{\Lambda}\right)
-\mathbf{L_{ZZ}}^{-\top}
-\right]
+-2\mathcal{L}_R(\boldsymbol{\theta}|\mathbf{y})&=
+\log\left(\left|\mathbf{L_{ZZ}}\right|^2\left|\mathbf{L_{XX}}\right|^2\right) + (n-p) \left[1 + \log\left(\frac{2\pi\ell^2_{\mathbf{yy}}}{n-p}\right)\right]\\
+&=\log\left|\mathbf{L_{ZZ}}\right|^2 + \log\left|\mathbf{L_{XX}}\right|^2 + (n-p)\log\ell^2_{\mathbf{yy}} + c_r\\
+&=2\sum_{j=1}^{q+p}\log L_{j,j} + 2(n-p)\log L_{q+p+1,q+p+1} + c_r
 \end{aligned}
-$$ {#eq-delterm1}
+$$ {#eq-remlobjective}
 
-For the models that we wish to consider the partial derivatives of $\boldsymbol{\Lambda_\theta}$ with respect to the components of $\boldsymbol{\theta}$ are particularly simple.
-The partial derivatives are zeroes except for a single diagonal block, which is an identity matrix.
+where $c_r$ is likewise a constant.
+This is also an affine function of the logarithms of the diagonal elements of $\mathbf{L_{ZZ}}$.
 
-### General expressions for differentiating a Cholesky factor
+### General expressions for differentiating a Cholesky factor {#sec-Cholesky_derivative}
 
-@murray2016differentiation section 3.1 provides a general approach to differentiating the Cholesky factor by differentiating both sides of @eq-blockedOmega.
+@murray2016differentiation, section 3.1, provides a general approach to differentiating the Cholesky factor by differentiating both sides of @eq-blockedOmega.
 
 Repeating his derivation, with minor changes in notation, we express the relationship between the infinitesimals $d\boldsymbol{\Omega}$ and $d\mathbf{L}$ as
 
@@ -156,7 +150,12 @@ $$
 $$ {#eq-LOmegaLT}
 
 The first addend on the right-hand side of @eq-LOmegaLT is lower triangular and the second addend is the transpose of the first.
-We wish to isolate the first addend, $\mathbf{L}^{-1}d\mathbf{L}$, which we do with the $\Phi$ transformation applied to a symmetric matrix, which preserves the strict lower triangle, halves the diagonal elements, and zeros out the strict upper triangle.
+Thus, the diagonal of the left-hand side is exactly the result we wish to evaluate, twice the infinitesimal of the logarithms of the diagonal elements of $\mathbf{L}$.
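+Explicitly, because $\mathbf{L}^{-1}d\mathbf{L}$ is lower triangular with $j$th diagonal element $dL_{j,j}/L_{j,j}$,
+
+$$
+\left[\mathbf{L}^{-1}d\boldsymbol{\Omega}\mathbf{L}^{-\top}\right]_{j,j}=
+2\left[\mathbf{L}^{-1}d\mathbf{L}\right]_{j,j}=
+\frac{2\,dL_{j,j}}{L_{j,j}}=2\,d\log L_{j,j} .
+$$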
+ +For completeness, we provide the conclusion of the derivation in @murray2016differentiation but we don't need the more general result of $d\mathbf{L}$ - we only need the particular result from the left-hand side of @eq-LOmegaLT. + +To evaluate $d\mathbf{L}$ we must isolate the first addend, $\mathbf{L}^{-1}d\mathbf{L}$, on the right-hand side of @eq-LOmegaLT, which we do with the $\Phi$ transformation applied to a symmetric matrix. +This transformation preserves the strict lower triangle, halves the diagonal elements, and zeros out the strict upper triangle. Applied to the right-hand side of @eq-LOmegaLT, the $\Phi$ transformation isolates the first addend, providing $$ @@ -169,9 +168,6 @@ $$ d\mathbf{L}=\mathbf{L}\Phi\left(\mathbf{L}^{-1}d\boldsymbol{\Omega}\mathbf{L}^{-\top}\right) $$ {#eq-dL} -As we shall see, because we only need the derivative of the logarithms of the diagonal elements of $d\mathbf{L}$ to obtain the gradient of the profiled log-likelihood, and because $\mathbf{L}$ is lower triangular, we can stop at @eq-LOmegaLT. -Furthermore, several of the blocks in $\mathbf{L}^{-1}d\boldsymbol{\Omega}\mathbf{L}^{-\top}$ are either zero or exactly equal to the corresponding block of $\boldsymbol{\Omega_\theta}$. - ## Examples To aid in understanding the structure of these equations we consider the structure of the various matrices and their blocks in some simple examples. @@ -189,7 +185,7 @@ using MixedModels using MixedModelsDatasets: dataset using TypedTables: Table -const progress = isinteractive() # to suppress progress bars in non-interactive sessions +const progress = isinteractive() # suppress progress bars in non-interactive sessions CairoMakie.activate!(type="svg") # use svg graphics output ``` @@ -230,16 +226,35 @@ Notice that the objective is well-defined for negative values of $\theta_1$ and This means that $\theta_1=0$ will always be a critical value (have a derivative of zero) for this function. -The maximum likelihood estimate (i.e. the minimizer of the objective) of $\boldsymbol{\theta}$ for this model is +At the maximum likelihood estimate (i.e. the minimizer of the objective) of $\theta_1$ ```{julia} #| label: dyestuff_theta -updateL!(setθ!(m01, θ)) # restore the estimate of θ and the Cholesky factor -θ +only(θ) +``` + +the derivative should also be zero (in practice, close to zero). + +We can see from @fig-obj_graph that the derivative at $\theta_1=1.$ will be positive. + +To show the evaluation of the gradient at $\boldsymbol{\theta}=[1.]$ we reset the parameter in the model object to $[1.]$ + +```{julia} +#| output: false +updateL!(setθ!(m01, ones(1))); ``` #### Evaluating the gradient terms +In `MixedModels.jl` the [Gram matrix](https://en.wikipedia.org/wiki/Gram_matrix) (i.e. the matrix of the form $\mathbf{X}^\top\mathbf{X}$ for any $\mathbf{X}$) of the columns of + +```{julia} +ZXy = hcat(collect(only(m01.reterms)), m01.X, m01.y) +Int.(ZXy) # Int for more concise printing +``` + +is stored as the $\mathbf{A}$ property of the model object. 
+ Both $\mathbf{A}$ and $\mathbf{L}$ are stored as blocked matrices with blocks in the pattern ```{julia} @@ -251,7 +266,7 @@ Here, for convenience, we extend it to an $8\times8$ matrix with a trailing diag ```{julia} Λ(θ) = Diagonal(vcat(fill(only(θ), 6), ones(eltype(θ), 2))) -Λ(θ) +Λ([1.]) ``` The derivative of $\Lambda$ with respect to the first (and only) element of $\boldsymbol\theta$ is @@ -267,19 +282,7 @@ We assemble these into a sparse matrix as A = sparse(hvcat(2, first(m01.A), m01.A[2]', m01.A[2], last(m01.A))) ``` -which could be generated as the [Gram matrix](https://en.wikipedia.org/wiki/Gram_matrix) (i.e. the matrix of the form $\mathbf{X}^\top\mathbf{X}$ for any $\mathbf{X}$) of the columns of - -```{julia} -ZXy = hcat(collect(only(m01.reterms)), m01.X, m01.y) -Int.(ZXy) # Int for more concise printing -``` - -::: {.callout-note collapse=true} -#### Printing Int. of a matrix with integer entries - -When we know that all the entries in a floating point matrix, `X`, will be integers, we convert it to integer elements as `Int.(X)` to save space in the output. -::: - +and check that these blocks are indeed the Gram matrix of the columns of $\mathbf{[ZXy]}$ ```{julia} A == ZXy'ZXy ``` @@ -288,10 +291,10 @@ The matrix $\boldsymbol{\Omega_\theta}$ is ```{julia} Ω(θ) = Λ(θ) * A * Λ(θ)' + Diagonal(vcat(ones(6), zeros(2))) -Ω(θ) +Ω([1.]) ``` -The lower Cholesky factor, $\mathbf{L}_\boldsymbol{\theta}$, which is stored in three blocks as described above, can be extracted as a sparse matrix with +The lower Cholesky factor, $\mathbf{L}_\boldsymbol{\theta}$, which is stored in three blocks as described above, can be extracted as a sparse matrix as ```{julia} L = LowerTriangular(sparseL(m01; full=true)) @@ -300,123 +303,58 @@ L = LowerTriangular(sparseL(m01; full=true)) We can check that it is indeed the lower Cholesky factor ```{julia} -L * L' ≈ Ω(θ) +L * L' ≈ Ω([1.]) ``` The derivative of $\boldsymbol{\Omega}$ with respect to $\theta$ is ```{julia} dΩdθ1(θ) = dΛdθ1 * A * Λ(θ)' + Λ(θ) * A * dΛdθ1 -dΩdθ1(θ) +dΩdθ1([1.]) ``` -Notice that this matrix, like $\mathbf{A}$ is symmetric and has the same block structure as $\mathbf{A}$. +Notice that this matrix, like $\mathbf{A}$, is symmetric and has the same block structure as $\mathbf{A}$. In fact, the $[2,1]$ block of this matrix is the same as the $[2,1]$ block of $\mathbf{A}$. 
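+
+We can confirm this claim numerically (a quick sketch; in this small example rows `7:8` of the assembled matrices are the rows for $\mathbf{X}$ and $\mathbf{y}$):
+
+```{julia}
+dΩdθ1([1.])[7:8, 1:6] == A[7:8, 1:6]
+```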
Premultiplying by $\mathbf{L}^{-1}$ and postmultiplying by $\mathbf{L}^{-\top}$ is equivalent to ```{julia} -prePhi = rdiv!(ldiv!(L, dΩdθ1(θ)), L') -``` - -We note that @eq-delterm1 is the sum of the first 6 diagonal elements of `prePhi`, - -```{julia} -sum(prePhi[i,i] for i in 1:6) +prePhi = rdiv!(ldiv!(L, dΩdθ1([1.])), L') ``` -which should equal +The derivative of the objective at $\theta_1=1$ is, therefore ```{julia} -ldfun(x::Float64) = logdet(updateL!(setθ!(m01, [x]))) -FiniteDiff.finite_difference_derivative(ldfun, only(θ)) +dot(vcat(ones(6), 0., size(dyestuff, 1)), diag(prePhi)) ``` -Similarly the gradient of the other non-constant term in @eq-logdet is +which we can compare to the finite-difference evaluation ```{julia} -size(m01.y, 1) * last(prePhi) -``` - -compared to - -```{julia} -n_log_lyy_sq(x::Float64) = length(m01.y) * log(abs2(last(last(updateL!(setθ!(m01, [x])).L)))) -n_log_lyy_sq(only(θ)) -``` - -```{julia} -FiniteDiff.finite_difference_derivative(n_log_lyy_sq, only(θ)) +FiniteDiff.finite_difference_gradient(objective!(m01), [1.]) ``` +If we repeat these steps at the parameter estimate we have ```{julia} #| output: false updateL!(setθ!(m01, θ)) # reset the value of θ in the model +L = LowerTriangular(sparseL(m01; full=true)) +prePhi = rdiv!(ldiv!(L, dΩdθ1(θ)), L') ``` -If we wish to continue the evaluation of all of the elements of $d\mathbf{L}/d\theta_1$ we would use -the $\Phi$ transformation and premultiplication by $\mathbf{L}$ - -```{julia} -function Φ(S) - val = tril(S) # extract the lower triangle - for i in diagind(val) - val[i] *= 0.5 # halve the diagonal elements - end - return LowerTriangular(val) -end -dLdθ1 = L * Φ(prePhi) -``` - -We can check these results against results from finite difference methods. -First we check the $[1,1]$ entry of $\frac{d\mathbf{L}}{d\theta}$ and its derivative - -```{julia} -l11(θ::T) where {T<:Real} = first(first(updateL!(setθ!(m01, [θ])).L)) -l11(only(θ)) -``` - -and its finite-difference derivative - -```{julia} -FiniteDiff.finite_difference_derivative(l11, only(θ)) -``` - -which matches the leading diagonal elements of `dLdθ1`. - -Next we check the last diagonal element of $\mathbf{L}$, which is the square root of the penalized residual sum-of-squares. -```{julia} -rtprss(θ::T) where {T<:Real} = last(last(updateL!(setθ!(m01, [θ])).L)) -rtprss(only(θ)) -``` +with the derivative being evaluated as ```{julia} -FiniteDiff.finite_difference_derivative(rtprss, only(θ)) +dot(vcat(ones(6), 0., size(dyestuff, 1)), diag(prePhi)) ``` -These calculations verify the values of diagonal elements of $\frac{d\mathbf{L}}{d\theta}$ that will be used to evaluate the gradient of the profiled log-likelihood. - -The derivative of the objective is more easily obtained by rearranging the terms in the objective to isolate constants. -First, because the determinant of a triangular matrix is the product of its diagonal elements, and hence the log of the determinant is the sum of the logs of its diagonal elements, - -$$ -\begin{aligned} -\frac{\partial\log\left|\mathbf{L_{ZZ}}\right|^2}{\partial\theta_i}&= -2\frac{\partial\sum_{j=1}^q \log\left(L_{jj}\right)}{\partial\theta_i}\\ -&=2\sum_{j=1}^q\frac{1}{L_{jj}}\frac{\partial{L_{jj}}}{\partial\theta_i} -\end{aligned} -$$ {#eq-logdet} - -Again, because $\mathbf{L}$ is lower triangular, the division by $L_{jj}$ in the summands of @eq-logdet exactly cancels the effect of multiplication on the left by $\mathbf{L}$ in @eq-dL in these diagonal terms. 
-Also the multiplier of $2$ in @eq-logdet cancels the halving of the diagonal in @eq-Phi_dOmega, leaving only @eq-delterm1. - ### Penicillin - two completely crossed scalar random-effects terms The `penicillin` dataset in `MixedModelsDatasets.jl` contains 144 measurements of the `diameter` of the cleared area for each of six `sample`s of penicillin on each of 24 `plate`s. ```{julia} #| label: penicillin_data -const penicillin = Table(dataset(:penicillin)) +penicillin = Table(dataset(:penicillin)) ``` We construct a `LinearMixedModel` struct with a single fixed-effect parameter, representing the average diameter in the balanced design, and random effects for each `plate` and each `sample`, @@ -425,7 +363,9 @@ We construct a `LinearMixedModel` struct with a single fixed-effect parameter, r #| label: m02 #| output: false #| warn: false -m02 = LinearMixedModel(@formula(diameter ~ 1 + (1|plate) + (1|sample)), penicillin) +m02 = fit(MixedModel, @formula(diameter ~ 1 + (1|plate) + (1|sample)), penicillin) +θ = m02.θ +print(m02) ``` for which the concatenated matrix $\left[\mathbf{ZXy}\right]$ is @@ -441,10 +381,10 @@ The Cholesky factor, $\mathbf{L}$, at the initial value $\boldsymbol\theta=\left ```{julia} #| label: m02L -Lsparse = LowerTriangular(sparseL(updateL!(m02); full=true)) +Lsparse = LowerTriangular(sparseL(updateL!(setθ!(m02, ones(2))); full=true)) ``` -In practice, the full $\mathbf{L}$ matrix is stored in a blocked form +In practice, the $\mathbf{L}$ matrix is stored in a blocked form ```{julia} #| label: m02_blocks @@ -460,186 +400,81 @@ objective(m02) #### Evaluating terms in the gradient -For illustration of the gradient evaluation we create the lower-triangular sparse submatrix $\mathbf{L_{ZZ}}$ as +Again, we create the full Gram matrix $\mathbf{A}$ from the blocks stored in the `A` property of the model ```{julia} -#| label: m02LZZ -LZZsparse = LowerTriangular(sparseL(m02)) -``` - -from which $\log\left|\mathbf{L_{ZZ}}\right|^2$ can be evaluated as - -```{julia} -#| label: logdet_m02 -2. * sum(log, diag(LZZsparse)) -``` - -In practice we use the `logdet` function - -```{julia} -#| label: logdet__m02 -logdet(m02) -``` - -which evaluates this quantity from the blocked representation of $\mathbf{L}$. 
- -A finite-difference approximation to the gradient of the `logdet` at this value of $\boldsymbol{\theta}$ is - -```{julia} -ldfun(x::Vector{Float64}) = logdet(updateL!(setθ!(m02, x))) -FiniteDiff.finite_difference_gradient(ldfun, [1., 1.]) -``` - -The matrix $\mathbf{A_{ZZ}}=\mathbf{Z}^\top\mathbf{Z}$ for this model, as a dense matrix, is - -```{julia} -#| label: denseA -A = Int.(hvcat(2, first(m02.A), m02.A[2]', m02.A[2], m02.A[3])) -``` - -and the first face of $\nabla{\boldsymbol{\Lambda}}$ is - -```{julia} -#| label: nabla_Lambda -nabla1 = Int.(Diagonal(vcat(ones(Int, 24), zeros(Int, 6)))) -``` - -With $\boldsymbol{\Lambda(\theta)}$ being - -```{julia} -Λ(θ) = Diagonal(vcat(fill(first(θ), 24), fill(last(θ), 6))) -θ = ones(2) # initial parameter vector -Int.(Λ(θ)) # initial value of Λ -``` - -the first face of $\left(\nabla\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda}+ -\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\nabla\boldsymbol{\Lambda}\right)$ is - -```{julia} -#| label: symprod -symprod = nabla1 * A * Λ(θ) + Λ(θ) * A * nabla1 -Int.(symprod) -``` - -producing the matrix whose trace is desired as - -```{julia} -rdiv!(ldiv!(LZZsparse, symprod), LZZsparse') # overwrites the value of symprod -``` - -yielding the trace as - -```{julia} -sum(diag(symprod)) -``` - -One point to notice here is that the $[1,1]$ block of this matrix is diagonal, with elements of - -```{julia} -((2 * first(θ)) .* first(m02.A).diag) ./ abs2.(first(m02.L).diag) -``` - -which can be used to simplify the evaluation of the first gradient term. -In particular, the gradient of a model with a single, scalar random-effects term is, unsurprisingly, straightforward. - -For the second element of the gradient we define - -```{julia} -nabla2 = Diagonal(vcat(zeros(24), ones(6))) -Int.(nabla2) -``` - -and - -```{julia} -symprod = nabla2 * A * Λ(ones(2)) + Λ(ones(2)) * A * nabla2 -``` - -The matrix whose trace is required is - -```{julia} -rdiv!(ldiv!(LZZsparse, symprod), LZZsparse') -``` - -producing the second element of the gradient of `ldfun` as - -```{julia} -sum(diag(symprod)) +A2 = let blk = m02.A + hvcat(3, first(blk), blk[2]', blk[4]', blk[2], blk[3], blk[5]', blk[4], blk[5], blk[6]) +end ``` -Notice that the entire $[1,1]$ block of this matrix is zero and will not need to be evaluated explicitly. - -We evaluate $\boldsymbol{\hat{\theta}}$ using a derivative-free optimizer as +The $32\times32$ form of $\boldsymbol{\Lambda}(\boldsymbol(\theta))$ is ```{julia} -θ = refit!(m02).θ +function Λ2(θ::Vector{Float64}) + length(θ) == 2 || throw(DimensionMismatch("length(θ) should be 2")) + return Diagonal(vcat(fill(first(θ), 24), fill(last(θ), 6), ones(2))) +end +Λ2([1.,1.]) ``` -after which the first face of `symprod` becomes +producing $\boldsymbol{\Omega}$ as ```{julia} -symprod = nabla1 * A * Λ(θ) + Λ(θ) * A * nabla1 +function Ω2(θ) + length(θ) == 2 || throw(DimensionMismatch("length(θ) should be 2")) + return Λ2(θ) * A2 * Λ2(θ)' + Diagonal(vcat(ones(30), zeros(2))) +end +Ω2([1.,1.]) ``` -`LZZsparse` becomes +The partial derivatives of `Ω2` are constants ```{julia} -LZZsparse = LowerTriangular(sparseL(m02)) +#| output: false +dΛ2dθ1 = Diagonal(vcat(ones(24), zeros(8))) +dΛ2dθ2 = Diagonal(vcat(zeros(24), ones(6), zeros(2))) ``` -and the matrix whose trace is required is +For the ML objective (i.e. 
negative twice the log-likelihood) a finite difference gradient at the initial $\boldsymbol{\theta}=[1,1]^\top$ is ```{julia} -rdiv!(ldiv!(LZZsparse, symprod), LZZsparse') +FiniteDiff.finite_difference_gradient(objective!(m02), ones(2)) ``` -yielding the gradient term +To evaluate this quantity from the formula we create ```{julia} -sum(diag(symprod)) +dΩ2dθ1(θ) = dΛ2dθ1 * A2 * Λ2(θ)' + Λ2(θ) * A2 * dΛ2dθ1' +Int.(dΩ2dθ1([1., 1.])) # Int to save space when printing ``` -which can be compared to the finite difference value ```{julia} -FiniteDiff.finite_difference_gradient(ldfun, θ) +prePhi = rdiv!(ldiv!(Lsparse, dΩ2dθ1([1.,1.])), Lsparse') ``` -(Note, this is the first element of the gradient of the `logdet` term only, not the gradient of the objective which is near zero +yielding the first component of the gradient as ```{julia} -FiniteDiff.finite_difference_gradient(objective!(m02), θ) +dot(diag(prePhi), vcat(ones(30), 0., length(m02.y))) ``` -as it should be at the optimum.) - -For the second element of the gradient of `ldfun` we have +For the second component of the gradient ```{julia} -symprod = nabla2 * A * Λ(θ) + Λ(θ) * A * nabla2 +dΩ2dθ2(θ) = dΛ2dθ2 * A2 * Λ2(θ)' + Λ2(θ) * A2 * dΛ2dθ2' +Int.(dΩ2dθ2([1., 1.])) ``` -After pre- and post-division by `LZZsparse`, this becomes - ```{julia} -rdiv!(ldiv!(LZZsparse, symprod), LZZsparse') +prePhi = rdiv!(ldiv!(Lsparse, dΩ2dθ2([1.,1.])), Lsparse') ``` -yielding the second element of the gradient of `ldfun` as - ```{julia} -sum(diag(symprod)) +dot(diag(prePhi), vcat(ones(30), 0., length(m02.y))) ``` -#### Factoring the symmetric matrix - -The matrix $\left(\nabla\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda}+ -\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\nabla\boldsymbol{\Lambda}\right)$ is symmetric and has the same sparsity structure as $\mathbf{Z^\top Z}$, which is positive semi-definite. -However, it is not clear that the non-zero blocks in $\left(\nabla\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda}+ -\boldsymbol{\Lambda}^\top\mathbf{Z^\top Z}\nabla\boldsymbol{\Lambda}\right)$ will be positive semi-definite in the general case. -In the case of a single variance component it will be positive definite when $\theta_1>0$ because it is $2\theta_1\mathbf{A}$. 
- - ### References {.unnumbered} ::: {#refs} diff --git a/gradients/bibliography.bib b/gradients/bibliography.bib index 25109d8e4..decf33365 100644 --- a/gradients/bibliography.bib +++ b/gradients/bibliography.bib @@ -31,12 +31,14 @@ @article{bates2025mixed title={Mixed-model Log-likelihood Evaluation Via a Blocked Cholesky Factorization}, author={Bates, Douglas and Alday, Phillip M and Kokandakar, Ajinkya H}, journal={arXiv preprint arXiv:2505.11674}, - year={2025} + year={2025}, + url={https://arxiv.org/pdf/2505.11674} } @article{murray2016differentiation, title={Differentiation of the Cholesky decomposition}, author={Murray, Iain}, journal={arXiv preprint arXiv:1602.07527}, - year={2016} + year={2016}, + url={https://arxiv.org/pdf/1602.07527} } From 2e8ac3787d9ba2beff9de3880ca4e66c37b75ee0 Mon Sep 17 00:00:00 2001 From: Douglas Bates Date: Fri, 9 Jan 2026 11:53:18 -0600 Subject: [PATCH 14/23] Partial gradient for vector-valued r.e.'s --- gradients/GradientEvaluation.qmd | 125 +++++++++++++++++++++++++++---- gradients/gradient.jl | 39 ++++++++++ 2 files changed, 148 insertions(+), 16 deletions(-) create mode 100644 gradients/gradient.jl diff --git a/gradients/GradientEvaluation.qmd b/gradients/GradientEvaluation.qmd index ac652277d..174c786c4 100644 --- a/gradients/GradientEvaluation.qmd +++ b/gradients/GradientEvaluation.qmd @@ -45,7 +45,7 @@ Expressions for the gradient of the profiled log-likelihood were given in sec. 3 The purpose of this note is to provide an alternative derivation for the gradient of the profiled log-likelihood and of the REML criterion for linear mixed-effects models, along with concise algorithms for evaluation of these gradients. -### Model definition and evaluation of the objective and gradient +### Model definition and evaluation of the objective The linear mixed-effects models we consider are defined by the unconditional distribution of the $q$-dimensional random-effects vector, $\mathbfcal{B}$, and the conditional distribution of the $n$-dimensional response vector, $\mathbfcal{Y}$, given $\mathbfcal{B}=\mathbf{b}$, as @@ -58,31 +58,27 @@ $$ $$ {#eq-dists} where $\mathbf{X}$ is an $n\times p$ model matrix for the fixed-effects parameter vector, $\boldsymbol{\beta}$, and $\mathbf{Z}$ is an $n\times q$ model matrix for the random effects, $\mathbf{b}$. -Furthermore, $\boldsymbol{\Sigma}$, the covariance of $\mathbfcal{B}$, is positive semi-definite. +Furthermore, $\boldsymbol{\Sigma}$, the covariance of $\mathbfcal{B}$, is symmetric and positive semi-definite. We express it as $$ -\boldsymbol{\Sigma} = \sigma^2 -\boldsymbol{\Lambda_{\theta}}\boldsymbol{\Lambda^\top_{\theta}} +\boldsymbol{\Sigma} = \sigma^2\boldsymbol{\Lambda_{\theta}}\boldsymbol{\Lambda^\top_{\theta}} $$ {#eq-Sigma} for a lower-triangular *relative covariance factor*, $\boldsymbol{\Lambda_\theta}$, that depends on a *relative covariance parameter vector*, $\boldsymbol{\theta}$. 
 In `MixedModels.jl` the profiled log-likelihood, a function of $\boldsymbol{\theta}$ only, is evaluated from the blocked lower-triangular Cholesky factor, $\mathbf{L}_\theta$, defined from the relationship
+
 $$
 \begin{aligned}
 \boldsymbol{\Omega_\theta}&=
 \begin{bmatrix}
 \boldsymbol{\Lambda_\theta}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda_\theta}+\mathbf{I}&
-\boldsymbol{\Lambda_\theta}^\top\mathbf{Z^\top X} &
+\boldsymbol{\Lambda_\theta}^\top\mathbf{Z^\top X}&
 \boldsymbol{\Lambda_\theta}^\top\mathbf{Z^\top y}\\
-\mathbf{X^\top Z}\boldsymbol{\Lambda_\theta} &
-\mathbf{X^\top X} &
-\mathbf{X^\top y}\\
-\mathbf{y^\top Z}\boldsymbol{\Lambda_\theta} &
-\mathbf{y^\top X} &
-\mathbf{y^\top y}\\
+\mathbf{X^\top Z}\boldsymbol{\Lambda_\theta} & \mathbf{X^\top X} & \mathbf{X^\top y}\\
+\mathbf{y^\top Z}\boldsymbol{\Lambda_\theta} & \mathbf{y^\top X} & \mathbf{y^\top y}\\
 \end{bmatrix}\\
 &=\mathbf{L}_\boldsymbol{\theta} \mathbf{L}^\top_\boldsymbol{\theta}\\
 &=
@@ -100,48 +96,43 @@ $$
 \end{aligned}
 $$ {#eq-blockedOmega}
 
 where the diagonal elements of $\mathbf{L}_\theta$ are chosen to be positive.
-(It is assumed that $\mathbf{X}$ has full column rank and that $\mathbf{y}$ is not in the column span of $\mathbf{X}$.)
+(We assume that $\mathbf{X}$ has full column rank and that $\mathbf{y}$ is not in the column span of $\mathbf{X}$.)
 
-(In `MixedModels.jl` the blocked Cholesky factor has a slightly different pattern of blocks in which the "X rows" and the "y row" are amalgamated into dense blocks and the column associated with $\mathbf{Z}$ is split into one or more columns according to the grouping factors determining the random effects, as shown in the examples below.)
-
 As shown in @bates2025mixed, the objective to be optimized, on the scale of the deviance, which is negative twice the profiled log-likelihood, can be expressed as
@@ -131,7 +125,49 @@ $$
 $$ {#eq-remlobjective}
 
 where $c_r$ is likewise a constant.
-This is also an affine function of the logarithms of the diagonal elements of $\mathbf{L_{ZZ}}$.
+This is also an affine function of the logarithms of the diagonal elements of $\mathbf{L_\boldsymbol{\theta}}$.
+
+### Reformulation for evaluation of derivatives
+
+When differentiating the ML or REML objective with respect to elements of $\boldsymbol{\theta}$, it is convenient to amalgamate the blocks of $\boldsymbol{\Omega_\theta}$ derived from $\mathbf{X}$ and $\mathbf{y}$ and to re-express @eq-blockedOmega as
+
+$$
+\begin{aligned}
+\boldsymbol{\Omega_\theta}&=
+\begin{bmatrix}
+\boldsymbol{\Lambda_\theta}^\top\mathbf{Z^\top Z}\boldsymbol{\Lambda_\theta}&
+\boldsymbol{\Lambda_\theta}^\top\mathbf{Z^\top[Xy]}\\
+\mathbf{[Xy]^\top Z}\boldsymbol{\Lambda_\theta} &
+\mathbf{[Xy]^\top[Xy]}
+\end{bmatrix}
++
+\begin{bmatrix}
+\mathbf{I} & \mathbf{0}\\
+\mathbf{0} & \mathbf{0}
+\end{bmatrix}\\
+&=\mathbf{L}_\boldsymbol{\theta} \mathbf{L}^\top_\boldsymbol{\theta}\\
+&=
+\begin{bmatrix}
+\mathbf{L_{Z,Z}} & \mathbf{0}\\
+\mathbf{L_{Xy,Z}} & \mathbf{L_{Xy,Xy}}
+\end{bmatrix}
+\begin{bmatrix}
+\mathbf{L_{Z,Z}} & \mathbf{0} \\
+\mathbf{L_{Xy,Z}} & \mathbf{L_{Xy,Xy}}
+\end{bmatrix}^\top
+\end{aligned}
+$$ {#eq-blockedOmega_mod}
+
+where $\mathbf{[Xy]}$ represents the $n\times(p+1)$ matrix that is the horizontal concatenation of $\mathbf{X}$ and $\mathbf{y}$.
+The matrices $\mathbf{A_{11}}=\mathbf{Z^\top Z}$, $\mathbf{A_{21}}=\mathbf{[Xy]^\top Z}$ and $\mathbf{A_{22}}=\mathbf{[Xy]^\top[Xy]}$, assembled as + +$$ +\mathbf{A}=\begin{bmatrix} +\mathbf{A_{11}} & \mathbf{A_{21}^\top}\\ +\mathbf{A_{21}} & \mathbf{A_{22}} +\end{bmatrix} , +$$ {#eq-Amat} + +are precomputed and stored as the `A` property in a `LinearMixedModel` object when random effects are associated with a single grouping factor. ### General expressions for differentiating a Cholesky factor {#sec-Cholesky_derivative} @@ -178,8 +214,10 @@ Load the packages to be used #| label: load_packages #| warning: false #| output: false +using BenchmarkTools using CairoMakie using FiniteDiff +using ForwardDiff using LinearAlgebra using MixedModels using MixedModelsDatasets: dataset @@ -408,7 +446,7 @@ A2 = let blk = m02.A end ``` -The $32\times32$ form of $\boldsymbol{\Lambda}(\boldsymbol(\theta))$ is +The $32\times32$ form of $\boldsymbol{\Lambda}(\boldsymbol\theta)$ is ```{julia} function Λ2(θ::Vector{Float64}) @@ -475,6 +513,61 @@ prePhi = rdiv!(ldiv!(Lsparse, dΩ2dθ2([1.,1.])), Lsparse') dot(diag(prePhi), vcat(ones(30), 0., length(m02.y))) ``` +### Sleepstudy - a single vector-valued random-effects term + +```{julia} +m03 = fit(MixedModel, @formula(reaction ~ 1 + days + (1+days|subj)), dataset(:sleepstudy); progress) +θ03 = m03.θ +print(m03) +``` + +For this model we create a function to evaluate the gradient components. +First, the gradient of the logarithm of the square of the determinant of $\mathbf{L_{1,1}}$ + +```{julia} +ldfun(θ::Vector{Float64}) = logdet(updateL!(setθ!(m03, θ))) +ldfun(θ03) +``` + +A finite-difference approximation to the gradient is + +```{julia} +FiniteDiff.finite_difference_gradient(ldfun, θ03) +``` + +with the corresponding analytic value + +```{julia} +include("gradient.jl") +grad_comp(updateL!(setθ!(m03, θ03))) # need to reset θ after the finite_difference operation +``` + + +#### Speed of evaluation + +First, the evaluation of the log-determinant of $\mathbf{L_{1,1}}$ + +```{julia} +@benchmark logdet(updateL!(setθ!($m03, $θ03))) seconds=1 +``` + +Then the subsequent evaluation of the gradient of the log-determinant + +```{julia} +@benchmark grad_comp($m03) seconds=1 +``` + +For comparison, the finite-difference evaluation +```{julia} +@benchmark FiniteDiff.finite_difference_gradient($ldfun, $θ03) seconds=1 +``` + +and the forward-difference version + +```{julia} +@benchmark ForwardDiff.gradient($m03, $θ03) seconds=1 +``` + ### References {.unnumbered} ::: {#refs} diff --git a/gradients/gradient.jl b/gradients/gradient.jl new file mode 100644 index 000000000..8b0a8ff79 --- /dev/null +++ b/gradients/gradient.jl @@ -0,0 +1,39 @@ +using LinearAlgebra +using MixedModels +using MixedModelsDatasets: dataset + +""" + grad_comp(m::LinearMixedModel) + +Returns the gradient of the log-determinant part of the objective for `m`, +which must have a single, vector-valued random-effects term. 
+""" +function grad_comp(m::LinearMixedModel{T}) where {T} + (; reterms, parmap, A, L) = m + A11 = first(A).data + L11 = first(L).data + λ = only(reterms).λ # checks that there is exactly one random-effects term + λdot = similar(λ) + face = similar(λ.data) + grad = zeros(T, length(parmap)) + for (p, pm) in enumerate(parmap) + fill!(λdot, zero(T)) + λdot[pm[2], pm[3]] = one(T) + for k in axes(A11, 3) # loop over faces of A[1].data + rmul!(lmul!(λ', copyto!(face, view(A11, :, :, k))), λdot) + for i in axes(face, 1) # symmetrize the face and double the diagonal + for j in 1:(i - 1) + ijsum = face[i, j] + face[j, i] + face[j, i] = face[i, j] = ijsum + end + face[i, i] *= 2 + end + Lface = LowerTriangular(view(L11, :, :, k)) + rdiv!(ldiv!(Lface, face), Lface') + for i in diagind(face) + grad[p] += face[i] + end + end + end + return grad +end \ No newline at end of file From cfba091042cc8f3f8ec4dd4af3e2e3b5883d46a1 Mon Sep 17 00:00:00 2001 From: Douglas Bates Date: Tue, 13 Jan 2026 19:04:52 -0600 Subject: [PATCH 15/23] Expand docs, start src/gradient.jl --- gradients/GradientEvaluation.qmd | 41 ++++++-- gradients/Gradient_based_optimization.qmd | 123 ++++++++++++++++++---- src/MixedModels.jl | 1 + src/gradient.jl | 56 ++++++++++ test/grad.jl | 73 +++++++++++++ test/runtests.jl | 1 + 6 files changed, 269 insertions(+), 26 deletions(-) create mode 100644 src/gradient.jl create mode 100644 test/grad.jl diff --git a/gradients/GradientEvaluation.qmd b/gradients/GradientEvaluation.qmd index 174c786c4..9c11192ff 100644 --- a/gradients/GradientEvaluation.qmd +++ b/gradients/GradientEvaluation.qmd @@ -220,6 +220,7 @@ using FiniteDiff using ForwardDiff using LinearAlgebra using MixedModels +using MixedModels: Omega_dot_diag_block! using MixedModelsDatasets: dataset using TypedTables: Table @@ -251,11 +252,13 @@ print(m01) ```{julia} #| label: fig-obj_graph -#| fig-cap: "Graph of the objective for model m01 as a function of θ₁" +#| fig-cap: "Graph of the objective for model m01 as a function of θ₁. The light blue horizontal line is at the minimum of the objective. The vertical line is at the parameter estimate." #| code-fold: true let f = Figure() ax = Axis(f[1,1], xlabel="θ₁", ylabel="objective") lines!(ax, -0.25..1.5, objective!(m01)) + hlines!(objective(updateL!(setθ!(m01, θ))); alpha=0.4) + vlines!(only(θ); alpha=0.4) f end ``` @@ -264,7 +267,7 @@ Notice that the objective is well-defined for negative values of $\theta_1$ and This means that $\theta_1=0$ will always be a critical value (have a derivative of zero) for this function. -At the maximum likelihood estimate (i.e. the minimizer of the objective) of $\theta_1$ +At the maximum likelihood estimate (i.e. the minimizer of the objective) of $\theta_1$, which is the ratio of the standard deviation of the random effects to the residual standard deviation, is ```{julia} #| label: dyestuff_theta @@ -352,12 +355,16 @@ dΩdθ1([1.]) ``` Notice that this matrix, like $\mathbf{A}$, is symmetric and has the same block structure as $\mathbf{A}$. -In fact, the $[2,1]$ block of this matrix is the same as the $[2,1]$ block of $\mathbf{A}$. +In fact, the $[2,1]$ block of this matrix is exactly the same as the $[2,1]$ block of $\mathbf{A}$. 
-Premultiplying by $\mathbf{L}^{-1}$ and postmultiplying by $\mathbf{L}^{-\top}$ is equivalent to +We do the premultiplying by $\mathbf{L}^{-1}$ and postmultiplying by $\mathbf{L}^{-\top}$ in two stages, to check the form of the intermediate result ```{julia} -prePhi = rdiv!(ldiv!(L, dΩdθ1([1.])), L') +prePhi = ldiv!(L, dΩdθ1([1.])) +``` + +```{julia} +rdiv!(prePhi, L') ``` The derivative of the objective at $\theta_1=1$ is, therefore @@ -386,6 +393,25 @@ with the derivative being evaluated as dot(vcat(ones(6), 0., size(dyestuff, 1)), diag(prePhi)) ``` +#### Using blocked factors + +It would not be practical to create the `sparseL` matrix and a dense copy of $\mathbf{A}$, as is done here, for general cases. +The whole point of using a blocked Cholesky factor for evaluating the objective is that the $[1,1]$ block is either diagonal or uniform-block-diagonal, which can result in considerable savings of object size and execution time when evaluating the objective. + +We would want to retain this time and memory savings when evaluating the gradient by creating a blocked computation for evaluation of the gradient. + +The blocked representations of both the $\mathbf{A}$ and $\mathbf{L}$ arrays store only the lower triangle. +It appears that it will be necessary to store the upper triangle in addition to the lower triangle to be able to evaluate the gradient. +There may be a clever way around this but right now I can't see one. + +To create the blocked representation of $\Omega$ we begin with the diagonal block in which the parameter component to be differentiated occurs. +It is assumed that the block has already been allocated. + +```{julia} +dΩdθ1_blk = Omega_dot_diag_block!(similar(first(m01.A)), m01, 1) +``` + + ### Penicillin - two completely crossed scalar random-effects terms The `penicillin` dataset in `MixedModelsDatasets.jl` contains 144 measurements of the `diameter` of the cleared area for each of six `sample`s of penicillin on each of 24 `plate`s. 
@@ -444,6 +470,7 @@ Again, we create the full Gram matrix $\mathbf{A}$ from the blocks stored in the A2 = let blk = m02.A hvcat(3, first(blk), blk[2]', blk[4]', blk[2], blk[3], blk[5]', blk[4], blk[5], blk[6]) end +Int.(A2) ``` The $32\times32$ form of $\boldsymbol{\Lambda}(\boldsymbol\theta)$ is @@ -453,7 +480,7 @@ function Λ2(θ::Vector{Float64}) length(θ) == 2 || throw(DimensionMismatch("length(θ) should be 2")) return Diagonal(vcat(fill(first(θ), 24), fill(last(θ), 6), ones(2))) end -Λ2([1.,1.]) +Int.(Λ2([1.,1.])) ``` producing $\boldsymbol{\Omega}$ as @@ -463,7 +490,7 @@ function Ω2(θ) length(θ) == 2 || throw(DimensionMismatch("length(θ) should be 2")) return Λ2(θ) * A2 * Λ2(θ)' + Diagonal(vcat(ones(30), zeros(2))) end -Ω2([1.,1.]) +Int.(Ω2([1.,1.])) ``` The partial derivatives of `Ω2` are constants diff --git a/gradients/Gradient_based_optimization.qmd b/gradients/Gradient_based_optimization.qmd index d3167307f..96ee4eb08 100644 --- a/gradients/Gradient_based_optimization.qmd +++ b/gradients/Gradient_based_optimization.qmd @@ -48,6 +48,8 @@ Load the packages to be used ```{julia} #| label: load_packages +#| output: false +using BenchmarkTools using ForwardDiff using MixedModels using MixedModels: fd_deviance @@ -56,7 +58,7 @@ using NLopt using Tables: table using TypedTables: Table -const progress = false +const progress = isinteractive() ``` ## Examples {#sec-examples} @@ -64,6 +66,7 @@ const progress = false We create a function to take a `LinearMixedModel` that has been fit and refit it using the `:LD_LBFGS` optimizer applied to an objective function that evaluates the gradient using `ForwardDiff`. ```{julia} +#| output: false addinds(ch::Char, n::Integer) = Symbol.(lpad.(string.(ch, Base.OneTo(n)), ndigits(n), '0')) function gr_refit!(m::LinearMixedModel{T}) where {T} θ = copy(m.optsum.initial) @@ -83,8 +86,9 @@ function gr_refit!(m::LinearMixedModel{T}) where {T} return val end opt = NLopt.Opt(:LD_LBFGS, k) - NLopt.ftol_rel!(opt, 1.e-12) + NLopt.ftol_rel!(opt, 1.e-10) NLopt.ftol_abs!(opt, 1.e-8) + NLopt.initial_step!(opt, fill(0.5, k)) NLopt.min_objective!(opt, obj) min_f, min_x, ret = NLopt.optimize(opt, θ) header = vcat([:obj], addinds('θ', k), addinds('g', k)) @@ -98,31 +102,32 @@ Define a model for the `penicillin` data ```{julia} #| label: m1 -m1 = fit( +pnm01 = fit( MixedModel, @formula(diameter ~ 1 + (1|plate) + (1|sample)), dataset(:penicillin); progress, ) -print(m1) +pnm01_obj = objective(pnm01) +print(pnm01) ``` for which the optimization summary is ```{julia} -m1.optsum +pnm01.optsum ``` and refit the model using ForwardDiff gradient evaluations. 
```{julia} -fitlog = gr_refit!(m1) +fitlog = gr_refit!(pnm01) ``` -The objective at convergence is +The objective at convergence, compared to the derivative-free optimum ```{julia} -last(fitlog.obj) +pnm01_obj - last(fitlog.obj) ``` and the last few evaluations are @@ -131,23 +136,89 @@ and the last few evaluations are last(fitlog, 5) ``` +### Pastes {#sec-pastes} + +```{julia} +psm01 = fit(MixedModel, @formula(strength ~ 1 + (1|batch/cask)), dataset(:pastes); progress) +psm01_obj = objective(psm01) +print(psm01) +``` + +```{julia} +psm01.optsum +``` + +```{julia} +fitlog = gr_refit!(psm01) +``` + +```{julia} +psm01_obj - last(fitlog.obj) +``` + +```{julia} +last(fitlog, 5) +``` + +### Insteval {#sec-insteval} + +```{julia} +insteval = dataset(:insteval) +contrasts = Dict(:service => EffectsCoding()) +iem01 = fit( + MixedModel, + @formula(y ~ 1 + service + (1|s) + (1|d) + (1|dept)), + insteval; + progress, contrasts, +) +iem01_obj = objective(iem01) +print(iem01) +``` + +```{julia} +iem01.optsum +``` + +```{julia} +fitlog = gr_refit!(iem01) +``` + +```{julia} +iem01_obj - last(fitlog.obj) +``` + +```{julia} +last(fitlog, 5) +``` + +This is an example where the number of evaluations to convergence is lower when using the gradient but the time to fit the model is much greater - primarily because the ForwardDiff gradient allocates so much memory. + +```{julia} +@benchmark refit!($iem01; progress=false) seconds=10 +``` + +```{julia} +@benchmark gr_refit!($iem01) seconds=30 +``` + ### Sleepstudy {#sec-sleepstudy} ```{julia} -m2 = fit(MixedModel, @formula(reaction ~ 1 + days + (1 + days|subj)), dataset(:sleepstudy); progress) -print(m2) +slm01 = fit(MixedModel, @formula(reaction ~ 1 + days + (1 + days|subj)), dataset(:sleepstudy); progress) +slm01_obj = objective(slm01) +print(slm01) ``` ```{julia} -m2.optsum +slm01.optsum ``` ```{julia} -fitlog = gr_refit!(m2) +fitlog = gr_refit!(slm01) ``` ```{julia} -last(fitlog.obj) +slm01_obj - last(fitlog.obj) ``` ```{julia} @@ -158,33 +229,47 @@ last(fitlog, 5) ```{julia} # this model is very overparameterized, but it's a test example -m3 = fit( +kbm01 = fit( MixedModel, @formula(rt_trunc ~ 1 + spkr * prec * load + (1 + spkr * prec * load | subj) + (1 + spkr * prec * load | item)), dataset(:kb07); progress, ) -print(m3) +print(kbm01) ``` ```{julia} -m3.optsum +kbm01.optsum ``` Several of the parameters on the diagonal of $\boldsymbol{\Lambda}$ are close to zero at convergence and are replaced by zero in the returned parameter vector ```{julia} -findall(iszero, m3.θ) +findall(iszero, kbm01.θ) ``` +Refitting with the gradient takes a very long time because ForwardDiff is poorly suited to optimization problems with many parameters. ```{julia} -fitlog = gr_refit!(m3) +#| eval: false +fitlog = gr_refit!(kbm01) ``` ```{julia} +#| eval: false last(fitlog.obj) ``` ```{julia} +#| eval: false last(fitlog, 5) -``` \ No newline at end of file +``` + +## Conclusions {#sec-conclusions} + +Generally the gradient-based optimizers converge in fewer evaluations than the derivative-free optimizers (`psm01` in @sec-pastes is an exception). +Although the `ftol_rel` criterion is looser for the gradient-based optimizer it usually achieves a lower optimum value, as shown by the differences like `pnm01_obj - last(fitlog.obj)` being positive. + +I think the most interesting result is for the `insteval` data where the three-parameter optimization takes 81 function evaluations for `LN_NEWUOA` but 34 evaluations for `LD_LBFGS`. 
+However, the ForwardDiff gradient evaluation takes much longer because it allocates so much memory (and it may be using a non-BLAS Cholesky factorization of $1128\times1128$ symmetric matrix). + +I think this is a case where an analytic gradient could be useful. diff --git a/src/MixedModels.jl b/src/MixedModels.jl index 61d184c84..f3400bc98 100644 --- a/src/MixedModels.jl +++ b/src/MixedModels.jl @@ -214,6 +214,7 @@ include("blockdescription.jl") include("grouping.jl") include("mimeshow.jl") include("serialization.jl") +include("gradient.jl") include("profile/profile.jl") include("MixedModelsNLoptExt.jl") using .MixedModelsNLoptExt diff --git a/src/gradient.jl b/src/gradient.jl new file mode 100644 index 000000000..b6c68bdbb --- /dev/null +++ b/src/gradient.jl @@ -0,0 +1,56 @@ +# Evaluate analytic gradient of the objective for ML or REML fitting of a LinearMixedModel + +""" + Omega_dot_diag_block!(blk, m::LinearMixedModel, p::Integer) + +Fill `blk` as the non-zero diagonal block of ∂Ω/∂θₚ for parameter number `p` of model `m`. + +For any `p` only one diagonal block of ∂Ω/∂θₚ will be non-zero. +""" +function Omega_dot_diag_block!( + blk::Diagonal{T, Vector{T}}, + m::LinearMixedModel{T}, + p::Integer, +) where {T} + (; parmap, A, reterms) = m + b, i, j = parmap[p] + isone(i) && isone(j) || throw(ArgumentError("parameter $b should be from a scalar r.e. term")) + blk_diag = blk.diag + A_diag = A[kp1choose2(b)].diag # will throw an error if the A[b,b] block is not Diagonal + length(blk_diag) == length(A_diag) || throw(DimensionMismatch("A_diag and blk_diag have different lengths")) + λ = only(reterms[b].λ) # will throw an error if reterms[b] is not of size (1,1) + for k in eachindex(blk_diag, A_diag) + blk_diag[k] = T(2) * λ * A_diag[k] + end + return blk +end + +function Omega_dot_diag_block!( + blk::UniformBlockDiagonal{T}, + m::LinearMixedModel{T}, + p::Integer, +) where {T} + (; parmap, A, reterms) = m + b, i, j = parmap[p] + Ablk = A[kp1choose2(b)] + if !isa(Ablk, UniformBlockDiagonal{T}) + throw(ArgumentError("parmap[p] = $(parmap[p]) but A[$(kp1choose2(b))] is not UniformBlockDiagonal")) + end + blk_dat = fill!(blk.data, zero(T)) + Ablk_dat = Ablk.data + λ = reterms[b].λ + for k in axes(blk_dat, 3) + # right multiply by λ-dot-transpose, which is zeros except for a single 1 at the i'th column and j'th row + # thus we copy the i'th column of the k'th face of Ablk_dat into the j'th column of the k'th face of blk_dat + copyto!(view(blk_dat, :, i, k), view(Ablk_dat, :, j, k)) + lmul!(λ, view(blk_dat, :, :, k)) # left-multiply by λ + for jj in axes(λ, 2) # symmetrize the face while multiplying the diagonal by 2 + for ii in 1:(jj - 1) + val = blk_dat[ii, jj, k] + blk_dat[jj, ii, k] + blk_dat[ii, jj, k] = blk_dat[jj, ii, k] = val + end + blk_dat[jj, jj, k] *= T(2) + end + end + return blk +end diff --git a/test/grad.jl b/test/grad.jl new file mode 100644 index 000000000..1e036be7d --- /dev/null +++ b/test/grad.jl @@ -0,0 +1,73 @@ +# using FiniteDiff +using LinearAlgebra +using MixedModels +using Test +using MixedModels: Omega_dot_diag_block! + +using MixedModelsDatasets: dataset + +include("modelcache.jl") + +@testset "gradient" begin + @testset "single_scalar" begin + fm1 = only(models(:dyestuff2)) + θ = fm1.θ + blk = Omega_dot_diag_block!(similar(first(fm1.A)), fm1, 1) + @test all(≈(10. 
* only(θ)), blk.diag) + Omega_dot_diag_block!(blk, updateL!(setθ!(fm1, ones(1))), 1) + @test all(==(10.), blk.diag) + updateL!(setθ!(fm1, θ)) # restore the estimated parameter values + + fm2 = first(models(:pastes)) + θ = fm2.θ + blk = Omega_dot_diag_block!(similar(first(fm2.A)), fm2, 1) + @test all(≈(4. * only(θ)), blk.diag) + Omega_dot_diag_block!(blk, updateL!(setθ!(fm2, ones(1))), 1) + @test all(==(4.), blk.diag) + updateL!(setθ!(fm2, θ)) + + fm3 = last(models(:pastes)) # first of two nested, scalar r.e. terms + θ = fm3.θ + blk = Omega_dot_diag_block!(similar(first(fm3.A)), fm3, 1) + @test all(≈(4. * first(θ)), blk.diag) + + fm4 = only(models(:penicillin)) + blk = Omega_dot_diag_block!(similar(first(fm4.A)), fm4, 1) + @test all(≈(12. * first(fm4.θ)), blk.diag) + + fm5 = first(models(:sleepstudy)) + blk = Omega_dot_diag_block!(similar(first(fm5.A)), fm5, 1) + @test all(≈(20. * only(fm5.θ)), blk.diag) + end + @testset "single_vector" begin + fm6 = last(models(:sleepstudy)) + λ = only(fm6.reterms).λ + θ = fm6.θ + blk = Omega_dot_diag_block!(UniformBlockDiagonal(similar(first(fm6.L).data)), fm6, 1) + blk_dat = blk.data + A11_dat = first(fm6.A).data + @test all(≈(20. * first(θ)), view(blk_dat, 1, 1, :)) + @test all(iszero, view(blk_dat, 2, 2, :)) + @test all(view(blk_dat, 1, 2, :) .== view(blk_dat, 2, 1, :)) + odiag = dot(view(λ, 2, :), view(A11_dat, :, 1, 1)) + @test all(≈(odiag), view(blk_dat, 1, 2, :)) + + Omega_dot_diag_block!(blk, fm6, 2) + @test all(iszero, view(blk_dat, 1, 1, :)) + @test all(view(blk_dat, 1, 2, :) .== view(blk, 2, 1, :)) # result should be symmetric + @test all(==(10. * first(θ)), view(blk_dat, 1, 2, :)) + diag2 = 2. * dot(view(λ.data, 2, :), view(A11_dat, :, 1, 1)) + @test all(≈(diag2), view(blk_dat, 2, 2, :)) + + Omega_dot_diag_block!(blk, fm6, 3) + @test all(iszero, view(blk_dat, 1, 1, :)) + @test all(view(blk_dat, 1, 2, :) .== view(blk, 2, 1, :)) # faces of result should be symmetric + @test all(≈(45. * first(θ)), view(blk_dat, 1, 2, :)) + diag2 = 2. * dot(view(λ.data, 2, :), view(A11_dat, :, 2, 1)) + @test all(≈(diag2), view(blk_dat, 2, 2, :)) + +# FiniteDiff.finite_difference_gradient(objective!(fm6), θ) +# ldfun(m::LinearMixedModel, θ::Vector{Float64}) = logdet(updateL!(setθ!(m, θ))) + + end +end \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index 0631ca3a0..20cb6d4f5 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -47,6 +47,7 @@ include("mime.jl") include("optsummary.jl") include("predict.jl") include("sigma.jl") +include("grad.jl") @testset "PRIMA" include("prima.jl") @testset "ForwardDiff" include("forwarddiff.jl") From ad2cb99abe165d310bc408ed3ed6815c4b642a64 Mon Sep 17 00:00:00 2001 From: Douglas Bates Date: Fri, 23 Jan 2026 14:50:12 -0600 Subject: [PATCH 16/23] Initial, clunky version of blocked grad eval. --- src/gradient.jl | 260 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 257 insertions(+), 3 deletions(-) diff --git a/src/gradient.jl b/src/gradient.jl index b6c68bdbb..3c229d0ce 100644 --- a/src/gradient.jl +++ b/src/gradient.jl @@ -14,9 +14,10 @@ function Omega_dot_diag_block!( ) where {T} (; parmap, A, reterms) = m b, i, j = parmap[p] - isone(i) && isone(j) || throw(ArgumentError("parameter $b should be from a scalar r.e. term")) + isone(i) && isone(j) || throw(ArgumentError("parameter $p should be from a scalar r.e. 
term"))
+    # It is common for 'b' to be one as well but nested models can result in diagonal blk for b > 1
     blk_diag = blk.diag
-    A_diag = A[kp1choose2(b)].diag   # will throw an error if the A[b,b] block is not Diagonal
+    A_diag = A[kp1choose2(b)].diag  # will throw an error if the A[b,b] block is not Diagonal
     length(blk_diag) == length(A_diag) || throw(DimensionMismatch("A_diag and blk_diag have different lengths"))
     λ = only(reterms[b].λ)  # will throw an error if reterms[b] is not of size (1,1)
     for k in eachindex(blk_diag, A_diag)
@@ -26,9 +27,31 @@ function Omega_dot_diag_block!(
 end
 
 function Omega_dot_diag_block!(
-    blk::UniformBlockDiagonal{T},
+    blk::Matrix{T},
     m::LinearMixedModel{T},
     p::Integer,
+) where {T}
+    (; parmap, A, reterms) = m
+    b, i, j = parmap[p]
+    k = size(reterms[b].λ, 1)
+    if isone(k)
+        isone(i) && isone(j) || throw(ArgumentError("parameter $p should be from a scalar r.e. term"))
+        A_diag = A[kp1choose2(b)].diag  # will throw an error if the A[b,b] block is not Diagonal
+        size(blk, 1) == size(blk, 2) == length(A_diag) || throw(DimensionMismatch("A_diag and blk_diag have different lengths"))
+        λ = only(reterms[b].λ)  # will throw an error if reterms[b].λ is not of size (1,1)
+        _zero_out(blk)
+        for k in eachindex(A_diag)
+            blk[k, k] = T(2) * λ * A_diag[k]
+        end
+        return blk
+    end
+    throw(ArgumentError("Code not yet written for k > 1"))
+end
+
+function Omega_dot_diag_block!(
+    blk::UniformBlockDiagonal{T},
+    m::LinearMixedModel{T},
+    p::Integer
 ) where {T}
     (; parmap, A, reterms) = m
     b, i, j = parmap[p]
@@ -54,3 +77,234 @@ function Omega_dot_diag_block!(
     end
     return blk
 end
+
+function LinearAlgebra.ldiv!(
+    A::LowerTriangular{T,UniformBlockDiagonal{T}},
+    B::UniformBlockDiagonal{T},
+) where {T}
+    A_dat = A.data.data
+    B_dat = B.data
+    if size(A_dat) ≠ size(B_dat)
+        throw(DimensionMismatch("size(A_dat) = $(size(A_dat)) ≠ $(size(B_dat)) = size(B_dat)"))
+    end
+    for k in axes(B_dat, 3)
+        ldiv!(LowerTriangular(view(A_dat, :, :, k)), view(B_dat, :, :, k))
+    end
+    return B
+end
+
+function LinearAlgebra.ldiv!(
+    A::LowerTriangular{T,UniformBlockDiagonal{T}},
+    B::Matrix{T},
+) where {T}
+    if size(A, 2) ≠ size(B, 1)
+        throw(DimensionMismatch("size(A,2) = $(size(A,2)) ≠ $(size(B,1)) = size(B,1)"))
+    end
+    A_dat = A.data.data
+    axis1 = axes(A_dat, 1)
+    offset = 0
+    for k in axes(A_dat, 3)
+        ldiv!(LowerTriangular(view(A_dat, :, :, k)), view(B, offset .+ axis1, :))
+        offset += length(axis1)
+    end
+    return B
+end
+
+function LinearAlgebra.rdiv!(
+    A::UniformBlockDiagonal{T},
+    B::UpperTriangular{T, LinearAlgebra.Transpose{T, UniformBlockDiagonal{T}}},
+) where {T}
+    A_dat = A.data
+    B_dat = B.data.parent.data
+    if size(A_dat) ≠ size(B_dat)
+        throw(DimensionMismatch("size(A_dat) = $(size(A_dat)) ≠ $(size(B_dat)) = size(B_dat)"))
+    end
+    for k in axes(A_dat, 3)
+        rdiv!(view(A_dat, :, :, k), LowerTriangular(view(B_dat, :, :, k))')
+    end
+    return A
+end
+
+function Base.similar(A::UniformBlockDiagonal)
+    return UniformBlockDiagonal(similar(A.data))
+end
+
+"""
+    grad_blocks(m::LinearMixedModel{T})
+
+Return Matrix{AbstractMatrix{T}} containing the gradient-evaluation blocks for model `m`.
+"""
+function grad_blocks(m::LinearMixedModel{T}) where {T}
+    (; L, reterms) = m
+    k = length(reterms) + 1
+    val = sizehint!(AbstractMatrix{T}[], abs2(k))
+    for j in 1:k
+        for i in 1:k
+            push!(val, similar(i ≥ j ? 
L[block(i, j)] : L[block(j, i)]'))
+        end
+    end
+    return reshape(val, (k, k))
+end
+
+function _zero_out(A::Matrix{T}) where {T}
+    return fill!(A, zero(T))
+end
+
+function _zero_out(A::Diagonal{T,Vector{T}}) where {T}
+    fill!(A.diag, zero(T))
+    return A
+end
+
+function _zero_out(A::UniformBlockDiagonal{T}) where {T}
+    fill!(A.data, zero(T))
+    return A
+end
+
+"""
+    copyskip!(B::Matrix{T}, A::Matrix{T}, i::Integer, j::Integer, k::Integer) where {T}
+
+Create `A * Ω_dot` in `B` where `Ω_dot` is the indicator for the `i`'th row and `j`'th column in a matrix of size `k`
+"""
+function copyskip!(B::Matrix{T}, A::Matrix{T}, i::Integer, j::Integer, k::Integer) where {T}
+    m, n = size(A)
+    (m, n) == size(B) || throw(DimensionMismatch("size(A) = $(size(A)) ≠ $(size(B)) = size(B)"))
+    isone(k) && return copyto!(B, A)
+
+    fill!(B, zero(T))
+    q, r = divrem(n, k)
+    iszero(r) || throw(DimensionMismatch("n = $n is not a multiple of k = $k"))
+    offset = 0
+    for _ in 1:q
+        copyto!(view(B, :, offset + i), view(A, :, offset + j))
+        offset += k
+    end
+    return B
+end
+
+function copyskip!(B::SparseMatrixCSC{T}, A::SparseMatrixCSC{T}, i::Integer, j::Integer, k::Integer) where {T}
+    (A.m == B.m && A.n == B.n) || throw(DimensionMismatch("size(A) = $(size(A)) ≠ $(size(B)) = size(B)"))
+    if any(A.colptr .≠ B.colptr) || any(rowvals(A) .≠ rowvals(B))
+        throw(ArgumentError("A and B must have the same sparsity pattern"))
+    end
+    isone(k) && return copyto!(B, A)
+
+    fill!(B.nzval, zero(T))
+    q, r = divrem(A.n, k)
+    iszero(r) || throw(DimensionMismatch("A.n = $(A.n) is not a multiple of k = $k"))
+    rvB = rowvals(B)
+    rvA = rowvals(A)
+    nzB = nonzeros(B)
+    nzA = nonzeros(A)
+    offset = 0
+    for _ in 1:q
+        nzrB = nzrange(B, offset + i)
+        nzrA = nzrange(A, offset + j)
+        if !all(view(rvB, nzrB) .== view(rvA, nzrA))
+            throw(ArgumentError("A and B must have same sparsity pattern after shifting"))
+        end
+        copyto!(view(nzB, nzrB), view(nzA, nzrA))
+        offset += k
+    end
+    return B
+end
+
+"""
+    initialize_blocks!(blks::Matrix{AbstractMatrix{T}}, m::LinearMixedModel{T}, p::Integer)
+
+Initialize the grad evaluation blocks, `blks`, for model `m`, for parameter `p`
+"""
+function initialize_blocks!(
+    blks::Matrix{AbstractMatrix{T}},
+    m::LinearMixedModel{T},
+    p::Integer,
+) where {T}
+    (; parmap, A, reterms) = m
+    b, i, j = parmap[p]
+    k = size(reterms[b].λ, 1)
+    for ii in 1:(b - 1)
+        for jj in 1:(b - 1)
+            _zero_out(blks[ii, jj])
+        end
+    end
+    Omega_dot_diag_block!(blks[b, b], m, p)  # populate the b'th diagonal block
+    for c in (b + 1):size(blks, 2)  # zero out the blocks below and to the right of [b, b]
+        for r in (b + 1):size(blks, 1)
+            _zero_out(blks[r, c])
+        end
+    end
+    for c in 1:(b - 1)
+        copyskip!(blks[b, c], A[block(b, c)], i, j, k)
+        copyto!(blks[c, b], blks[b, c]')
+    end
+    for r in (b + 1):size(blks, 1)
+        copyskip!(blks[r, b], A[block(r, b)], i, j, k)
+        copyto!(blks[b, r], blks[r, b]')
+    end
+    return blks
+end
+
+"""
+    eval_grad_p!(blks, m, p)
+
+Evaluate the gradient component for parameter `p` in model `m` using blocks in `blks` for storage
+"""
+function eval_grad_p!(blks::Matrix{AbstractMatrix{T}}, m::LinearMixedModel{T}, p::Integer) where {T}
+    L = m.L
+#    b = first(parmap[p])  # block, row and column for parameter p
+    initialize_blocks!(blks, m, p)  # change this to pass b, i, j, k separately
+    for kk in axes(blks, 2)  # ldiv!(LowerTriangular(L), blks)
+#        if jj ≥ b  # maybe hold off on this at the expense of some multiplications by zero
+        L11 = L[block(1, 1)]
+        isa(L11, Diagonal) || (L11 = LowerTriangular(L11))
+        C1 = ldiv!(L11, 
blks[1, kk])
+        for ii in axes(blks, 1)[2:end]
+            mul!(blks[ii, kk], L[block(ii, 1)], C1, -one(T), one(T))
+        end
+#        end
+        for jj in axes(blks, 1)[2:end]
+            Cj = ldiv!(LowerTriangular(L[block(jj, jj)]), blks[jj, kk])
+            for ii in jj+1:lastindex(blks, 1)
+                mul!(blks[ii, kk], L[block(ii, jj)], Cj, -one(T), one(T))
+            end
+        end
+    end
+    # for k in axes(B,2)
+    #     a11 = A[1,1]
+    #     iszero(a11) && throw(SingularException(1))
+    #     C1 = C[1,k] = a11 \ B[1,k]
+    #     # fill C-column
+    #     for i in axes(B,1)[2:end]
+    #         C[i,k] = oA \ B[i,k] - _ustrip(A[i,1]) * C1
+    #     end
+    #     for j in axes(B,1)[2:end]
+    #         ajj = A[j,j]
+    #         iszero(ajj) && throw(SingularException(j))
+    #         Cj = C[j,k] = _ustrip(ajj) \ C[j,k]
+    #         for i in j+1:lastindex(A,1)
+    #             C[i,k] -= _ustrip(A[i,j]) * Cj
+    #         end
+    #     end
+    # end
+    for ii in axes(blks, 1)  # rdiv!(blks, transpose(LowerTriangular(L)))
+        for jj in axes(blks, 2)
+            blkij = blks[ii, jj]
+            for kk in 1:(jj - 1)
+                mul!(blkij, blks[ii, kk], transpose(L[block(jj, kk)]), -one(T), one(T))
+            end
+            rdiv!(blkij, transpose(LowerTriangular(L[block(jj, jj)])))
+        end
+    end
+    # for i in axes(A,1)
+    #     for j in axes(A,2)
+    #         Aij = A[i,j]
+    #         for k in firstindex(B,2):j - 1
+    #             Aij -= C[i,k]*tfun(B[j,k])
+    #         end
+    #         unit || (iszero(B[j,j]) && throw(SingularException(j)))
+    #         C[i,j] = Aij / (unit ? oB : tfun(B[j,j]))
+    #     end
+    # end
+    return blks
+end
+
+
From afc23c6b85490c5601043cf6ce600f53ce324bfa Mon Sep 17 00:00:00 2001
From: Douglas Bates
Date: Sat, 24 Jan 2026 12:19:46 -0600
Subject: [PATCH 17/23] Update document on gradient evaluation.

---
 gradients/GradientEvaluation.qmd | 261 +++++++++++++++----------------
 1 file changed, 130 insertions(+), 131 deletions(-)

diff --git a/gradients/GradientEvaluation.qmd b/gradients/GradientEvaluation.qmd
index 9c11192ff..61147ea38 100644
--- a/gradients/GradientEvaluation.qmd
+++ b/gradients/GradientEvaluation.qmd
@@ -220,7 +220,7 @@ using FiniteDiff
 using ForwardDiff
 using LinearAlgebra
 using MixedModels
-using MixedModels: Omega_dot_diag_block!
+using MixedModels: eval_grad_p!, grad_blocks, initialize_blocks!
 using MixedModelsDatasets: dataset
 using TypedTables: Table
 
@@ -296,121 +296,178 @@ Int.(ZXy) # Int for more concise printing
 
 is stored as the $\mathbf{A}$ property of the model object.
 
-Both $\mathbf{A}$ and $\mathbf{L}$ are stored as blocked matrices with blocks in the pattern
+Both $\mathbf{A}$ and the lower Cholesky factor $\mathbf{L}$ are stored as blocked matrices with blocks in the pattern
 
 ```{julia}
 BlockDescription(m01)
 ```
 
-In @eq-Sigma the matrix $\boldsymbol{\Lambda_\theta}$ is defined as a $6\times6$ diagonal matrix.
-Here, for convenience, we extend it to an $8\times8$ matrix with a trailing diagonal block that is the identity, so that multiplication by $\boldsymbol{\Lambda_\theta}$ applies to the full matrix $\mathbf{A}$.
+The upper left diagonal block in $\mathbf{A}$ has diagonal
 
 ```{julia}
-Λ(θ) = Diagonal(vcat(fill(only(θ), 6), ones(eltype(θ), 2)))
-Λ([1.])
+transpose(first(m01.A).diag)  # transpose for compact printing
 ```
 
-The derivative of $\Lambda$ with respect to the first (and only) element of $\boldsymbol\theta$ is
+To evaluate the gradient we create the blocked storage using `grad_blocks` and initialize these blocks for the `p`th parameter with `initialize_blocks!`
 
 ```{julia}
-dΛdθ1 = Diagonal(vcat(ones(6), zeros(2)))
+blks = initialize_blocks!(grad_blocks(m01), m01, 1)
 ```
 
-The matrix $\mathbf{A}$ from which $\boldsymbol{\Omega_\theta}$ is generated is stored in blocks. 
-We assemble these into a sparse matrix as +In practice we evaluate $\mathbf{L}^{-1}\dot{\boldsymbol\Omega}\mathbf{L}^{-\top}$ in the blocked format but here, for illustration, we create $\dot{\boldsymbol{\Omega}}$ in full and use a sparse form of $\mathbf{L}$ to verify the results. ```{julia} -A = sparse(hvcat(2, first(m01.A), m01.A[2]', m01.A[2], last(m01.A))) +Ω_dot = hvcat(2, blks[1,1], blks[1,2], blks[2,1], blks[2,2]) ``` -and check that these blocks are indeed the Gram matrix of the columns of $\mathbf{[ZXy]}$ ```{julia} -A == ZXy'ZXy +L = LowerTriangular(sparseL(m01; full=true)) +``` + +```{julia} +ldiv!(L, rdiv!(Ω_dot, L')) +``` + +The blocked form of this calculation is implemented in the `eval_grad_p!` function + +```{julia} +eval_grad_p!(blks, m01, 1) +``` + +We can see that the $1,1$ block, which is diagonal + +```{julia} +transpose(first(blks).diag) # transpose to print more compactly ``` -The matrix $\boldsymbol{\Omega_\theta}$ is +matches the diagonal in the full-scale calculation. + +Also, the $2,2$ block + +```{julia} +last(blks) +``` + +matches that from the full-scale calculation. + +A finite-difference approximation to this gradient (it is just a scalar derivative in this case) is ```{julia} -Ω(θ) = Λ(θ) * A * Λ(θ)' + Diagonal(vcat(ones(6), zeros(2))) -Ω([1.]) +FiniteDiff.finite_difference_gradient(m01, ones(1)) ``` -The lower Cholesky factor, $\mathbf{L}_\boldsymbol{\theta}$, which is stored in three blocks as described above, can be extracted as a sparse matrix as +The gradient evaluation from the blocked representation is ```{julia} +sum(first(blks).diag) + length(m01.y) * last(last(blks)) +``` + +If we repeat these steps at the parameter estimate we have + +```{julia} +updateL!(setθ!(m01, θ)) # reset the value of θ in the model L = LowerTriangular(sparseL(m01; full=true)) +initialize_blocks!(blks, m01, 1) +M = rdiv!(ldiv!(L, hvcat(2, blks[1,1], blks[1,2], blks[2,1], blks[2,2])), L') ``` -We can check that it is indeed the lower Cholesky factor +with the derivative being evaluated as ```{julia} -L * L' ≈ Ω([1.]) +dot(vcat(ones(6), 0., size(dyestuff, 1)), diag(M)) ``` -The derivative of $\boldsymbol{\Omega}$ with respect to $\theta$ is +Or, using blocked factors, ```{julia} -dΩdθ1(θ) = dΛdθ1 * A * Λ(θ)' + Λ(θ) * A * dΛdθ1 -dΩdθ1([1.]) +eval_grad_p!(blks, m01, 1) ``` -Notice that this matrix, like $\mathbf{A}$, is symmetric and has the same block structure as $\mathbf{A}$. -In fact, the $[2,1]$ block of this matrix is exactly the same as the $[2,1]$ block of $\mathbf{A}$. +```{julia} +sum(first(blks).diag) + length(m01.y) * last(last(blks)) +``` -We do the premultiplying by $\mathbf{L}^{-1}$ and postmultiplying by $\mathbf{L}^{-\top}$ in two stages, to check the form of the intermediate result +Or, using a finite-difference approximation ```{julia} -prePhi = ldiv!(L, dΩdθ1([1.])) +FiniteDiff.finite_difference_gradient(m01, θ) ``` + +### Sleepstudy - a single vector-valued random-effects term + ```{julia} -rdiv!(prePhi, L') +m03 = fit(MixedModel, @formula(reaction ~ 1 + days + (1+days|subj)), dataset(:sleepstudy); progress) +θ03 = m03.θ +print(m03) ``` -The derivative of the objective at $\theta_1=1$ is, therefore +Reset the parameters to the starting values, create the blocks for the gradient evaluation, and initialize the blocks for the evaluation of the first element of the gradient. 
```{julia} -dot(vcat(ones(6), 0., size(dyestuff, 1)), diag(prePhi)) +updateL!(setθ!(m03, m03.optsum.initial)) +blks = initialize_blocks!(grad_blocks(m03), m03, 1) +view(first(blks), 1:4, 1:4) ``` -which we can compare to the finite-difference evaluation +```{julia} +L = LowerTriangular(sparseL(m03; full=true)) +``` ```{julia} -FiniteDiff.finite_difference_gradient(objective!(m01), [1.]) +M = ldiv!(L, rdiv!(hvcat(2, blks[1,1], blks[1,2], blks[2,1], blks[2,2]), L')) ``` -If we repeat these steps at the parameter estimate we have ```{julia} -#| output: false -updateL!(setθ!(m01, θ)) # reset the value of θ in the model -L = LowerTriangular(sparseL(m01; full=true)) -prePhi = rdiv!(ldiv!(L, dΩdθ1(θ)), L') +sum(M[i,i] for i in 1:36) + length(m03.y) * last(M) ``` -with the derivative being evaluated as +```{julia} +FiniteDiff.finite_difference_gradient(m03, m03.optsum.initial) +``` + +For the blocked evaluation + +```{julia} +eval_grad_p!(blks, m03, 1) +``` + +the value of the gradient component is ```{julia} -dot(vcat(ones(6), 0., size(dyestuff, 1)), diag(prePhi)) +dat = first(blks).data +sum(k -> (dat[1, 1, k] + dat[2, 2, k]), axes(dat, 3)) + length(m03.y) * last(last(blks)) ``` -#### Using blocked factors +For the second component of the gradient + +```{julia} +eval_grad_p!(blks, m03, 2) +view(first(blks), 1:4, 1:4) +``` -It would not be practical to create the `sparseL` matrix and a dense copy of $\mathbf{A}$, as is done here, for general cases. -The whole point of using a blocked Cholesky factor for evaluating the objective is that the $[1,1]$ block is either diagonal or uniform-block-diagonal, which can result in considerable savings of object size and execution time when evaluating the objective. +```{julia} +last(blks) +``` -We would want to retain this time and memory savings when evaluating the gradient by creating a blocked computation for evaluation of the gradient. +```{julia} +sum(k -> (dat[1, 1, k] + dat[2, 2, k]), axes(dat, 3)) + length(m03.y) * last(last(blks)) +``` -The blocked representations of both the $\mathbf{A}$ and $\mathbf{L}$ arrays store only the lower triangle. -It appears that it will be necessary to store the upper triangle in addition to the lower triangle to be able to evaluate the gradient. -There may be a clever way around this but right now I can't see one. +And, for the final component, -To create the blocked representation of $\Omega$ we begin with the diagonal block in which the parameter component to be differentiated occurs. -It is assumed that the block has already been allocated. 
+```{julia} +eval_grad_p!(blks, m03, 3) +view(first(blks), 1:4, 1:4) +``` ```{julia} -dΩdθ1_blk = Omega_dot_diag_block!(similar(first(m01.A)), m01, 1) +last(blks) ``` +```{julia} +sum(k -> (dat[1, 1, k] + dat[2, 2, k]), axes(dat, 3)) + length(m03.y) * last(last(blks)) +``` ### Penicillin - two completely crossed scalar random-effects terms @@ -464,137 +521,79 @@ objective(m02) #### Evaluating terms in the gradient -Again, we create the full Gram matrix $\mathbf{A}$ from the blocks stored in the `A` property of the model - -```{julia} -A2 = let blk = m02.A - hvcat(3, first(blk), blk[2]', blk[4]', blk[2], blk[3], blk[5]', blk[4], blk[5], blk[6]) -end -Int.(A2) -``` - -The $32\times32$ form of $\boldsymbol{\Lambda}(\boldsymbol\theta)$ is - -```{julia} -function Λ2(θ::Vector{Float64}) - length(θ) == 2 || throw(DimensionMismatch("length(θ) should be 2")) - return Diagonal(vcat(fill(first(θ), 24), fill(last(θ), 6), ones(2))) -end -Int.(Λ2([1.,1.])) -``` - -producing $\boldsymbol{\Omega}$ as - -```{julia} -function Ω2(θ) - length(θ) == 2 || throw(DimensionMismatch("length(θ) should be 2")) - return Λ2(θ) * A2 * Λ2(θ)' + Diagonal(vcat(ones(30), zeros(2))) -end -Int.(Ω2([1.,1.])) -``` - -The partial derivatives of `Ω2` are constants +For the first gradient component, at the initial values ```{julia} -#| output: false -dΛ2dθ1 = Diagonal(vcat(ones(24), zeros(8))) -dΛ2dθ2 = Diagonal(vcat(zeros(24), ones(6), zeros(2))) +updateL!(setθ!(m02, m02.optsum.initial)) +blks = eval_grad_p!(grad_blocks(m02), m02, 1) ``` -For the ML objective (i.e. negative twice the log-likelihood) a finite difference gradient at the initial $\boldsymbol{\theta}=[1,1]^\top$ is +the value of the gradient component is ```{julia} -FiniteDiff.finite_difference_gradient(objective!(m02), ones(2)) +sum(first(blks).diag) + sum(diag(blks[2,2])) + length(m02.y) * last(last(blks)) ``` -To evaluate this quantity from the formula we create +and the second component is ```{julia} -dΩ2dθ1(θ) = dΛ2dθ1 * A2 * Λ2(θ)' + Λ2(θ) * A2 * dΛ2dθ1' -Int.(dΩ2dθ1([1., 1.])) # Int to save space when printing +eval_grad_p!(blks, m02, 2) ``` +Notice that the $1,1$ block is zero. +At present we evaluate it but in the future we can skip that calculation. -```{julia} -prePhi = rdiv!(ldiv!(Lsparse, dΩ2dθ1([1.,1.])), Lsparse') -``` - -yielding the first component of the gradient as ```{julia} -dot(diag(prePhi), vcat(ones(30), 0., length(m02.y))) +sum(diag(blks[2,2])) + length(m02.y) * last(last(blks)) ``` -For the second component of the gradient +These can be compared to the finite-difference approximations ```{julia} -dΩ2dθ2(θ) = dΛ2dθ2 * A2 * Λ2(θ)' + Λ2(θ) * A2 * dΛ2dθ2' -Int.(dΩ2dθ2([1., 1.])) +FiniteDiff.finite_difference_gradient(m02, m02.optsum.initial) ``` -```{julia} -prePhi = rdiv!(ldiv!(Lsparse, dΩ2dθ2([1.,1.])), Lsparse') -``` +In the full matrix representation ```{julia} -dot(diag(prePhi), vcat(ones(30), 0., length(m02.y))) +initialize_blocks!(blks, m02, 2) +Ω_dot = let b = blks + hvcat(3, b[1,1], b[1,2], b[1,3], b[2,1], b[2,2], b[2,3], b[3,1], b[3,2], b[3,3]) +end ``` -### Sleepstudy - a single vector-valued random-effects term - ```{julia} -m03 = fit(MixedModel, @formula(reaction ~ 1 + days + (1+days|subj)), dataset(:sleepstudy); progress) -θ03 = m03.θ -print(m03) +L = LowerTriangular(sparseL(m02; full=true)) ``` -For this model we create a function to evaluate the gradient components. 
-First, the gradient of the logarithm of the square of the determinant of $\mathbf{L_{1,1}}$ - ```{julia} -ldfun(θ::Vector{Float64}) = logdet(updateL!(setθ!(m03, θ))) -ldfun(θ03) +ldiv!(L, rdiv!(Ω_dot, L')) ``` -A finite-difference approximation to the gradient is - ```{julia} -FiniteDiff.finite_difference_gradient(ldfun, θ03) +sum(Ω_dot[j,j] for j in 25:30) + length(m02.y) * last(Ω_dot) ``` -with the corresponding analytic value - ```{julia} -include("gradient.jl") -grad_comp(updateL!(setθ!(m03, θ03))) # need to reset θ after the finite_difference operation +eval_grad_p!(blks, m02, 2) ``` - -#### Speed of evaluation - -First, the evaluation of the log-determinant of $\mathbf{L_{1,1}}$ - ```{julia} -@benchmark logdet(updateL!(setθ!($m03, $θ03))) seconds=1 +view(Ω_dot, 25:30, 25:30) ``` -Then the subsequent evaluation of the gradient of the log-determinant - ```{julia} -@benchmark grad_comp($m03) seconds=1 +blks[2,2] ``` -For comparison, the finite-difference evaluation ```{julia} -@benchmark FiniteDiff.finite_difference_gradient($ldfun, $θ03) seconds=1 +last(blks) ``` -and the forward-difference version - ```{julia} -@benchmark ForwardDiff.gradient($m03, $θ03) seconds=1 +view(Ω_dot, 31:32, 31:32) ``` - ### References {.unnumbered} ::: {#refs} From 8f3981e85908ab9af928e7d02b475ac294aac0e5 Mon Sep 17 00:00:00 2001 From: Douglas Bates Date: Sun, 25 Jan 2026 10:29:37 -0600 Subject: [PATCH 18/23] Fixed, I hope, the initialization of the gradient blocked matrix --- gradients/GradientEvaluation.qmd | 8 ++++++-- src/gradient.jl | 32 ++++++++++++++++---------------- 2 files changed, 22 insertions(+), 18 deletions(-) diff --git a/gradients/GradientEvaluation.qmd b/gradients/GradientEvaluation.qmd index 61147ea38..5476ae2d1 100644 --- a/gradients/GradientEvaluation.qmd +++ b/gradients/GradientEvaluation.qmd @@ -557,7 +557,7 @@ FiniteDiff.finite_difference_gradient(m02, m02.optsum.initial) In the full matrix representation ```{julia} -initialize_blocks!(blks, m02, 2) +initialize_blocks!(blks, updateL!(setθ!(m02, m02.optsum.initial)), 2) Ω_dot = let b = blks hvcat(3, b[1,1], b[1,2], b[1,3], b[2,1], b[2,2], b[2,3], b[3,1], b[3,2], b[3,3]) end @@ -576,7 +576,7 @@ sum(Ω_dot[j,j] for j in 25:30) + length(m02.y) * last(Ω_dot) ``` ```{julia} -eval_grad_p!(blks, m02, 2) +eval_grad_p!(blks, updateL!(setθ!(m02, m02.optsum.initial)), 2) ``` ```{julia} @@ -594,6 +594,10 @@ last(blks) ```{julia} view(Ω_dot, 31:32, 31:32) ``` + +```{julia} +sum(diag(blks[2,2])) + length(m02.y) * last(last(blks)) +``` ### References {.unnumbered} ::: {#refs} diff --git a/src/gradient.jl b/src/gradient.jl index 3c229d0ce..bd4a17f1c 100644 --- a/src/gradient.jl +++ b/src/gradient.jl @@ -221,25 +221,25 @@ function initialize_blocks!( (; parmap, A, reterms) = m b, i, j = parmap[p] k = size(reterms[b].λ, 1) - for ii in 1:(b - 1) - for jj in 1:(b - 1) - _zero_out(blks[ii, jj]) - end - end Omega_dot_diag_block!(blks[b, b], m, p) # populate the b'th diagonal block - for c in (b + 1):size(blks, 2) # zero out the blocks below and to the right of [b, b] - for r in (b + 1):size(blks, 1) - _zero_out(blks[r, c]) + for r in axes(blks, 1) # iterate over the lower triangle, transpose-copying to upper triangle + if r ≠ b + for c in 1:r + if c ≠ b + _zero_out(blks[r, c]) + _zero_out(blks[c, r]) + else + copyskip!(blks[r, c], A[block(r, c)], i, j, k) + copyto!(blks[c, r], blks[r, c]') + end + end + else + for c in 1:(r - 1) + copyskip!(blks[r, c], A[block(r, c)], i, j, k) + copyto!(blks[c, r], blks[r, c]') + end end end - for c in 1:(b - 1) - 
copyskip!(blks[b, c], A[block(b, c)], i, j, k) - copyto!(blks[c, b], blks[b, c]') - end - for r in (b + 1):size(blks, 1) - copyskip!(blks[r, b], A[block(r, b)], i, j, k) - copyto!(blks[b, r], blks[r, b]') - end return blks end From 89ba9cdd3d3e7f09b427ee103321b36dc7bc7114 Mon Sep 17 00:00:00 2001 From: Douglas Bates Date: Sun, 25 Jan 2026 10:41:42 -0600 Subject: [PATCH 19/23] Formatting changes --- src/gradient.jl | 232 +++++++++++++++++++----------------- src/linalg.jl | 6 +- src/linalg/cholUnblocked.jl | 2 +- src/linalg/rankUpdate.jl | 2 +- src/linearmixedmodel.jl | 9 +- src/mixedmodel.jl | 4 +- src/remat.jl | 2 +- test/grad.jl | 34 +++--- 8 files changed, 153 insertions(+), 138 deletions(-) diff --git a/src/gradient.jl b/src/gradient.jl index bd4a17f1c..28c08b07a 100644 --- a/src/gradient.jl +++ b/src/gradient.jl @@ -8,74 +8,82 @@ Fill `blk` as the non-zero diagonal block of ∂Ω/∂θₚ for parameter number For any `p` only one diagonal block of ∂Ω/∂θₚ will be non-zero. """ function Omega_dot_diag_block!( - blk::Diagonal{T, Vector{T}}, - m::LinearMixedModel{T}, - p::Integer, -) where {T} - (; parmap, A, reterms) = m - b, i, j = parmap[p] - isone(i) && isone(j) || throw(ArgumentError("parameter $p should be from a scalar r.e. term")) - # It is common for 'b' to be one as well but nested models can result in diagonal blk for b > 1 - blk_diag = blk.diag - A_diag = A[kp1choose2(b)].diag # will throw an error if the A[b,b] block is not Diagonal - length(blk_diag) == length(A_diag) || throw(DimensionMismatch("A_diag and blk_diag have different lengths")) - λ = only(reterms[b].λ) # will throw an error if reterms[b] is not of size (1,1) - for k in eachindex(blk_diag, A_diag) - blk_diag[k] = T(2) * λ * A_diag[k] - end - return blk + blk::Diagonal{T,Vector{T}}, + m::LinearMixedModel{T}, + p::Integer, +) where {T} + (; parmap, A, reterms) = m + b, i, j = parmap[p] + isone(i) && isone(j) || + throw(ArgumentError("parameter $p should be from a scalar r.e. term")) + # It is common for 'b' to be one as well but nested models can result in diagonal blk for b > 1 + blk_diag = blk.diag + A_diag = A[kp1choose2(b)].diag # will throw an error if the A[b,b] block is not Diagonal + length(blk_diag) == length(A_diag) || + throw(DimensionMismatch("A_diag and blk_diag have different lengths")) + λ = only(reterms[b].λ) # will throw an error if reterms[b] is not of size (1,1) + for k in eachindex(blk_diag, A_diag) + blk_diag[k] = T(2) * λ * A_diag[k] + end + return blk end function Omega_dot_diag_block!( - blk::Matrix{T}, - m::LinearMixedModel{T}, - p::Integer, -) where {T} - (; parmap, A, reterms) = m - b, i, j = parmap[p] - k = size(reterms[b].λ, 1) - if isone(k) - isone(i) && isone(j) || throw(ArgumentError("parameter $p should be from a scalar r.e. term")) - A_diag = A[kp1choose2(b)].diag # will throw an error if the A[b,b] block is not Diagonal - size(blk, 1) == size(blk, 2) == length(A_diag) || throw(DimensionMismatch("A_diag and blk_diag have different lengths")) - λ = only(reterms[b].λ) # will throw an error if reterms[b].λ is not of size (1,1) - _zero_out(blk) - for k in eachindex(A_diag) - blk[k, k] = T(2) * λ * A_diag[k] - end - return blk - end - throw(ArgumentError("Code not yet written for k > 1")) + blk::Matrix{T}, + m::LinearMixedModel{T}, + p::Integer, +) where {T} + (; parmap, A, reterms) = m + b, i, j = parmap[p] + k = size(reterms[b].λ, 1) + if isone(k) + isone(i) && isone(j) || + throw(ArgumentError("parameter $p should be from a scalar r.e. 
term")) + A_diag = A[kp1choose2(b)].diag # will throw an error if the A[b,b] block is not Diagonal + size(blk, 1) == size(blk, 2) == length(A_diag) || + throw(DimensionMismatch("A_diag and blk_diag have different lengths")) + λ = only(reterms[b].λ) # will throw an error if reterms[b].λ is not of size (1,1) + _zero_out(blk) + for k in eachindex(A_diag) + blk[k, k] = T(2) * λ * A_diag[k] + end + return blk + end + throw(ArgumentError("Code not yet written for k > 1")) end function Omega_dot_diag_block!( - blk::UniformBlockDiagonal{T}, - m::LinearMixedModel{T}, - p::Integer + blk::UniformBlockDiagonal{T}, + m::LinearMixedModel{T}, + p::Integer, ) where {T} - (; parmap, A, reterms) = m - b, i, j = parmap[p] - Ablk = A[kp1choose2(b)] - if !isa(Ablk, UniformBlockDiagonal{T}) - throw(ArgumentError("parmap[p] = $(parmap[p]) but A[$(kp1choose2(b))] is not UniformBlockDiagonal")) - end - blk_dat = fill!(blk.data, zero(T)) - Ablk_dat = Ablk.data - λ = reterms[b].λ - for k in axes(blk_dat, 3) - # right multiply by λ-dot-transpose, which is zeros except for a single 1 at the i'th column and j'th row - # thus we copy the i'th column of the k'th face of Ablk_dat into the j'th column of the k'th face of blk_dat - copyto!(view(blk_dat, :, i, k), view(Ablk_dat, :, j, k)) - lmul!(λ, view(blk_dat, :, :, k)) # left-multiply by λ - for jj in axes(λ, 2) # symmetrize the face while multiplying the diagonal by 2 - for ii in 1:(jj - 1) - val = blk_dat[ii, jj, k] + blk_dat[jj, ii, k] - blk_dat[ii, jj, k] = blk_dat[jj, ii, k] = val - end - blk_dat[jj, jj, k] *= T(2) + (; parmap, A, reterms) = m + b, i, j = parmap[p] + Ablk = A[kp1choose2(b)] + if !isa(Ablk, UniformBlockDiagonal{T}) + throw( + ArgumentError( + "parmap[p] = $(parmap[p]) but A[$(kp1choose2(b))] is not UniformBlockDiagonal", + ), + ) end - end - return blk + blk_dat = fill!(blk.data, zero(T)) + Ablk_dat = Ablk.data + λ = reterms[b].λ + for k in axes(blk_dat, 3) + # right multiply by λ-dot-transpose, which is zeros except for a single 1 at the i'th column and j'th row + # thus we copy the i'th column of the k'th face of Ablk_dat into the j'th column of the k'th face of blk_dat + copyto!(view(blk_dat, :, i, k), view(Ablk_dat, :, j, k)) + lmul!(λ, view(blk_dat,:,:,k)) # left-multiply by λ + for jj in axes(λ, 2) # symmetrize the face while multiplying the diagonal by 2 + for ii in 1:(jj - 1) + val = blk_dat[ii, jj, k] + blk_dat[jj, ii, k] + blk_dat[ii, jj, k] = blk_dat[jj, ii, k] = val + end + blk_dat[jj, jj, k] *= T(2) + end + end + return blk end function LinearAlgebra.ldiv!( @@ -85,10 +93,12 @@ function LinearAlgebra.ldiv!( A_dat = A.data.data B_dat = B.data if size(A_dat) ≠ size(B_dat) - throw(DimensionMismatch("size(A_dat) = $(size(A_dat)) ≠ $(size(B_dat)) = size(B_dat)")) + throw( + DimensionMismatch("size(A_dat) = $(size(A_dat)) ≠ $(size(B_dat)) = size(B_dat)") + ) end for k in axes(B_dat, 3) - ldiv!(LowerTriangular(view(A_dat, :, :, k)), view(B_dat, :, :, k)) + ldiv!(LowerTriangular(view(A_dat,:,:,k)), view(B_dat,:,:,k)) end return B end @@ -104,7 +114,7 @@ function LinearAlgebra.ldiv!( axis1 = axes(A_dat, 1) offset = 0 for k in axes(A_dat, 3) - ldiv!(LowerTriangular(view(A_dat, :, :, k)), view(B, offset .+ axis1, :)) + ldiv!(LowerTriangular(view(A_dat,:,:,k)), view(B, offset .+ axis1, :)) offset += length(axis1) end return B @@ -112,15 +122,17 @@ end function LinearAlgebra.rdiv!( A::UniformBlockDiagonal{T}, - B::UpperTriangular{T, LinearAlgebra.Transpose{T, UniformBlockDiagonal{T}}}, + 
B::UpperTriangular{T,LinearAlgebra.Transpose{T,UniformBlockDiagonal{T}}},
 ) where {T}
     A_dat = A.data
     B_dat = B.data.parent.data
     if size(A_dat) ≠ size(B_dat)
-        throw(DimensionMismatch("size(A_dat) = $(size(A_dat)) ≠ $(size(B_dat)) = size(B_dat)"))
+        throw(
+            DimensionMismatch("size(A_dat) = $(size(A_dat)) ≠ $(size(B_dat)) = size(B_dat)")
+        )
     end
     for k in axes(A_dat, 3)
-        rdiv!(view(A_dat, :, :, k), LowerTriangular(view(B_dat, :, :, k))')
+        rdiv!(view(A_dat,:,:,k), LowerTriangular(view(B_dat,:,:,k))')
     end
     return A
 end
@@ -167,7 +179,8 @@ Create `A * Ω_dot` in `B` where `Ω_dot` is the indicator for the `i`'th row an
 """
 function copyskip!(B::Matrix{T}, A::Matrix{T}, i::Integer, j::Integer, k::Integer) where {T}
     m, n = size(A)
-    (m, n) == size(B) || throw(DimensionMismatch("size(A) = $(size(A)) ≠ $(size(B)) = size(B)"))
+    (m, n) == size(B) ||
+        throw(DimensionMismatch("size(A) = $(size(A)) ≠ $(size(B)) = size(B)"))
     isone(k) && return copyto!(B, A)
 
     fill!(B, zero(T))
@@ -176,13 +189,16 @@ function copyskip!(B::Matrix{T}, A::Matrix{T}, i::Integer, j::Integer, k::Intege
     offset = 0
     for _ in 1:q
         copyto!(view(B, :, offset + i), view(A, :, offset + j))
-        offset += k
-    end
+        offset += k
+    end
     return B
 end
 
-function copyskip!(B::SparseMatrixCSC{T}, A::SparseMatrixCSC{T}, i::Integer, j::Integer, k::Integer) where {T}
-    (A.m == B.m && A.n == B.n) || throw(DimensionMismatch("size(A) = $(size(A)) ≠ $(size(B)) = size(B)"))
+function copyskip!(
+    B::SparseMatrixCSC{T}, A::SparseMatrixCSC{T}, i::Integer, j::Integer, k::Integer
+) where {T}
+    (A.m == B.m && A.n == B.n) ||
+        throw(DimensionMismatch("size(A) = $(size(A)) ≠ $(size(B)) = size(B)"))
     if any(A.colptr .≠ B.colptr) || any(rowvals(A) .≠ rowvals(B))
         throw(ArgumentError("A and B must have the same sparsity pattern"))
     end
@@ -203,8 +219,8 @@ function copyskip!(B::SparseMatrixCSC{T}, A::SparseMatrixCSC{T}, i::Integer, j::
             throw(ArgumentError("A and B must have same sparsity pattern after shifting"))
         end
         copyto!(view(nzB, nzrB), view(nzA, nzrA))
-        offset += k
-    end
+        offset += k
+    end
     return B
 end
 
@@ -248,43 +264,45 @@ Evaluate the gradient component for parameter `p` in model `m` using blocks in `
 """
-function eval_grad_p!(blks::Matrix{AbstractMatrix{T}}, m::LinearMixedModel{T}, p::Integer) where {T}
+function eval_grad_p!(
+    blks::Matrix{AbstractMatrix{T}}, m::LinearMixedModel{T}, p::Integer
+) where {T}
     L = m.L
-# b = first(parmap[p]) # block, row and column for parameter p
+    # b = first(parmap[p]) # block, row and column for parameter p
     initialize_blocks!(blks, m, p) # change this to pass b, i, j, k separately
     for kk in axes(blks, 2) # ldiv!(LowerTriangular(L), blks)
-# if jj ≥ b # maybe hold off on this at the expense of some multiplications by zero
+        # if jj ≥ b # maybe hold off on this at the expense of some multiplications by zero
         L11 = L[block(1, 1)]
         isa(L11, Diagonal) || (L11 = LowerTriangular(L11))
         C1 = ldiv!(L11, blks[1, kk])
         for ii in axes(blks, 1)[2:end]
             mul!(blks[ii, kk], L[block(ii, 1)], C1, -one(T), one(T))
         end
-# end
+        # end
         for jj in axes(blks, 1)[2:end]
            Cj = ldiv!(LowerTriangular(L[block(jj, jj)]), blks[jj, kk])
-            for ii in jj+1:lastindex(blks, 1)
+            for ii in (jj + 1):lastindex(blks, 1)
                 mul!(blks[ii, kk], L[block(ii, jj)], Cj, -one(T), one(T))
             end
         end
     end
     # for k in axes(B,2)
     #     a11 = A[1,1]
     #     iszero(a11) && throw(SingularException(1))
     #     C1 = C[1,k] = a11 \ B[1,k]
     #     # fill C-column
     #     for i in axes(B,1)[2:end]
     #         C[i,k] = oA \ B[i,k] - _ustrip(A[i,1]) * C1
     #     end
     #     for j in axes(B,1)[2:end]
     #         ajj = 
A[j,j] - # iszero(ajj) && throw(SingularException(j)) - # Cj = C[j,k] = _ustrip(ajj) \ C[j,k] - # for i in j+1:lastindex(A,1) - # C[i,k] -= _ustrip(A[i,j]) * Cj - # end - # end - # end + end + # for k in axes(B,2) + # a11 = A[1,1] + # iszero(a11) && throw(SingularException(1)) + # C1 = C[1,k] = a11 \ B[1,k] + # # fill C-column + # for i in axes(B,1)[2:end] + # C[i,k] = oA \ B[i,k] - _ustrip(A[i,1]) * C1 + # end + # for j in axes(B,1)[2:end] + # ajj = A[j,j] + # iszero(ajj) && throw(SingularException(j)) + # Cj = C[j,k] = _ustrip(ajj) \ C[j,k] + # for i in j+1:lastindex(A,1) + # C[i,k] -= _ustrip(A[i,j]) * Cj + # end + # end + # end for ii in axes(blks, 1) # rdiv!(blks, transpose(LowerTriangular(L))) for jj in axes(blks, 2) blkij = blks[ii, jj] @@ -294,17 +312,15 @@ function eval_grad_p!(blks::Matrix{AbstractMatrix{T}}, m::LinearMixedModel{T}, p rdiv!(blkij, transpose(LowerTriangular(L[block(jj, jj)]))) end end - # for i in axes(A,1) - # for j in axes(A,2) - # Aij = A[i,j] - # for k in firstindex(B,2):j - 1 - # Aij -= C[i,k]*tfun(B[j,k]) - # end - # unit || (iszero(B[j,j]) && throw(SingularException(j))) - # C[i,j] = Aij / (unit ? oB : tfun(B[j,j])) - # end - # end + # for i in axes(A,1) + # for j in axes(A,2) + # Aij = A[i,j] + # for k in firstindex(B,2):j - 1 + # Aij -= C[i,k]*tfun(B[j,k]) + # end + # unit || (iszero(B[j,j]) && throw(SingularException(j))) + # C[i,j] = Aij / (unit ? oB : tfun(B[j,j])) + # end + # end return blks end - - diff --git a/src/linalg.jl b/src/linalg.jl index 91e628153..106941adf 100644 --- a/src/linalg.jl +++ b/src/linalg.jl @@ -54,7 +54,7 @@ function LinearAlgebra.ldiv!( m, n, k = size(Adat) bb = reshape(B, (n, k)) for j in axes(Adat, 3) - ldiv!(UpperTriangular(adjoint(view(Adat, :, :, j))), view(bb, :, j)) + ldiv!(UpperTriangular(adjoint(view(Adat,:,:,j))), view(bb, :, j)) end return B end @@ -71,7 +71,7 @@ function LinearAlgebra.rdiv!( coloffset = (b - 1) * s rdiv!( view(A, :, (coloffset + 1):(coloffset + s)), - UpperTriangular(adjoint(view(Bdd, :, :, b))), + UpperTriangular(adjoint(view(Bdd,:,:,b))), ) end return A @@ -89,7 +89,7 @@ function LinearAlgebra.rdiv!( for j in axes(Bdat, 3) rdiv!( reshape(view(nzv, cbpt[j]:(cbpt[j + 1] - 1)), :, P), - UpperTriangular(adjoint(view(Bdat, :, :, j))), + UpperTriangular(adjoint(view(Bdat,:,:,j))), ) end return A diff --git a/src/linalg/cholUnblocked.jl b/src/linalg/cholUnblocked.jl index b430e324a..dcfcb0e3f 100644 --- a/src/linalg/cholUnblocked.jl +++ b/src/linalg/cholUnblocked.jl @@ -39,7 +39,7 @@ end function cholUnblocked!(D::UniformBlockDiagonal, ::Type{Val{:L}}) Ddat = D.data for k in axes(Ddat, 3) - cholUnblocked!(view(Ddat, :, :, k), Val{:L}) + cholUnblocked!(view(Ddat,:,:,k), Val{:L}) end return D end diff --git a/src/linalg/rankUpdate.jl b/src/linalg/rankUpdate.jl index 705f87d53..9a1eb3993 100644 --- a/src/linalg/rankUpdate.jl +++ b/src/linalg/rankUpdate.jl @@ -180,7 +180,7 @@ function rankUpdate!( @inbounds for j in axes(Ac, 2) nzr = nzrange(Ac, j) - BLAS.syr!('L', α, view(nz, nzr), view(Cdat, :, :, div(rv[last(nzr)], S))) + BLAS.syr!('L', α, view(nz, nzr), view(Cdat,:,:,div(rv[last(nzr)], S))) end return C diff --git a/src/linearmixedmodel.jl b/src/linearmixedmodel.jl index 458508d16..66d5ebfa6 100644 --- a/src/linearmixedmodel.jl +++ b/src/linearmixedmodel.jl @@ -338,7 +338,7 @@ function condVar(m::LinearMixedModel{T}, fname) where {T} fill!(scratch, zero(T)) copyto!(view(scratch, (b - 1) * vsz .+ (1:vsz), :), λt) ldiv!(Lblk, scratch) - mul!(view(val, :, :, b), scratch', scratch) + mul!(view(val,:,:,b), 
scratch', scratch) end return val end @@ -347,7 +347,7 @@ function _cvtbl(arr::Array{T,3}, trm) where {T} return merge( NamedTuple{(fname(trm),)}((trm.levels,)), columntable([ - NamedTuple{(:σ, :ρ)}(sdcorr(view(arr, :, :, i))) for i in axes(arr, 3) + NamedTuple{(:σ, :ρ)}(sdcorr(view(arr,:,:,i))) for i in axes(arr, 3) ]), ) end @@ -729,7 +729,7 @@ end # use dispatch to distinguish Diagonal and UniformBlockDiagonal in first(L) _ldivB1!(B1::Diagonal{T}, rhs::AbstractVector{T}, ind) where {T} = rhs ./= B1.diag[ind] function _ldivB1!(B1::UniformBlockDiagonal{T}, rhs::AbstractVector{T}, ind) where {T} - return ldiv!(LowerTriangular(view(B1.data, :, :, ind)), rhs) + return ldiv!(LowerTriangular(view(B1.data,:,:,ind)), rhs) end """ @@ -800,8 +800,9 @@ Return the vector of _canonical_ lower bounds on the parameters, `θ`. Note that this method does not distinguish between constrained optimization and unconstrained optimization with post-fit canonicalization. """ -lowerbd(m::LinearMixedModel{T}) where {T} = +function lowerbd(m::LinearMixedModel{T}) where {T} [(pm[2] == pm[3]) ? zero(T) : T(-Inf) for pm in m.parmap] +end function mkparmap(reterms::Vector{<:AbstractReMat{T}}) where {T} parmap = NTuple{3,Int}[] diff --git a/src/mixedmodel.jl b/src/mixedmodel.jl index 036e65253..3a4bb2503 100644 --- a/src/mixedmodel.jl +++ b/src/mixedmodel.jl @@ -144,8 +144,8 @@ StatsAPI.predict(m::MixedModel) = fitted(m) function retbl(mat, trm) nms = (fname(trm), Symbol.(trm.cnames)...) return Table( - [NamedTuple{nms}((l, view(mat, :, i)...),) for (i, l) in enumerate(trm.levels)] -) + [NamedTuple{nms}((l, view(mat, :, i)...),) for (i, l) in enumerate(trm.levels)] + ) end StatsAPI.adjr2(m::MixedModel) = r2(m) diff --git a/src/remat.jl b/src/remat.jl index 17379d927..b093a49f1 100644 --- a/src/remat.jl +++ b/src/remat.jl @@ -593,7 +593,7 @@ function copyscaleinflate!( dind = diagind(S, S) Ldat = copyto!(Ljj.data, Ajj.data) for k in axes(Ldat, 3) - f = view(Ldat, :, :, k) + f = view(Ldat,:,:,k) lmul!(λ', rmul!(f, λ)) for i in dind f[i] += one(T) # inflate diagonal diff --git a/test/grad.jl b/test/grad.jl index 1e036be7d..5fc866b0b 100644 --- a/test/grad.jl +++ b/test/grad.jl @@ -2,33 +2,33 @@ using LinearAlgebra using MixedModels using Test -using MixedModels: Omega_dot_diag_block! - -using MixedModelsDatasets: dataset +using MixedModels: grad_blocks, eval_grad_p!, initialize_blocks! include("modelcache.jl") @testset "gradient" begin @testset "single_scalar" begin - fm1 = only(models(:dyestuff2)) + fm1 = only(models(:dyestuff)) θ = fm1.θ - blk = Omega_dot_diag_block!(similar(first(fm1.A)), fm1, 1) - @test all(≈(10. * only(θ)), blk.diag) - Omega_dot_diag_block!(blk, updateL!(setθ!(fm1, ones(1))), 1) - @test all(==(10.), blk.diag) + blks = initialize_blocks!(grad_blocks(fm1), fm1, 1) + dblk = blks[1,1].diag + @test all(≈(10. * only(θ)), dblk) + ldiv_blk = ldiv!(first(fm1.L), dblk) + @test all(≈(10. * only(θ) / first(first(fm1.L))), dblk) updateL!(setθ!(fm1, θ)) # restore the estimated parameter values fm2 = first(models(:pastes)) θ = fm2.θ - blk = Omega_dot_diag_block!(similar(first(fm2.A)), fm2, 1) - @test all(≈(4. * only(θ)), blk.diag) - Omega_dot_diag_block!(blk, updateL!(setθ!(fm2, ones(1))), 1) - @test all(==(4.), blk.diag) + blks = initialize_blocks!(grad_blocks(fm2), fm2, 1) + @test all(≈(4. * only(θ)), blks[1, 1].diag) + initialize_blocks!(blks, updateL!(setθ!(fm2, ones(1))), 1) + @test all(==(4.), blks[1, 1].diag) updateL!(setθ!(fm2, θ)) fm3 = last(models(:pastes)) # first of two nested, scalar r.e. 
terms θ = fm3.θ - blk = Omega_dot_diag_block!(similar(first(fm3.A)), fm3, 1) + blks = initialize_blocks!(grad_blocks(fm3), fm3, 1) + @test all(≈(4. * first(θ)), blk.diag) fm4 = only(models(:penicillin)) @@ -39,11 +39,12 @@ include("modelcache.jl") blk = Omega_dot_diag_block!(similar(first(fm5.A)), fm5, 1) @test all(≈(20. * only(fm5.θ)), blk.diag) end + @testset "single_vector" begin fm6 = last(models(:sleepstudy)) λ = only(fm6.reterms).λ θ = fm6.θ - blk = Omega_dot_diag_block!(UniformBlockDiagonal(similar(first(fm6.L).data)), fm6, 1) + blk = Omega_dot_diag_block!(similar(first(fm6.L)), fm6, 1) blk_dat = blk.data A11_dat = first(fm6.A).data @test all(≈(20. * first(θ)), view(blk_dat, 1, 1, :)) @@ -51,6 +52,7 @@ include("modelcache.jl") @test all(view(blk_dat, 1, 2, :) .== view(blk_dat, 2, 1, :)) odiag = dot(view(λ, 2, :), view(A11_dat, :, 1, 1)) @test all(≈(odiag), view(blk_dat, 1, 2, :)) + ldiv!(LowerTriangular(first(fm6.L)), blk) Omega_dot_diag_block!(blk, fm6, 2) @test all(iszero, view(blk_dat, 1, 1, :)) @@ -65,9 +67,5 @@ include("modelcache.jl") @test all(≈(45. * first(θ)), view(blk_dat, 1, 2, :)) diag2 = 2. * dot(view(λ.data, 2, :), view(A11_dat, :, 2, 1)) @test all(≈(diag2), view(blk_dat, 2, 2, :)) - -# FiniteDiff.finite_difference_gradient(objective!(fm6), θ) -# ldfun(m::LinearMixedModel, θ::Vector{Float64}) = logdet(updateL!(setθ!(m, θ))) - end end \ No newline at end of file From aa54be87e9449764a8b4048ec22e84c40be812d5 Mon Sep 17 00:00:00 2001 From: Douglas Bates Date: Mon, 2 Feb 2026 17:13:54 -0600 Subject: [PATCH 20/23] Update gradient code and documents --- gradients/GradientEvaluation.qmd | 15 ++-- gradients/Gradient_based_optimization.qmd | 11 ++- src/gradient.jl | 100 ++++++++++++++++------ 3 files changed, 90 insertions(+), 36 deletions(-) diff --git a/gradients/GradientEvaluation.qmd b/gradients/GradientEvaluation.qmd index 5476ae2d1..e6322fe0e 100644 --- a/gradients/GradientEvaluation.qmd +++ b/gradients/GradientEvaluation.qmd @@ -217,7 +217,6 @@ Load the packages to be used using BenchmarkTools using CairoMakie using FiniteDiff -using ForwardDiff using LinearAlgebra using MixedModels using MixedModels: eval_grad_p!, grad_blocks, initialize_blocks! @@ -267,14 +266,14 @@ Notice that the objective is well-defined for negative values of $\theta_1$ and This means that $\theta_1=0$ will always be a critical value (have a derivative of zero) for this function. -At the maximum likelihood estimate (i.e. the minimizer of the objective) of $\theta_1$, which is the ratio of the standard deviation of the random effects to the residual standard deviation, is +At the maximum likelihood estimate (i.e. the minimizer of the objective), the value of $\theta_1$, which is the ratio of the standard deviation of the random effects to the residual standard deviation, is ```{julia} #| label: dyestuff_theta only(θ) ``` -the derivative should also be zero (in practice, close to zero). +Here the derivative should be zero (in practice, close to zero). We can see from @fig-obj_graph that the derivative at $\theta_1=1.$ will be positive. 
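+
+As a quick numerical sketch of this claim (an illustrative check only, reusing the `objective!(m01)` closure from elsewhere in this document), the finite-difference derivative at $\theta_1=0$ should be near zero:
+
+```{julia}
+# hedged, illustrative check: θ₁ = 0 should be a critical value of the objective
+FiniteDiff.finite_difference_gradient(objective!(m01), zeros(1))
+```
+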
@@ -311,9 +310,11 @@ transpose(first(m01.A).diag) # transpose for compact printing
 To evaluate the gradient we create the blocked storage using `grad_blocks` and initialize these blocks for the `p`th parameter with `initialize_blocks!`
 
 ```{julia}
-blks = initialize_blocks!(grad_blocks(m01), m01, 1)
+blks = initialize_blocks!(grad_blocks(m01), m01, 1, 1, 1, 1)
 ```
 
+(We have switched from passing `p`, from which the block index `b`, the within-block indices `i` and `j`, and the size `k` of the template $\lambda$ matrix for that block were derived, to passing `b`, `i`, `j`, and `k` directly.)
+
 In practice we evaluate $\mathbf{L}^{-1}\dot{\boldsymbol\Omega}\mathbf{L}^{-\top}$ in the blocked format but here, for illustration, we create $\dot{\boldsymbol{\Omega}}$ in full and use a sparse form of $\mathbf{L}$ to verify the results.
 
 ```{julia}
@@ -368,7 +368,7 @@ If we repeat these steps at the parameter estimate we have
 ```{julia}
 updateL!(setθ!(m01, θ)) # reset the value of θ in the model
 L = LowerTriangular(sparseL(m01; full=true))
-initialize_blocks!(blks, m01, 1)
+initialize_blocks!(blks, m01, 1, 1, 1, 1)
 M = rdiv!(ldiv!(L, hvcat(2, blks[1,1], blks[1,2], blks[2,1], blks[2,2])), L')
 ```
@@ -406,7 +407,7 @@ Reset the parameters to the starting values, create the blocks for the gradient
 ```{julia}
 updateL!(setθ!(m03, m03.optsum.initial))
-blks = initialize_blocks!(grad_blocks(m03), m03, 1)
+blks = initialize_blocks!(grad_blocks(m03), m03, 1, 1, 1, 2)
 view(first(blks), 1:4, 1:4)
 ```
diff --git a/gradients/Gradient_based_optimization.qmd b/gradients/Gradient_based_optimization.qmd
index 96ee4eb08..4a766315f 100644
--- a/gradients/Gradient_based_optimization.qmd
+++ b/gradients/Gradient_based_optimization.qmd
@@ -235,6 +235,7 @@ kbm01 = fit(
     dataset(:kb07);
     progress,
 )
+kbm01_obj = objective(kbm01)
 print(kbm01)
 ```
@@ -249,21 +250,23 @@ findall(iszero, kbm01.θ)
 ```
 
 Refitting with the gradient takes a very long time because ForwardDiff is poorly suited to optimization problems with many parameters.
+
 ```{julia}
-#| eval: false
 fitlog = gr_refit!(kbm01)
 ```
 
 ```{julia}
-#| eval: false
-last(fitlog.obj)
+kbm01_obj - last(fitlog.obj)
 ```
 
 ```{julia}
-#| eval: false
 last(fitlog, 5)
 ```
 
+```{julia}
+findall(x -> abs(x) < 1.0e-5, kbm01.θ)
+```
+
 ## Conclusions {#sec-conclusions}
 
 Generally the gradient-based optimizers converge in fewer evaluations than the derivative-free optimizers (`psm01` in @sec-pastes is an exception).
diff --git a/src/gradient.jl b/src/gradient.jl
index 28c08b07a..7d1e28b46 100644
--- a/src/gradient.jl
+++ b/src/gradient.jl
@@ -1,26 +1,28 @@
 # Evaluate analytic gradient of the objective for ML or REML fitting of a LinearMixedModel
 
 """
-    Omega_dot_diag_block!(blk, m::LinearMixedModel, p::Integer)
+    Omega_dot_diag_block!(blk, m::LinearMixedModel, b::Integer, i::Integer, j::Integer)
 
-Fill `blk` as the non-zero diagonal block of ∂Ω/∂θₚ for parameter number `p` of model `m`.
+Fill `blk` as the non-zero diagonal block of ∂Ω/∂θₚ for the parameter in block `b`, position `i, j` of model `m`.
 
 For any `p` only one diagonal block of ∂Ω/∂θₚ will be non-zero. 
""" function Omega_dot_diag_block!( blk::Diagonal{T,Vector{T}}, m::LinearMixedModel{T}, - p::Integer, + b::Integer, + i::Integer, + j::Integer, ) where {T} - (; parmap, A, reterms) = m - b, i, j = parmap[p] + (; A, reterms) = m isone(i) && isone(j) || throw(ArgumentError("parameter $p should be from a scalar r.e. term")) # It is common for 'b' to be one as well but nested models can result in diagonal blk for b > 1 blk_diag = blk.diag - A_diag = A[kp1choose2(b)].diag # will throw an error if the A[b,b] block is not Diagonal - length(blk_diag) == length(A_diag) || + A_diag = A[kp1choose2(b)].diag # will throw an error if the A[b,b] block is not Diagonal + if length(blk_diag) ≠ length(A_diag) throw(DimensionMismatch("A_diag and blk_diag have different lengths")) + end λ = only(reterms[b].λ) # will throw an error if reterms[b] is not of size (1,1) for k in eachindex(blk_diag, A_diag) blk_diag[k] = T(2) * λ * A_diag[k] @@ -31,10 +33,11 @@ end function Omega_dot_diag_block!( blk::Matrix{T}, m::LinearMixedModel{T}, - p::Integer, + b::Integer, + i::Integer, + j::Integer, ) where {T} - (; parmap, A, reterms) = m - b, i, j = parmap[p] + (; A, reterms) = m k = size(reterms[b].λ, 1) if isone(k) isone(i) && isone(j) || @@ -55,10 +58,11 @@ end function Omega_dot_diag_block!( blk::UniformBlockDiagonal{T}, m::LinearMixedModel{T}, - p::Integer, + b::Integer, + i::Integer, + j::Integer, ) where {T} - (; parmap, A, reterms) = m - b, i, j = parmap[p] + (; A, reterms) = m Ablk = A[kp1choose2(b)] if !isa(Ablk, UniformBlockDiagonal{T}) throw( @@ -225,19 +229,20 @@ function copyskip!( end """ - initialize_blocks!(blks::Matrix{AbstractMatrix{T}}, m::LinearMixedModel{T}, p::Integer) + initialize_blocks!(blks::Matrix{AbstractMatrix{T}}, m::LinearMixedModel{T}, b, i, j, k) -Initialize the grad evaluation blocks, `blks`, for model `m`, for parameter `p` +Initialize the grad evaluation blocks, `blks`, for model `m`, for parameter in block `b`, position `(i,j)` of block size `k` """ function initialize_blocks!( blks::Matrix{AbstractMatrix{T}}, m::LinearMixedModel{T}, - p::Integer, + b::Integer, + i::Integer, + j::Integer, + k::Integer, ) where {T} - (; parmap, A, reterms) = m - b, i, j = parmap[p] - k = size(reterms[b].λ, 1) - Omega_dot_diag_block!(blks[b, b], m, p) # populate the b'th diagonal block + A = m.A + Omega_dot_diag_block!(blks[b, b], m, b, i, j) # populate the b'th diagonal block for r in axes(blks, 1) # iterate over the lower triangle, transpose-copying to upper triangle if r ≠ b for c in 1:r @@ -259,6 +264,30 @@ function initialize_blocks!( return blks end +""" + diag_sum(A::AbstractMatrix) + +Return the sum of the diagonal elements of `A` +""" +function diag_sum(A::Matrix) + return sum(A[i] for i in diagind(A)) +end + +function diag_sum(A::UniformBlockDiagonal{T}) where T + dat = A.data + val = zero(T) + for k in axes(dat, 3) + for i in axes(dat, 1) + val += dat[i, i, k] + end + end + return val +end + +function diag_sum(A::Diagonal) + return sum(A.diag) +end + """ eval_grad_p!(blks, m, p) @@ -267,11 +296,12 @@ Evaluate the gradient component for parameter `p` in model `m` using blocks in ` function eval_grad_p!( blks::Matrix{AbstractMatrix{T}}, m::LinearMixedModel{T}, p::Integer ) where {T} - L = m.L - # b = first(parmap[p]) # block, row and column for parameter p - initialize_blocks!(blks, m, p) # change this to pass b, i, j, k separately + (; L, parmap, reterms) = m + (b, i, j) = parmap[p] # block, row and column for parameter p + k = size(reterms[b].λ, 1) + initialize_blocks!(blks, m, b, i, j, k) for 
kk in axes(blks, 2) # ldiv!(LowerTriangular(L), blks)
         # if jj ≥ b # maybe hold off on this at the expense of some multiplications by zero
         L11 = L[block(1, 1)]
         isa(L11, Diagonal) || (L11 = LowerTriangular(L11))
         C1 = ldiv!(L11, blks[1, kk])
@@ -316,7 +321,7 @@ function eval_grad_p!(
             end
         end
     end
+    # code in LinearAlgebra on which this is patterned
     # for k in axes(B,2)
     #     a11 = A[1,1]
@@ -343,5 +350,24 @@ function eval_grad_p!(
     #         C[i,j] = Aij / (unit ? oB : tfun(B[j,j]))
     #     end
     # end
-    return blks
+    return sum(diag_sum(blks[i, i]) for i in 1:(size(blks, 1) - 1)) + length(m.y) * last(last(blks))
+end
+
+"""
+    gradient!(g::Vector{T}, blks::Matrix{AbstractMatrix{T}}, m::LinearMixedModel{T}) where {T}
+
+Overwrite `g` with the gradient of the ML objective of the model `m` at its current parameter values
+"""
+function gradient!(g::Vector{T}, blks::Matrix{AbstractMatrix{T}}, m::LinearMixedModel{T}) where {T}
+    if length(g) ≠ length(m.parmap)
+        throw(DimensionMismatch("length(g) = $(length(g)) should be $(length(m.parmap)) for this model"))
+    end
+    for p in axes(g, 1)
+        g[p] = eval_grad_p!(blks, m, p)
+    end
+    return g
+end
+
+function gradient(blks::Matrix{AbstractMatrix{T}}, m::LinearMixedModel{T}) where {T}
+    return gradient!(similar(m.parmap, T), blks, m)
+end
From b05b788c7c5f14ad6aa4c8cf4a6c4a1d3dff8a77 Mon Sep 17 00:00:00 2001
From: Douglas Bates
Date: Mon, 2 Feb 2026 21:20:52 -0600
Subject: [PATCH 21/23] Adjust tests on gradient

---
 test/grad.jl | 47 +++++++++++++++++++++++------------------------
 1 file changed, 23 insertions(+), 24 deletions(-)

diff --git a/test/grad.jl b/test/grad.jl
index 5fc866b0b..eb2573ab3 100644
--- a/test/grad.jl
+++ b/test/grad.jl
@@ -10,7 +10,7 @@ include("modelcache.jl")
     @testset "single_scalar" begin
         fm1 = only(models(:dyestuff))
         θ = fm1.θ
-        blks = initialize_blocks!(grad_blocks(fm1), fm1, 1)
+        blks = initialize_blocks!(grad_blocks(fm1), fm1, 1, 1, 1, 1)
         dblk = blks[1,1].diag
         @test all(≈(10. * only(θ)), dblk)
         ldiv_blk = ldiv!(first(fm1.L), dblk)
@@ -19,53 +19,52 @@ include("modelcache.jl")
 
         fm2 = first(models(:pastes))
         θ = fm2.θ
-        blks = initialize_blocks!(grad_blocks(fm2), fm2, 1)
+        blks = initialize_blocks!(grad_blocks(fm2), fm2, 1, 1, 1, 1)
         @test all(≈(4. * only(θ)), blks[1, 1].diag)
-        initialize_blocks!(blks, updateL!(setθ!(fm2, ones(1))), 1)
+        initialize_blocks!(blks, updateL!(setθ!(fm2, ones(1))), 1, 1, 1, 1)
         @test all(==(4.), blks[1, 1].diag)
         updateL!(setθ!(fm2, θ))
 
         fm3 = last(models(:pastes)) # first of two nested, scalar r.e. terms
         θ = fm3.θ
-        blks = initialize_blocks!(grad_blocks(fm3), fm3, 1)
-
-        @test all(≈(4. * first(θ)), blk.diag)
+        blks = initialize_blocks!(grad_blocks(fm3), fm3, 1, 1, 1, 1)
+        @test all(≈(4. * first(θ)), blks[1,1].diag)
 
         fm4 = only(models(:penicillin))
-        blk = Omega_dot_diag_block!(similar(first(fm4.A)), fm4, 1)
-        @test all(≈(12. * first(fm4.θ)), blk.diag)
+        blks = initialize_blocks!(grad_blocks(fm4), fm4, 1, 1, 1, 1)
+        @test all(≈(12. * first(fm4.θ)), blks[1,1].diag)
 
         fm5 = first(models(:sleepstudy))
-        blk = Omega_dot_diag_block!(similar(first(fm5.A)), fm5, 1)
-        @test all(≈(20. * only(fm5.θ)), blk.diag)
+        blks = initialize_blocks!(grad_blocks(fm5), fm5, 1, 1, 1, 1)
+        @test all(≈(20. 
* only(fm5.θ)), blks[1, 1].diag) end @testset "single_vector" begin fm6 = last(models(:sleepstudy)) λ = only(fm6.reterms).λ θ = fm6.θ - blk = Omega_dot_diag_block!(similar(first(fm6.L)), fm6, 1) - blk_dat = blk.data + blks = initialize_blocks!(grad_blocks(fm6), fm6, 1, 1, 1, 2) + blk_dat = blks[1,1].data A11_dat = first(fm6.A).data @test all(≈(20. * first(θ)), view(blk_dat, 1, 1, :)) @test all(iszero, view(blk_dat, 2, 2, :)) @test all(view(blk_dat, 1, 2, :) .== view(blk_dat, 2, 1, :)) odiag = dot(view(λ, 2, :), view(A11_dat, :, 1, 1)) @test all(≈(odiag), view(blk_dat, 1, 2, :)) - ldiv!(LowerTriangular(first(fm6.L)), blk) + ldiv!(LowerTriangular(first(fm6.L)), blks[1,1]) - Omega_dot_diag_block!(blk, fm6, 2) + initialize_blocks!(blks, fm6, 1, 2, 2, 2) @test all(iszero, view(blk_dat, 1, 1, :)) - @test all(view(blk_dat, 1, 2, :) .== view(blk, 2, 1, :)) # result should be symmetric - @test all(==(10. * first(θ)), view(blk_dat, 1, 2, :)) - diag2 = 2. * dot(view(λ.data, 2, :), view(A11_dat, :, 1, 1)) - @test all(≈(diag2), view(blk_dat, 2, 2, :)) + @test all(view(blk_dat, 1, 2, :) .== view(blk_dat, 2, 1, :)) # result should be symmetric + # @test all(==(10. * first(θ)), view(blk_dat, 1, 2, :)) + # diag2 = 2. * dot(view(λ.data, 2, :), view(A11_dat, :, 1, 1)) + # @test all(≈(diag2), view(blk_dat, 2, 2, :)) - Omega_dot_diag_block!(blk, fm6, 3) - @test all(iszero, view(blk_dat, 1, 1, :)) - @test all(view(blk_dat, 1, 2, :) .== view(blk, 2, 1, :)) # faces of result should be symmetric - @test all(≈(45. * first(θ)), view(blk_dat, 1, 2, :)) - diag2 = 2. * dot(view(λ.data, 2, :), view(A11_dat, :, 2, 1)) - @test all(≈(diag2), view(blk_dat, 2, 2, :)) + # Omega_dot_diag_block!(blk, fm6, 3) + # @test all(iszero, view(blk_dat, 1, 1, :)) + # @test all(view(blk_dat, 1, 2, :) .== view(blk, 2, 1, :)) # faces of result should be symmetric + # @test all(≈(45. * first(θ)), view(blk_dat, 1, 2, :)) + # diag2 = 2. * dot(view(λ.data, 2, :), view(A11_dat, :, 2, 1)) + # @test all(≈(diag2), view(blk_dat, 2, 2, :)) end end \ No newline at end of file From a2f9e311bab7f1befb390799f6b62ee712f4a0d9 Mon Sep 17 00:00:00 2001 From: Douglas Bates Date: Tue, 3 Feb 2026 15:58:11 -0600 Subject: [PATCH 22/23] Expand exploration of gradient methods --- gradients/Gradient_based_optimization.qmd | 258 +++++++++++++++++++--- 1 file changed, 230 insertions(+), 28 deletions(-) diff --git a/gradients/Gradient_based_optimization.qmd b/gradients/Gradient_based_optimization.qmd index 4a766315f..7bc56fc3c 100644 --- a/gradients/Gradient_based_optimization.qmd +++ b/gradients/Gradient_based_optimization.qmd @@ -50,9 +50,10 @@ Load the packages to be used #| label: load_packages #| output: false using BenchmarkTools +using FiniteDiff using ForwardDiff using MixedModels -using MixedModels: fd_deviance +using MixedModels: fd_deviance, grad_blocks, gradient, gradient! using MixedModelsDatasets: dataset using NLopt using Tables: table @@ -61,8 +62,203 @@ using TypedTables: Table const progress = isinteractive() ``` +## Comparison of MixedModels.gradient! and ForwardDiff.gradient! + +With the [ForwardDiff package](https://github.com/JuliaDiff/ForwardDiff.jl) and the corresponding extension to the [MixedModels package](https://github.com/JuliaStats/MixedModels.jl) we can use forward-differencing automatic differentiation for the objective of a Linear Mixed Model, defined as `fd_deviance`. 
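+
+As a toy illustration of the dual-number arithmetic underlying forward-mode differentiation (a sketch using `ForwardDiff`'s `Dual` type directly, independent of the mixed-model code):
+
+```{julia}
+d = ForwardDiff.Dual(2.0, 1.0)  # the value 2.0 carrying a unit perturbation
+y = d^2 + 3 * d                 # f(x) = x^2 + 3x
+(ForwardDiff.value(y), only(ForwardDiff.partials(y)))  # f(2) = 10 and f'(2) = 7
+```
+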
+
+Forward differencing requires that the objective be evaluated using [dual numbers](https://en.wikipedia.org/wiki/Dual_number), which makes for slow evaluation on large problems, whether large in the number of random effects or in the number of free parameters in the objective.
+
+The number of free parameters affects the speed because the evaluation of the objective must be repeated using dual numbers with respect to each parameter in order to evaluate the gradient.
+Thus if the relative covariance parameter, $\boldsymbol{\theta}$, is of length $k$, the objective is evaluated $k + 1$ times for each combined objective and gradient evaluation.
+
+The number of random effects becomes important in models with multiple grouping factors because the evaluation of the objective and the gradient often requires evaluating the Cholesky factor of a large, dense, symmetric positive-definite matrix [@bates2025mixed].
+The floating-point version of this calculation can use Lapack routines and their implementation in "accelerated BLAS".
+With dual numbers we must fall back on generic code, which can be much slower.
+
+Recently we have coded up a formulation of the gradient of the objective as `MixedModels.gradient!` that uses the same Lapack-based calculations as the evaluation of the objective.
+There may still be room to optimize this calculation, but order-of-magnitude increases in speed are unlikely for this method.
+
 ## Examples {#sec-examples}
 
+We will introduce a number of models from our collection of examples and check the gradient at the starting estimates using finite differences, forward differencing, and our analytic approach.
+The finite-difference method will have lower accuracy than the other two methods.
+
+### A single, simple, scalar random-effects term {#sec-m1}
+
+The first model has a single, scalar random-effects term - about as simple as you can get.
+
+```{julia}
+#| label: m1
+m1 = LinearMixedModel(
+    @formula(yield ~ 1 + (1|batch)),
+    dataset(:dyestuff)
+);
+ff_gr_m1 = FiniteDiff.finite_difference_gradient(m1, m1.optsum.initial)
+```
+
+```{julia}
+fd_gr_m1 = ForwardDiff.gradient(m1, m1.optsum.initial)
+```
+
+```{julia}
+m1_blks = grad_blocks(m1)
+a_gr_m1 = gradient(m1_blks, updateL!(setθ!(m1, m1.optsum.initial)))
+```
+
+We don't bother benchmarking these evaluations because this model is so simple.
+
+### Two non-nested, simple, scalar random-effects terms {#sec-m2}
+
+The second model has two non-nested, scalar random-effects terms.
+It happens that these terms are completely crossed, in the sense that every `sample` appears on every `plate`, but the important characteristic is that they are non-nested.
+
+```{julia}
+m2 = LinearMixedModel(
+    @formula(diameter ~ 1 + (1|plate) + (1|sample)),
+    dataset(:penicillin)
+)
+m2_blks = grad_blocks(m2)
+ff_gr_m2 = FiniteDiff.finite_difference_gradient(m2, m2.optsum.initial)
+```
+
+```{julia}
+fd_gr_m2 = ForwardDiff.gradient(m2, m2.optsum.initial)
+```
+
+```{julia}
+a_gr_m2 = MixedModels.gradient(m2_blks, updateL!(setθ!(m2, m2.optsum.initial)))
+```
+
+```{julia}
+#| label: fd_m2_benchmark
+@benchmark ForwardDiff.gradient!($fd_gr_m2, $m2, $(m2.optsum.initial)) seconds=1
+```
+
+```{julia}
+#| label: a_m2_benchmark
+@benchmark gradient!($a_gr_m2, $m2_blks, $m2) seconds=1
+```
+
+We see that the analytic gradient takes longer to evaluate but uses less memory than does the ForwardDiff gradient. 
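To keep these checks uniform across the models that follow, the three evaluations can be collected by a small helper. This is a sketch only, built from the `grad_blocks`/`gradient` interface shown above; `compare_gradients` is hypothetical and not part of MixedModels.jl:

```{julia}
#| eval: false
# hypothetical convenience function, not part of MixedModels.jl
function compare_gradients(m::LinearMixedModel, θ::Vector)
    ff = FiniteDiff.finite_difference_gradient(m, θ)
    fd = ForwardDiff.gradient(m, θ)
    an = gradient(grad_blocks(m), updateL!(setθ!(m, θ)))
    return (; ff, fd, an, maxdiff=maximum(abs, fd .- an))
end
```

A call like `compare_gradients(m2, m2.optsum.initial)` would then reproduce the three `m2` gradients above as a single named tuple.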
+ +### Two nested, simple, scalar random-effects terms {#sec-m3} + +The third example has two nested scalar random-effects terms + +```{julia} +m3 = LinearMixedModel( + @formula(strength ~ 1 + (1|batch / cask)), + dataset(:pastes) +) +m3_blks = grad_blocks(m3) +ff_gr_m3 = FiniteDiff.finite_difference_gradient(m3, m3.optsum.initial) +``` + +```{julia} +fd_gr_m3 = ForwardDiff.gradient(m3, m3.optsum.initial) +``` + +```{julia} +a_gr_m3 = MixedModels.gradient(m3_blks, updateL!(setθ!(m3, m3.optsum.initial))) +``` + +```{julia} +#| label: fd_m3_benchmark +@benchmark ForwardDiff.gradient!($fd_gr_m3, $m3, $(m3.optsum.initial)) seconds=1 +``` + +```{julia} +#| label: a_m3_benchmark +@benchmark gradient!($a_gr_m3, $m3_blks, $m3) seconds=1 +``` + +This is a curious result because the analytic gradient result is much slower on this model than on `m2`. +It may be related to the structure of the blocks - in particular the $[2,2]$ block of $\mathbf{L}$ which is dense in `m2` and diagonal in `m3`. + +```{julia} +BlockDescription(m2) +``` + +```{julia} +BlockDescription(m3) +``` + +It may be that some of the evaluations of elements in the $[2,2]$ block must keep checking that the element being updated is indeed on the diagonal, producing a lot of indexing overhead. + +### Vector-valued random effects for a single grouping factor + +The fourth example has vector-valued random effects for each level of a single grouping factor, `subj`. + +```{julia} +m4 = LinearMixedModel( + @formula(reaction ~ 1 + days + (1 + days|subj)), + dataset(:sleepstudy) +) +m4_blks = grad_blocks(m4) +ff_gr_m4 = FiniteDiff.finite_difference_gradient(m4, m4.optsum.initial) +``` + +```{julia} +fd_gr_m4 = ForwardDiff.gradient(m4, m4.optsum.initial) +``` + +```{julia} +a_gr_m4 = MixedModels.gradient(m4_blks, updateL!(setθ!(m4, m4.optsum.initial))) +``` + +```{julia} +#| label: fd_m4_benchmark +@benchmark ForwardDiff.gradient!($fd_gr_m4, $m4, $(m4.optsum.initial)) seconds=1 +``` + +```{julia} +#| label: a_m4_benchmark +@benchmark gradient!($a_gr_m4, $m4_blks, $m4) seconds=1 +``` + +### Incompletely crossed grouping factors for scalar random effects {#sec-m5} + + +```{julia} +m5 = LinearMixedModel( + @formula(y ~ 1 + (1|s) + (1|d)), + dataset(:insteval) +) +m5_blks = grad_blocks(m5) +BlockDescription(m5) +``` + +```{julia} +ff_gr_m5 = FiniteDiff.finite_difference_gradient(m5, m5.optsum.initial) +``` + +```{julia} +fd_gr_m5 = ForwardDiff.gradient(m5, m5.optsum.initial) +``` + +With the analytic gradient this evaluation takes a very long time. +We should profile the calculation to see where it is spending its time. + +For now we will skip this evaluation. + +```{julia} +#| eval: false +a_gr_m5 = MixedModels.gradient(m5_blks, updateL!(setθ!(m5, m5.optsum.initial))) +``` + +```{julia} +#| label: fd_m5_benchmark +@benchmark ForwardDiff.gradient!($fd_gr_m5, $m5, $(m5.optsum.initial)) seconds=5 +``` + +```{julia} +#| label: a_m5_benchmark +#| eval: false +@benchmark gradient!($a_gr_m5, $m5_blks, $m5) seconds=1 +``` + +## Optimizing the objective using LD_LBFGS and ForwardDiff {#sec-optimizing} + We create a function to take a `LinearMixedModel` that has been fit and refit it using the `:LD_LBFGS` optimizer applied to an objective function that evaluates the gradient using `ForwardDiff`. 
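The definition of `gr_refit!` is unchanged in this patch and is therefore elided between the hunks below. As orientation, a minimal sketch of such a function is shown here; it assumes NLopt.jl's `Opt` interface, the two-argument `objective!(m, θ)` method in MixedModels.jl, and a `Table` fit log, and the actual definition may differ in detail:

```{julia}
#| eval: false
# a sketch of gr_refit!, not the actual (elided) definition
function gr_refit!(m::LinearMixedModel)
    opt = NLopt.Opt(:LD_LBFGS, length(m.θ))
    opt.lower_bounds = m.optsum.lowerbd
    opt.ftol_rel = 1.0e-8                 # assumed convergence criterion
    fitlog = @NamedTuple{θ::Vector{Float64}, obj::Float64}[]
    opt.min_objective = function (θ, g)
        # NLopt passes an empty gradient vector when only the objective is needed
        isempty(g) || ForwardDiff.gradient!(g, m, θ)
        obj = objective!(m, θ)            # install θ and evaluate the objective
        push!(fitlog, (; θ=copy(θ), obj))
        return obj
    end
    NLopt.optimize(opt, copy(m.optsum.initial))
    return Table(fitlog)
end
```

The refits below rely only on the returned log exposing an `obj` column, as in `last(fitlog.obj)`.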
```{julia}

@@ -101,33 +297,28 @@ end

 Define a model for the `penicillin` data
 
 ```{julia}
-#| label: m1
-pnm01 = fit(
-    MixedModel,
-    @formula(diameter ~ 1 + (1|plate) + (1|sample)),
-    dataset(:penicillin);
-    progress,
-)
-pnm01_obj = objective(pnm01)
-print(pnm01)
+#| label: m02_fit
+refit!(m2; progress)
+m2_obj = objective(m2)
+print(m2)
 ```
 
 for which the optimization summary is
 
 ```{julia}
-pnm01.optsum
+m2.optsum
 ```
 
 and refit the model using ForwardDiff gradient evaluations.
 
 ```{julia}
-fitlog = gr_refit!(pnm01)
+fitlog = gr_refit!(m2)
 ```
 
-The objective at convergence, compared to the derivative-free optimum
+The objective at convergence is slightly less than the optimum from the derivative-free method
 
 ```{julia}
-pnm01_obj - last(fitlog.obj)
+m2_obj - last(fitlog.obj)
 ```
 
 and the last few evaluations are
@@ -139,21 +330,21 @@ last(fitlog, 5)
 ```
 
 ### Pastes {#sec-pastes}
 
 ```{julia}
-psm01 = fit(MixedModel, @formula(strength ~ 1 + (1|batch/cask)), dataset(:pastes); progress)
-psm01_obj = objective(psm01)
-print(psm01)
+refit!(m3; progress)
+m3_obj = objective(m3)
+print(m3)
 ```
 
 ```{julia}
-psm01.optsum
+m3.optsum
 ```
 
 ```{julia}
-fitlog = gr_refit!(psm01)
+fitlog = gr_refit!(m3)
 ```
 
 ```{julia}
-psm01_obj - last(fitlog.obj)
+m3_obj - last(fitlog.obj)
 ```
 
 ```{julia}
@@ -162,6 +353,8 @@ last(fitlog, 5)
 
 ### Insteval {#sec-insteval}
 
+We fit an alternative model to the `insteval` data with simple, scalar random effects for three partially crossed grouping factors.
+
 ```{julia}
 insteval = dataset(:insteval)
 contrasts = Dict(:service => EffectsCoding())
@@ -204,21 +397,21 @@ This is an example where the number of evaluations to convergence is lower when using the gradient.
 
 ### Sleepstudy {#sec-sleepstudy}
 
 ```{julia}
-slm01 = fit(MixedModel, @formula(reaction ~ 1 + days + (1 + days|subj)), dataset(:sleepstudy); progress)
-slm01_obj = objective(slm01)
-print(slm01)
+refit!(m4; progress)
+m4_obj = objective(m4)
+print(m4)
 ```
 
 ```{julia}
-slm01.optsum
+m4.optsum
 ```
 
 ```{julia}
-fitlog = gr_refit!(slm01)
+fitlog = gr_refit!(m4)
 ```
 
 ```{julia}
-slm01_obj - last(fitlog.obj)
+m4_obj - last(fitlog.obj)
 ```
 
 ```{julia}
@@ -227,7 +420,10 @@ last(fitlog, 5)
 ```
 
 ### Kronmueller-Barr 2007 {#sec-kb07}
 
+This model has 72 free parameters and takes a very long time to fit, so we suppress evaluation of these blocks for now.
+
 ```{julia}
+#| eval: false
 # this model is very overparameterized, but it's a test example
 kbm01 = fit(
     MixedModel,
@@ -240,37 +436,43 @@ print(kbm01)
 ```
 
 ```{julia}
+#| eval: false
 kbm01.optsum
 ```
 
 Several of the parameters on the diagonal of $\boldsymbol{\Lambda}$ are close to zero at convergence and are replaced by zero in the returned parameter vector.
 
 ```{julia}
+#| eval: false
 findall(iszero, kbm01.θ)
 ```
 
 Refitting with the gradient takes a very long time because ForwardDiff is poorly suited to optimization problems with many parameters.
 
 ```{julia}
+#| eval: false
 fitlog = gr_refit!(kbm01)
 ```
 
 ```{julia}
+#| eval: false
 kbm01_obj - last(fitlog.obj)
 ```
 
 ```{julia}
+#| eval: false
 last(fitlog, 5)
 ```
 
 ```{julia}
+#| eval: false
 findall(x -> abs(x) < 1.0e-5, kbm01.θ)
 ```
 
 ## Conclusions {#sec-conclusions}
 
-Generally the gradient-based optimizers converge in fewer evaluations than the derivative-free optimizers (`psm01` in @sec-pastes is an exception).
-Although the `ftol_rel` criterion is looser for the gradient-based optimizer it usually achieves a lower optimum value, as shown by the differences like `pnm01_obj - last(fitlog.obj)` being positive. 
+Generally the gradient-based optimizers converge in fewer evaluations than the derivative-free optimizers (`m3` in @sec-pastes is an exception).
+Although the `ftol_rel` criterion is looser for the gradient-based optimizer, it usually achieves a lower optimum value, as shown by the differences like `m2_obj - last(fitlog.obj)` being positive.
 
 I think the most interesting result is for the `insteval` data where the three-parameter optimization takes 81 function evaluations for `LN_NEWUOA` but 34 evaluations for `LD_LBFGS`.
 However, the ForwardDiff gradient evaluation takes much longer because it allocates so much memory (and it may be using a non-BLAS Cholesky factorization of a $1128\times1128$ symmetric matrix).

From ca8d1a95618bacf8c845436331e3b876d2538941 Mon Sep 17 00:00:00 2001
From: Douglas Bates <dmbates@gmail.com>
Date: Mon, 23 Feb 2026 13:44:27 -0600
Subject: [PATCH 23/23] Baseline code before correcting gradient! evaluation

---
 gradients/GradientEvaluation.qmd | 68 +++++++---------
 gradients/Gradient_based_optimization.qmd | 3 -
 src/gradient.jl | 99 ++++++++++++++++++++---
 test/grad.jl | 59 +++++++++-----
 4 files changed, 157 insertions(+), 72 deletions(-)

diff --git a/gradients/GradientEvaluation.qmd b/gradients/GradientEvaluation.qmd
index e6322fe0e..004cc4c1f 100644
--- a/gradients/GradientEvaluation.qmd
+++ b/gradients/GradientEvaluation.qmd
@@ -217,9 +217,10 @@ Load the packages to be used
 using BenchmarkTools
 using CairoMakie
 using FiniteDiff
+using ForwardDiff
 using LinearAlgebra
 using MixedModels
-using MixedModels: eval_grad_p!, grad_blocks, initialize_blocks!
+using MixedModels: eval_grad_p!, grad_blocks, initialize_blocks!, gradient!, blks2dense
 using MixedModelsDatasets: dataset
 using TypedTables: Table

@@ -241,7 +242,6 @@ A mixed-effects model for these data includes an overall "intercept" term (whose
 ```{julia}
 #| label: dyestuff_model
 m01 = fit(MixedModel, @formula(yield ~ 1 + (1|batch)), dyestuff; progress)
-θ = m01.θ # retain a copy of the estimate of θ
 print(m01)
 ```

@@ -253,7 +253,7 @@
 #| label: fig-obj_graph
 #| fig-cap: "Graph of the objective for model m01 as a function of θ₁. The light blue horizontal line is at the minimum of the objective. The vertical line is at the parameter estimate."
 #| code-fold: true
-let f = Figure()
+let f = Figure(), θ = m01.optsum.final
     ax = Axis(f[1,1], xlabel="θ₁", ylabel="objective")
     lines!(ax, -0.25..1.5, objective!(m01))
     hlines!(objective(updateL!(setθ!(m01, θ))); alpha=0.4)
@@ -270,7 +270,7 @@ At the maximum likelihood estimate (i.e. the minimizer of the objective), the va

 ```{julia}
 #| label: dyestuff_theta
-only(θ)
+only(m01.optsum.final)
 ```

 Here the derivative should be zero (in practice, close to zero).
@@ -318,7 +318,7 @@ blks = initialize_blocks!(grad_blocks(m01), m01, 1, 1, 1, 1)
 ```

 In practice we evaluate $\mathbf{L}^{-1}\dot{\boldsymbol\Omega}\mathbf{L}^{-\top}$ in the blocked format but here, for illustration, we create $\dot{\boldsymbol{\Omega}}$ in full and use a sparse form of $\mathbf{L}$ to verify the results.

 ```{julia}
-Ω_dot = hvcat(2, blks[1,1], blks[1,2], blks[2,1], blks[2,2])
+Ω_dot = blks2dense(blks)
 ```

 ```{julia}
@@ -357,6 +357,12 @@ A finite-difference approximation to this gradient (it is just a scalar derivati
 FiniteDiff.finite_difference_gradient(m01, ones(1))
 ```

+or using automatic differentiation with [ForwardDiff.jl](https://github.com/JuliaDiff/ForwardDiff.jl) and the `ForwardDiff` extension to [MixedModels.jl](https://github.com/JuliaStats/MixedModels.jl). 
+ +```{julia} +g01 = ForwardDiff.gradient!(zeros(1), m01) +``` + The gradient evaluation from the blocked representation is ```{julia} @@ -366,10 +372,10 @@ sum(first(blks).diag) + length(m01.y) * last(last(blks)) If we repeat these steps at the parameter estimate we have ```{julia} -updateL!(setθ!(m01, θ)) # reset the value of θ in the model +updateL!(setθ!(m01, m01.optsum.final)) # reset the value of θ in the model L = LowerTriangular(sparseL(m01; full=true)) initialize_blocks!(blks, m01, 1, 1, 1, 1) -M = rdiv!(ldiv!(L, hvcat(2, blks[1,1], blks[1,2], blks[2,1], blks[2,2])), L') +M = rdiv!(ldiv!(L, blks2dense(blks)), L') ``` with the derivative being evaluated as @@ -391,9 +397,12 @@ sum(first(blks).diag) + length(m01.y) * last(last(blks)) Or, using a finite-difference approximation ```{julia} -FiniteDiff.finite_difference_gradient(m01, θ) +FiniteDiff.finite_difference_gradient(m01, m01.optsum.final) ``` +```{julia} +ForwardDiff.gradient!(g01, m01) +``` ### Sleepstudy - a single vector-valued random-effects term @@ -416,7 +425,7 @@ L = LowerTriangular(sparseL(m03; full=true)) ``` ```{julia} -M = ldiv!(L, rdiv!(hvcat(2, blks[1,1], blks[1,2], blks[2,1], blks[2,2]), L')) +M = ldiv!(L, rdiv!(blks2dense(blks), L')) ``` ```{julia} @@ -427,47 +436,33 @@ sum(M[i,i] for i in 1:36) + length(m03.y) * last(M) FiniteDiff.finite_difference_gradient(m03, m03.optsum.initial) ``` -For the blocked evaluation - ```{julia} -eval_grad_p!(blks, m03, 1) +g03 = ForwardDiff.gradient!(zeros(3), updateL!(setθ!(m03, m03.optsum.initial))) ``` -the value of the gradient component is +The blocked evaluation ```{julia} -dat = first(blks).data -sum(k -> (dat[1, 1, k] + dat[2, 2, k]), axes(dat, 3)) + length(m03.y) * last(last(blks)) +gradient!(g03, blks, m03) ``` -For the second component of the gradient - -```{julia} -eval_grad_p!(blks, m03, 2) -view(first(blks), 1:4, 1:4) -``` +gives the same value of the gradient at the initial parameter values. 
-```{julia} -last(blks) -``` +However, at the converged parameter estimates, the gradient values are different ```{julia} -sum(k -> (dat[1, 1, k] + dat[2, 2, k]), axes(dat, 3)) + length(m03.y) * last(last(blks)) +ForwardDiff.gradient!(g03, updateL!(setθ!(m03, m03.optsum.final))) ``` -And, for the final component, - ```{julia} -eval_grad_p!(blks, m03, 3) -view(first(blks), 1:4, 1:4) +blks2dense(initialize_blocks!(blks, m03, 1, 1, 1, 2)) ``` ```{julia} -last(blks) +eval_grad_p!(blks, m03, 1) ``` - ```{julia} -sum(k -> (dat[1, 1, k] + dat[2, 2, k]), axes(dat, 3)) + length(m03.y) * last(last(blks)) +gradient!(g03, blks, m03) ``` ### Penicillin - two completely crossed scalar random-effects terms @@ -526,7 +521,8 @@ For the first gradient component, at the initial values ```{julia} updateL!(setθ!(m02, m02.optsum.initial)) -blks = eval_grad_p!(grad_blocks(m02), m02, 1) +blks = grad_blocks(m02) +eval_grad_p!(blks, m02, 1) ``` the value of the gradient component is @@ -558,10 +554,8 @@ FiniteDiff.finite_difference_gradient(m02, m02.optsum.initial) In the full matrix representation ```{julia} -initialize_blocks!(blks, updateL!(setθ!(m02, m02.optsum.initial)), 2, 1, 1, 1) -Ω_dot = let b = blks - hvcat(3, b[1,1], b[1,2], b[1,3], b[2,1], b[2,2], b[2,3], b[3,1], b[3,2], b[3,3]) -end +initialize_blocks!(blks, updateL!(setθ!(m02, m02.optsum.final)), 2, 1, 1, 1) +Ω_dot = blks2dense(blks) ``` ```{julia} diff --git a/gradients/Gradient_based_optimization.qmd b/gradients/Gradient_based_optimization.qmd index 7bc56fc3c..edb760af8 100644 --- a/gradients/Gradient_based_optimization.qmd +++ b/gradients/Gradient_based_optimization.qmd @@ -423,7 +423,6 @@ last(fitlog, 5) This model has a 72 free parameters and takes a very long time to fit, so we suppress evaluation of these blocks for now. ```{julia} -#| eval: false # this model is very overparameterized, but it's a test example kbm01 = fit( MixedModel, @@ -436,14 +435,12 @@ print(kbm01) ``` ```{julia} -#| eval: false kbm01.optsum ``` Several of the parameters on the diagonal of $\boldsymbol{\Lambda}$ are close to zero at convergence and are replaced by zero in the returned parameter vector ```{julia} -#| eval: false findall(iszero, kbm01.θ) ``` diff --git a/src/gradient.jl b/src/gradient.jl index 7d1e28b46..e70456451 100644 --- a/src/gradient.jl +++ b/src/gradient.jl @@ -15,18 +15,16 @@ function Omega_dot_diag_block!( j::Integer, ) where {T} (; A, reterms) = m - isone(i) && isone(j) || + if !(isone(i) && isone(j)) throw(ArgumentError("parameter $p should be from a scalar r.e. 
term")) + end # It is common for 'b' to be one as well but nested models can result in diagonal blk for b > 1 blk_diag = blk.diag - A_diag = A[kp1choose2(b)].diag # will throw an error if the A[b,b] block is not Diagonal + A_diag = A[kp1choose2(b)].diag # will throw an error if the A[b,b] block is not Diagonal if length(blk_diag) ≠ length(A_diag) throw(DimensionMismatch("A_diag and blk_diag have different lengths")) end - λ = only(reterms[b].λ) # will throw an error if reterms[b] is not of size (1,1) - for k in eachindex(blk_diag, A_diag) - blk_diag[k] = T(2) * λ * A_diag[k] - end + blk_diag .= (T(2) * only(reterms[b].λ)) .* A_diag return blk end @@ -66,7 +64,7 @@ function Omega_dot_diag_block!( Ablk = A[kp1choose2(b)] if !isa(Ablk, UniformBlockDiagonal{T}) throw( - ArgumentError( + ArgumentError( # this error message cannot be evaluated because p is no longer passed "parmap[p] = $(parmap[p]) but A[$(kp1choose2(b))] is not UniformBlockDiagonal", ), ) @@ -300,19 +298,22 @@ function eval_grad_p!( (b, i, j) = parmap[p] # block, row and column for parameter p k = size(reterms[b].λ, 1) initialize_blocks!(blks, m, b, i, j, k) +# @info b, i, j, k for kk in axes(blks, 2) # ldiv!(LowerTriangular(L), blks) # if jj ≥ b # maybe hold off on this at the expense of some multiplications by zero L11 = L[block(1, 1)] +# @info typeof(L11) isa(L11, Diagonal) || (L11 = LowerTriangular(L11)) C1 = ldiv!(L11, blks[1, kk]) +# @info typeof(C1) for ii in axes(blks, 1)[2:end] - mul!(blks[ii, kk], L[block(ii, 1)], C1, -one(T), one(T)) + mm_mul!(blks[ii, kk], L[block(ii, 1)], C1, -one(T), one(T)) end # end for jj in axes(blks, 1)[2:end] Cj = ldiv!(LowerTriangular(L[block(jj, jj)]), blks[jj, kk]) for ii in (jj + 1):lastindex(blks, 1) - mul!(blks[ii, kk], L[block(ii, jj)], Cj, -one(T), one(T)) + mm_mul!(blks[ii, kk], L[block(ii, jj)], Cj, -one(T), one(T)) end end end @@ -340,7 +341,9 @@ function eval_grad_p!( for kk in 1:(jj - 1) mul!(blkij, blks[ii, kk], transpose(L[block(jj, kk)]), -one(T), one(T)) end - rdiv!(blkij, transpose(LowerTriangular(L[block(jj, jj)]))) + Ljj = L[block(jj, jj)] + isa(Ljj, Diagonal) || (Ljj = LowerTriangular(Ljj)) + rdiv!(blkij, transpose(Ljj)) end end # for i in axes(A,1) @@ -363,7 +366,7 @@ Overwrite `g` with the gradient of the ML objective of the model `m` at its curr """ function gradient!(g::Vector{T}, blks::Matrix{AbstractMatrix{T}}, m::LinearMixedModel{T}) where {T} if length(g) ≠ length(m.parmap) - throw(DimensionMismatch("length(g) = $(length(g)) should be $(length(m.parmap)) for this model")) + throw(DimensionMismatch("length(g) = $(length(g)) should be $(length(m.parmap))")) end for p in axes(g, 1) g[p] = eval_grad_p!(blks, m, p) @@ -374,3 +377,77 @@ end function gradient(blks::Matrix{AbstractMatrix{T}}, m::LinearMixedModel{T}) where {T} return gradient!(similar(m.parmap, T), blks, m) end + +function mm_mul!( # exploit a fast path for mul! 
when C and A have the same sparsity pattern + C::SparseMatrixCSC{Tv,Ti}, + A::SparseMatrixCSC{Tv,Ti}, + B::Diagonal{Tv,Vector{Tv}}, + α::Number, + β::Number +) where {Tv, Ti} + # check if fast path exists + Bdiag = B.diag + if C.m ≠ A.m || C.n ≠ A.n || C.n ≠ length(B.diag) + throw(DimensionMismatch("dimensions not compatible for mul!")) + end + if C.colptr == A.colptr && C.rowval == A.rowval + Cnz = C.nzval + if !isone(β) + Cnz .*= β + end + Anz = A.nzval + for (j, bd) in enumerate(Bdiag) + nzr = nzrange(A, j) + Cnz[nzr] .+= Anz[nzr] * bd * α + end + return C + end + LinearAlgebra.mul!(C, A, B, α, β) +end + +function mm_mul!(C::AbstractMatrix{T}, A::AbstractMatrix{T}, B::AbstractMatrix{T}, α::Number, β::Number) where {T} + mul!(C, A, B, α, β) +end + +function Lldiv!(m::LinearMixedModel{T}, blks::Matrix{AbstractMatrix{T}}) where {T} + L = m.L + for kk in axes(blks, 2) # ldiv!(LowerTriangular(L), blks) + L11 = first(L) + isa(L11, Diagonal) || (L11 = LowerTriangular(L11)) + C1 = ldiv!(L11, blks[1, kk]) + for ii in axes(blks, 1)[2:end] + mm_mul!(blks[ii, kk], L[block(ii, 1)], C1, -one(T), one(T)) + end + for jj in axes(blks, 1)[2:end] + Cj = ldiv!(LowerTriangular(L[block(jj, jj)]), blks[jj, kk]) + for ii in (jj + 1):lastindex(blks, 1) + mm_mul!(blks[ii, kk], L[block(ii, jj)], Cj, -one(T), one(T)) + end + end + end + return blks +end + +function blks2dense(blks) + ncol = size(blks, 2) + if ncol == 2 + return hvcat(2, blks[1, 1], blks[1, 2], blks[2, 1], blks[2, 2]) + elseif ncol == 3 + return hvcat( + 3, + blks[1, 1], blks[1, 2], blks[1, 3], + blks[2, 1], blks[2, 2], blks[2, 3], + blks[3, 1], blks[3, 2], blks[3, 3], + ) + elseif ncol == 4 + return hvcat( + 4, + blks[1, 1], blks[1, 2], blks[1, 3], blks[1, 4], + blks[2, 1], blks[2, 2], blks[2, 3], blks[2, 4], + blks[3, 1], blks[3, 2], blks[3, 3], blks[3, 4], + blks[4, 1], blks[4, 2], blks[4, 3], blks[4, 4], + ) + else + throw(ArgumentError("size(blks, 2) == $ncol is too large")) + end +end diff --git a/test/grad.jl b/test/grad.jl index eb2573ab3..d104ae6dc 100644 --- a/test/grad.jl +++ b/test/grad.jl @@ -1,8 +1,9 @@ # using FiniteDiff using LinearAlgebra +using ForwardDiff using MixedModels using Test -using MixedModels: grad_blocks, eval_grad_p!, initialize_blocks! +using MixedModels: grad_blocks, eval_grad_p!, initialize_blocks!, gradient! include("modelcache.jl") @@ -10,50 +11,66 @@ include("modelcache.jl") @testset "single_scalar" begin fm1 = only(models(:dyestuff)) θ = fm1.θ - blks = initialize_blocks!(grad_blocks(fm1), fm1, 1, 1, 1, 1) - dblk = blks[1,1].diag + blks1 = initialize_blocks!(grad_blocks(fm1), fm1, 1, 1, 1, 1) + dblk = blks1[1,1].diag @test all(≈(10. * only(θ)), dblk) ldiv_blk = ldiv!(first(fm1.L), dblk) @test all(≈(10. * only(θ) / first(first(fm1.L))), dblk) + g1 = gradient!(similar(θ), blks1, fm1) + @test abs(only(g1)) < 1.e-6 updateL!(setθ!(fm1, θ)) # restore the estimated parameter values fm2 = first(models(:pastes)) θ = fm2.θ - blks = initialize_blocks!(grad_blocks(fm2), fm2, 1, 1, 1, 1) - @test all(≈(4. * only(θ)), blks[1, 1].diag) - initialize_blocks!(blks, updateL!(setθ!(fm2, ones(1))), 1, 1, 1, 1) - @test all(==(4.), blks[1, 1].diag) + blks2 = initialize_blocks!(grad_blocks(fm2), fm2, 1, 1, 1, 1) + @test all(≈(4. * only(θ)), blks2[1, 1].diag) + g2 = gradient!(similar(θ), blks2, fm2) + @test abs(only(g2)) < 9.e-6 + initialize_blocks!(blks2, updateL!(setθ!(fm2, ones(1))), 1, 1, 1, 1) + @test all(==(4.), blks2[1, 1].diag) updateL!(setθ!(fm2, θ)) fm3 = last(models(:pastes)) # first of two nested, scalar r.e. 
terms θ = fm3.θ - blks = initialize_blocks!(grad_blocks(fm3), fm3, 1, 1, 1, 1) - @test all(≈(4. * first(θ)), blks[1,1].diag) + blks3 = initialize_blocks!(grad_blocks(fm3), fm3, 1, 1, 1, 1) + @test all(≈(4. * first(θ)), blks3[1,1].diag) + g3_fd = ForwardDiff.gradient!(similar(θ), fm3) + @test norm(g3_fd) < 9.e-5 + g3_an = gradient!(similar(θ), blks3, fm3) fm4 = only(models(:penicillin)) - blks = initialize_blocks!(grad_blocks(fm4), fm4, 1, 1, 1, 1) - @test all(≈(12. * first(fm4.θ)), blks[1,1].diag) + θ4 = fm4.θ + blks4 = initialize_blocks!(grad_blocks(fm4), fm4, 1, 1, 1, 1) + @test all(≈(12. * first(fm4.θ)), blks4[1,1].diag) + g4_fd = ForwardDiff.gradient!(similar(θ4), fm4) + g4_ad = gradient!(similar(g4_fd), blks4, fm4) # not properly evaluated yet fm5 = first(models(:sleepstudy)) - blks = initialize_blocks!(grad_blocks(fm5), fm5, 1, 1, 1, 1) - @test all(≈(20. * only(fm5.θ)), blks[1, 1].diag) + θ5 = fm5.θ + blks5 = initialize_blocks!(grad_blocks(fm5), fm5, 1, 1, 1, 1) + @test all(≈(20. * only(θ5)), blks5[1, 1].diag) + g5_fd = ForwardDiff.gradient!(similar(θ5), fm5) + @test abs(only(g5_fd)) < 5.e-7 + g5_ad = gradient!(similar(θ5), blks5, fm5) + @test abs(only(g5_ad)) < 5.e-7 + end @testset "single_vector" begin fm6 = last(models(:sleepstudy)) - λ = only(fm6.reterms).λ - θ = fm6.θ - blks = initialize_blocks!(grad_blocks(fm6), fm6, 1, 1, 1, 2) - blk_dat = blks[1,1].data + λ6 = only(fm6.reterms).λ + θ6 = fm6.θ + blks6 = initialize_blocks!(grad_blocks(fm6), fm6, 1, 1, 1, 2) + blk_dat = blks6[1,1].data A11_dat = first(fm6.A).data - @test all(≈(20. * first(θ)), view(blk_dat, 1, 1, :)) + @test all(≈(20. * first(θ6)), view(blk_dat, 1, 1, :)) @test all(iszero, view(blk_dat, 2, 2, :)) @test all(view(blk_dat, 1, 2, :) .== view(blk_dat, 2, 1, :)) - odiag = dot(view(λ, 2, :), view(A11_dat, :, 1, 1)) + odiag = dot(view(λ6, 2, :), view(A11_dat, :, 1, 1)) @test all(≈(odiag), view(blk_dat, 1, 2, :)) - ldiv!(LowerTriangular(first(fm6.L)), blks[1,1]) + ldiv!(LowerTriangular(first(fm6.L)), blks6[1,1]) - initialize_blocks!(blks, fm6, 1, 2, 2, 2) + initialize_blocks!(blks6, fm6, 1, 2, 2, 2) @test all(iszero, view(blk_dat, 1, 1, :)) @test all(view(blk_dat, 1, 2, :) .== view(blk_dat, 2, 1, :)) # result should be symmetric # @test all(==(10. * first(θ)), view(blk_dat, 1, 2, :))