diff --git a/docs/src/contrasts.md b/docs/src/contrasts.md index 98e1b69c..6006bc77 100644 --- a/docs/src/contrasts.md +++ b/docs/src/contrasts.md @@ -22,7 +22,7 @@ The default contrast coding system is `DummyCoding`. To override this, use the `contrasts` argument when constructing a `ModelFrame`: ```julia -mf = ModelFrame(y ~ 1 + x, df, contrasts = Dict(:x => EffectsCoding())) +mf = ModelFrame(@formula(y ~ 1 + x), df, contrasts = Dict(:x => EffectsCoding())) ``` To change the contrast coding for one or more variables in place, use diff --git a/docs/src/formula.md b/docs/src/formula.md index 211a67eb..98727b24 100644 --- a/docs/src/formula.md +++ b/docs/src/formula.md @@ -21,13 +21,16 @@ goal is to support any tabular data format that adheres to a minimal API, ## The `Formula` type The basic conceptual tool for this is the `Formula`, which has a left side and a -right side, separated by `~`: +right side, separated by `~`. Formulas are constructed using the `@formula` macro: ```jldoctest -julia> y ~ 1 + a +julia> @formula(y ~ 1 + a) Formula: y ~ 1 + a ``` +Note that the `@formula` macro **must** be called with parentheses to ensure that +the formula is parsed properly. + The left side of a formula conventionally represents *dependent* variables, and the right side *independent* variables (or regressors). *Terms* are separated by `+`. Basic terms are the integers `1` or `0`—evaluated as the presence or @@ -43,7 +46,7 @@ It's often convenient to include main effects and interactions for a number of variables. 
The `*` operator does this, expanding in the following way: ```jldoctest -julia> Formula(StatsModels.Terms(y ~ 1 + a*b)) +julia> Formula(StatsModels.Terms(@formula(y ~ 1 + a*b))) Formula: y ~ 1 + a + b + a & b ``` @@ -54,7 +57,7 @@ This applies to higher-order interactions, too: `a*b*c` expands to the main effects, all two-way interactions, and the three way interaction `a&b&c`: ```jldoctest -julia> Formula(StatsModels.Terms(y ~ 1 + a*b*c)) +julia> Formula(StatsModels.Terms(@formula(y ~ 1 + a*b*c))) Formula: y ~ 1 + a + b + c + a & b + a & c + b & c + &(a,b,c) ``` @@ -62,13 +65,20 @@ Both the `*` and the `&` operators act like multiplication, and are distributive over addition: ```jldoctest -julia> Formula(StatsModels.Terms(y ~ 1 + (a+b) & c)) +julia> Formula(StatsModels.Terms(@formula(y ~ 1 + (a+b) & c))) Formula: y ~ 1 + c & a + c & b -julia> Formula(StatsModels.Terms(y ~ 1 + (a+b) * c)) +julia> Formula(StatsModels.Terms(@formula(y ~ 1 + (a+b) * c))) Formula: y ~ 1 + a + b + c + c & a + c & b ``` +You may be wondering why formulas in Julia require a macro, while in R they appear "bare." +R supports nonstandard evaluation, allowing the formula to remain an unevaluated object +while its terms are parsed out. Julia uses a much more standard evaluation mechanism, +making this impossible using normal expressions. However, unlike R, Julia provides macros to +explicitly indicate when code itself will be manipulated before it's evaluated. By constructing +a formula using a macro, we're able to provide convenient, R-like syntax and semantics. 
+ ## The `ModelFrame` and `ModelMatrix` types The main use of `Formula`s is for fitting statistical models based on tabular diff --git a/src/StatsModels.jl b/src/StatsModels.jl index c62f9d7d..686b0cb4 100644 --- a/src/StatsModels.jl +++ b/src/StatsModels.jl @@ -9,7 +9,7 @@ using NullableArrays using CategoricalArrays -export @~, +export @formula, Formula, ModelFrame, ModelMatrix, diff --git a/src/formula.jl b/src/formula.jl index 813bd8f6..b25b577c 100644 --- a/src/formula.jl +++ b/src/formula.jl @@ -1,6 +1,6 @@ # Formulas for representing and working with linear-model-type expressions -# Original by Harlan D. Harris. Later modifications by John Myles White -# and Douglas M. Bates. +# Original by Harlan D. Harris. Later modifications by John Myles White, +# Douglas M. Bates, and other contributors. ## Formulas are written as expressions and parsed by the Julia parser. ## For example :(y ~ a + b + log(c)) @@ -12,16 +12,19 @@ ## The rhs of a formula can be 1 type Formula - lhs::@compat(Union{Symbol, Expr, Void}) - rhs::@compat(Union{Symbol, Expr, Integer}) + lhs::Union{Symbol, Expr, Void} + rhs::Union{Symbol, Expr, Integer} end -macro ~(lhs, rhs) - ex = Expr(:call, - :Formula, - Base.Meta.quot(lhs), - Base.Meta.quot(rhs)) - return ex +macro formula(ex) + if (ex.head === :macrocall && ex.args[1] === Symbol("@~")) || (ex.head === :call && ex.args[1] === :(~)) + length(ex.args) == 3 || error("malformed expression in formula") + lhs = Base.Meta.quot(ex.args[2]) + rhs = Base.Meta.quot(ex.args[3]) + else + error("expected formula separator ~, got $(ex.head)") + end + return Expr(:call, :Formula, lhs, rhs) end """ @@ -46,9 +49,7 @@ end Base.:(==)(t1::Terms, t2::Terms) = all(getfield(t1, f)==getfield(t2, f) for f in fieldnames(t1)) function Base.show(io::IO, f::Formula) - print(io, - string("Formula: ", - f.lhs == nothing ? "" : f.lhs, " ~ ", f.rhs)) + print(io, "Formula: ", f.lhs === nothing ? 
"" : f.lhs, " ~ ", f.rhs) end # special operators in formulas diff --git a/test/formula.jl b/test/formula.jl index e67c1aa1..6455ff5b 100644 --- a/test/formula.jl +++ b/test/formula.jl @@ -12,8 +12,7 @@ using Compat # - support more transformations with I()? ## Formula parsing -import StatsModels: @~, Formula -import StatsModels.Terms +import StatsModels: @formula, Formula, Terms ## totally empty t = Terms(Formula(nothing, 0)) @@ -23,87 +22,90 @@ t = Terms(Formula(nothing, 0)) @test t.eterms == [] ## empty RHS -t = Terms(y ~ 0) +t = Terms(@formula(y ~ 0)) @test t.intercept == false @test t.terms == [] @test t.eterms == [:y] -t = Terms(y ~ -1) +t = Terms(@formula(y ~ -1)) @test t.intercept == false @test t.terms == [] ## intercept-only -t = Terms(y ~ 1) +t = Terms(@formula(y ~ 1)) @test t.response == true @test t.intercept == true @test t.terms == [] @test t.eterms == [:y] ## terms add -t = Terms(y ~ 1 + x1 + x2) +t = Terms(@formula(y ~ 1 + x1 + x2)) @test t.intercept == true @test t.terms == [:x1, :x2] @test t.eterms == [:y, :x1, :x2] ## implicit intercept behavior: -t = Terms(y ~ x1 + x2) +t = Terms(@formula(y ~ x1 + x2)) @test t.intercept == true @test t.terms == [:x1, :x2] @test t.eterms == [:y, :x1, :x2] ## no intercept -t = Terms(y ~ 0 + x1 + x2) +t = Terms(@formula(y ~ 0 + x1 + x2)) @test t.intercept == false @test t.terms == [:x1, :x2] -@test t == Terms(y ~ -1 + x1 + x2) == Terms(y ~ x1 - 1 + x2) == Terms(y ~ x1 + x2 -1) +@test t == Terms(@formula(y ~ -1 + x1 + x2)) == Terms(@formula(y ~ x1 - 1 + x2)) == Terms(@formula(y ~ x1 + x2 -1)) ## can't subtract terms other than 1 -@test_throws ErrorException Terms(y ~ x1 - x2) +@test_throws ErrorException Terms(@formula(y ~ x1 - x2)) -t = Terms(y ~ x1 & x2) +t = Terms(@formula(y ~ x1 & x2)) @test t.terms == [:(x1 & x2)] @test t.eterms == [:y, :x1, :x2] ## `*` expansion -t = Terms(y ~ x1 * x2) +t = Terms(@formula(y ~ x1 * x2)) @test t.terms == [:x1, :x2, :(x1 & x2)] @test t.eterms == [:y, :x1, :x2] ## associative 
rule: ## + -t = Terms(y ~ x1 + x2 + x3) +t = Terms(@formula(y ~ x1 + x2 + x3)) @test t.terms == [:x1, :x2, :x3] ## & -t = Terms(y ~ x1 & x2 & x3) +t = Terms(@formula(y ~ x1 & x2 & x3)) @test t.terms == [:((&)(x1, x2, x3))] @test t.eterms == [:y, :x1, :x2, :x3] ## distributive property of + and & -t = Terms(y ~ x1 & (x2 + x3)) +t = Terms(@formula(y ~ x1 & (x2 + x3))) @test t.terms == [:(x1&x2), :(x1&x3)] ## FAILS: ordering of expanded interaction terms is wrong ## (only has an observable effect when both terms are categorical and ## produce multiple model matrix columns that are multiplied together...) ## -## t = Terms(y ~ (x2 + x3) & x1) +## t = Terms(@formula(y ~ (x2 + x3) & x1)) ## @test t.terms == [:(x2&x1), :(x3&x1)] ## three-way * -t = Terms(y ~ x1 * x2 * x3) +t = Terms(@formula(y ~ x1 * x2 * x3)) @test t.terms == [:x1, :x2, :x3, :(x1&x2), :(x1&x3), :(x2&x3), :((&)(x1, x2, x3))] @test t.eterms == [:y, :x1, :x2, :x3] ## Interactions with `1` reduce to main effect. All fail at the moment. 
-## t = Terms(y ~ 1 & x1) +## t = Terms(@formula(y ~ 1 & x1)) ## @test t.terms == [:x1] # == [:(1 & x1)] ## @test t.eterms == [:y, :x1] -## t = Terms(y ~ (1 + x1) & x2) +## t = Terms(@formula(y ~ (1 + x1) & x2)) ## @test t.terms == [:x2, :(x1&x2)] # == [:(1 & x1)] ## @test t.eterms == [:y, :x1, :x2] +# Incorrect formula separator +@test_throws ErrorException eval(:(@formula(y => x + 1))) + end diff --git a/test/modelmatrix.jl b/test/modelmatrix.jl index 7a970d9d..e59b5ee2 100644 --- a/test/modelmatrix.jl +++ b/test/modelmatrix.jl @@ -6,7 +6,7 @@ using DataFrames using Compat # for testing while DataFrames still exports these: -import StatsModels: @~, Formula, ModelMatrix, ModelFrame, DummyCoding, EffectsCoding, HelmertCoding, ContrastsCoding, setcontrasts!, coefnames +import StatsModels: @formula, Formula, ModelMatrix, ModelFrame, DummyCoding, EffectsCoding, HelmertCoding, ContrastsCoding, setcontrasts!, coefnames ## Tests for constructing ModelFrame and ModelMatrix @@ -24,7 +24,7 @@ x1 = [5.:8;] x2 = [9.:12;] x3 = [13.:16;] x4 = [17.:20;] -f = y ~ x1 + x2 +f = @formula(y ~ x1 + x2) mf = ModelFrame(f, d) ## @test mm.response_colnames == ["y"] # nope: no response_colnames @test coefnames(mf) == ["(Intercept)","x1","x2"] @@ -41,7 +41,7 @@ smm = ModelMatrix{sparsetype}(mf) #test_group("expanding a nominal array into a design matrix of indicators for each dummy variable") d[:x1p] = NullableCategoricalArray(d[:x1]) -mf = ModelFrame(y ~ x1p, d) +mf = ModelFrame(@formula(y ~ x1p), d) mm = ModelMatrix(mf) @test mm.m[:,2] == [0, 1., 0, 0] @@ -104,16 +104,16 @@ mm = ModelMatrix(mf) ## @test r[:,1:3] == expand(CategoricalArray(x1), "x1", DataFrame()) ## @test r[:,4:6] == expand(CategoricalArray(x2), "x2", DataFrame()) -#test_group("Creating a model matrix using full formulas: y ~ x1 + x2, etc") +#test_group("Creating a model matrix using full formulas: y ~ x1 + x2, etc") df = deepcopy(d) -f = y ~ x1 & x2 +f = @formula(y ~ x1 & x2) mf = ModelFrame(f, df) mm = ModelMatrix(mf) 
@test mm.m == [ones(4) x1.*x2] @test mm.m == ModelMatrix{sparsetype}(mf).m -f = y ~ x1 * x2 +f = @formula(y ~ x1 * x2) mf = ModelFrame(f, df) mm = ModelMatrix(mf) @test mm.m == [ones(4) x1 x2 x1.*x2] @@ -121,7 +121,7 @@ mm = ModelMatrix(mf) df[:x1] = CategoricalArray(x1) x1e = [[0, 1, 0, 0] [0, 0, 1, 0] [0, 0, 0, 1]] -f = y ~ x1 * x2 +f = @formula(y ~ x1 * x2) mf = ModelFrame(f, df) mm = ModelMatrix(mf) @test mm.m == [ones(4) x1e x2 [0, 10, 0, 0] [0, 0, 11, 0] [0, 0, 0, 12]] @@ -173,7 +173,7 @@ mm = ModelMatrix(mf) # additional tests from Tom y = [1., 2, 3, 4] -mf = ModelFrame(y ~ x2, d) +mf = ModelFrame(@formula(y ~ x2), d) mm = ModelMatrix(mf) @test mm.m == [ones(4) x2] @test mm.m == ModelMatrix{sparsetype}(mf).m @@ -182,12 +182,12 @@ mm = ModelMatrix(mf) df = deepcopy(d) df[:x1] = NullableCategoricalArray(df[:x1]) -f = y ~ x2 + x3 + x3*x2 +f = @formula(y ~ x2 + x3 + x3*x2) mm = ModelMatrix(ModelFrame(f, df)) @test mm.m == [ones(4) x2 x3 x2.*x3] -mm = ModelMatrix(ModelFrame(y ~ x3*x2 + x2 + x3, df)) +mm = ModelMatrix(ModelFrame(@formula(y ~ x3*x2 + x2 + x3), df)) @test mm.m == [ones(4) x3 x2 x2.*x3] -mm = ModelMatrix(ModelFrame(y ~ x1 + x2 + x3 + x4, df)) +mm = ModelMatrix(ModelFrame(@formula(y ~ x1 + x2 + x3 + x4), df)) @test mm.m[:,2] == [0, 1., 0, 0] @test mm.m[:,3] == [0, 0, 1., 0] @test mm.m[:,4] == [0, 0, 0, 1.] 
@@ -195,24 +195,24 @@ mm = ModelMatrix(ModelFrame(y ~ x1 + x2 + x3 + x4, df)) @test mm.m[:,6] == x3 @test mm.m[:,7] == x4 -mm = ModelMatrix(ModelFrame(y ~ x2 + x3 + x4, df)) +mm = ModelMatrix(ModelFrame(@formula(y ~ x2 + x3 + x4), df)) @test mm.m == [ones(4) x2 x3 x4] -mm = ModelMatrix(ModelFrame(y ~ x2 + x2, df)) +mm = ModelMatrix(ModelFrame(@formula(y ~ x2 + x2), df)) @test mm.m == [ones(4) x2] -mm = ModelMatrix(ModelFrame(y ~ x2*x3 + x2&x3, df)) +mm = ModelMatrix(ModelFrame(@formula(y ~ x2*x3 + x2&x3), df)) @test mm.m == [ones(4) x2 x3 x2.*x3] -mm = ModelMatrix(ModelFrame(y ~ x2*x3*x4, df)) +mm = ModelMatrix(ModelFrame(@formula(y ~ x2*x3*x4), df)) @test mm.m == [ones(4) x2 x3 x4 x2.*x3 x2.*x4 x3.*x4 x2.*x3.*x4] -mm = ModelMatrix(ModelFrame(y ~ x2&x3 + x2*x3, df)) +mm = ModelMatrix(ModelFrame(@formula(y ~ x2&x3 + x2*x3), df)) @test mm.m == [ones(4) x2 x3 x2.*x3] -f = y ~ x2 & x3 & x4 +f = @formula(y ~ x2 & x3 & x4) mf = ModelFrame(f, df) mm = ModelMatrix(mf) @test mm.m == [ones(4) x2.*x3.*x4] @test mm.m == ModelMatrix{sparsetype}(mf).m -f = y ~ x1 & x2 & x3 +f = @formula(y ~ x1 & x2 & x3) mf = ModelFrame(f, df) mm = ModelMatrix(mf) @test mm.m[:, 2:end] == diagm(x2.*x3) @@ -260,23 +260,23 @@ mm = ModelMatrix(mf) ## Distributive property of :& over :+ df = deepcopy(d) -f = y ~ (x1+x2) & (x3+x4) +f = @formula(y ~ (x1+x2) & (x3+x4)) mf = ModelFrame(f, df) mm = ModelMatrix(mf) @test mm.m == hcat(ones(4), x1.*x3, x1.*x4, x2.*x3, x2.*x4) @test mm.m == ModelMatrix{sparsetype}(mf).m ## Condensing nested :+ calls -f = y ~ x1 + (x2 + (x3 + x4)) +f = @formula(y ~ x1 + (x2 + (x3 + x4))) @test ModelMatrix(ModelFrame(f, df)).m == hcat(ones(4), x1, x2, x3, x4) ## Extra levels in categorical column -mf_full = ModelFrame(y ~ x1p, d) +mf_full = ModelFrame(@formula(y ~ x1p), d) mm_full = ModelMatrix(mf_full) @test size(mm_full) == (4,4) -mf_sub = ModelFrame(y ~ x1p, d[2:4, :]) +mf_sub = ModelFrame(@formula(y ~ x1p), d[2:4, :]) mm_sub = ModelMatrix(mf_sub) ## should have only three 
rows, and only three columns (intercept plus two ## levels of factor) @@ -284,13 +284,13 @@ mm_sub = ModelMatrix(mf_sub) ## Missing data d[:x1m] = NullableArray(Nullable{Int}[5, 6, Nullable(), 7]) -mf = ModelFrame(y ~ x1m, d) +mf = ModelFrame(@formula(y ~ x1m), d) mm = ModelMatrix(mf) @test isequal(NullableArray(mm.m[:, 2]), d[complete_cases(d), :x1m]) @test mm.m == ModelMatrix{sparsetype}(mf).m ## Same variable on left and right side -mf = ModelFrame(x1 ~ x1, df) +mf = ModelFrame(@formula(x1 ~ x1), df) mm = ModelMatrix(mf) mm.m == float(model_response(mf)) @@ -305,7 +305,7 @@ d[:n] = 1.:8 ## No intercept -mf = ModelFrame(n ~ 0 + x, d, contrasts=cs) +mf = ModelFrame(@formula(n ~ 0 + x), d, contrasts=cs) mm = ModelMatrix(mf) @test mm.m == [1 0 0 1 @@ -319,7 +319,7 @@ mm = ModelMatrix(mf) @test coefnames(mf) == ["x: a", "x: b"] ## No first-order term for interaction -mf = ModelFrame(n ~ 1 + x + x&y, d, contrasts=cs) +mf = ModelFrame(@formula(n ~ 1 + x + x&y), d, contrasts=cs) mm = ModelMatrix(mf) @test mm.m[:, 2:end] == [-1 -1 0 1 0 -1 @@ -333,7 +333,7 @@ mm = ModelMatrix(mf) @test coefnames(mf) == ["(Intercept)", "x: b", "x: a & y: d", "x: b & y: d"] ## When both terms of interaction are non-redundant: -mf = ModelFrame(n ~ 0 + x&y, d, contrasts=cs) +mf = ModelFrame(@formula(n ~ 0 + x&y), d, contrasts=cs) mm = ModelMatrix(mf) @test mm.m == [1 0 0 0 0 1 0 0 @@ -348,7 +348,7 @@ mm = ModelMatrix(mf) "x: a & y: d", "x: b & y: d"] # only a three-way interaction: every term is promoted. -mf = ModelFrame(n ~ 0 + x&y&z, d, contrasts=cs) +mf = ModelFrame(@formula(n ~ 0 + x&y&z), d, contrasts=cs) mm = ModelMatrix(mf) @test mm.m == eye(8) @test mm.m == ModelMatrix{sparsetype}(mf).m @@ -357,7 +357,7 @@ mm = ModelMatrix(mf) # first (both x and y), but only the old term (x) in the second (because # dropping x gives z which isn't found elsewhere, but dropping z gives x # which is found (implicitly) in the promoted interaction x&y). 
-mf = ModelFrame(n ~ 0 + x&y + x&z, d, contrasts=cs) +mf = ModelFrame(@formula(n ~ 0 + x&y + x&z), d, contrasts=cs) mm = ModelMatrix(mf) @test mm.m == [1 0 0 0 -1 0 0 1 0 0 0 -1 @@ -375,7 +375,7 @@ mm = ModelMatrix(mf) # ...and adding a three-way interaction, only the shared term (x) is promoted. # this is because dropping x gives y&z which isn't present, but dropping y or z # gives x&z or x&z respectively, which are both present. -mf = ModelFrame(n ~ 0 + x&y + x&z + x&y&z, d, contrasts=cs) +mf = ModelFrame(@formula(n ~ 0 + x&y + x&z + x&y&z), d, contrasts=cs) mm = ModelMatrix(mf) @test mm.m == [1 0 0 0 -1 0 1 0 0 1 0 0 0 -1 0 1 @@ -394,7 +394,7 @@ mm = ModelMatrix(mf) # two two-way interactions, with common lower-order term. the common term x is # promoted in both (along with lower-order term), because in every case, when # x is dropped, the remaining terms (1, y, and z) aren't present elsewhere. -mf = ModelFrame(n ~ 0 + x + x&y + x&z, d, contrasts=cs) +mf = ModelFrame(@formula(n ~ 0 + x + x&y + x&z), d, contrasts=cs) mm = ModelMatrix(mf) @test mm.m == [1 0 -1 0 -1 0 0 1 0 -1 0 -1 @@ -435,17 +435,17 @@ mm = ModelMatrix(mf) # Ensure that random effects terms are dropped from coefnames df = DataFrame(x = [1,2,3], y = [4,5,6]) -mf = ModelFrame(y ~ 1 + (1 | x), df) +mf = ModelFrame(@formula(y ~ 1 + (1 | x)), df) @test coefnames(mf) == ["(Intercept)"] -mf = ModelFrame(y ~ 0 + (1 | x), df) +mf = ModelFrame(@formula(y ~ 0 + (1 | x)), df) @test_throws ErrorException ModelMatrix(mf) @test coefnames(mf) == Vector{Compat.UTF8String}() # Ensure X is not a view on df column df = DataFrame(x = [1.0,2.0,3.0], y = [4.0,5.0,6.0]) -mf = ModelFrame(y ~ 0 + x, df) +mf = ModelFrame(@formula(y ~ 0 + x), df) X = ModelMatrix(mf).m X[1] = 0.0 @test mf.df[1, :x] === Nullable(1.0) diff --git a/test/statsmodel.jl b/test/statsmodel.jl index be09486a..f1825104 100644 --- a/test/statsmodel.jl +++ b/test/statsmodel.jl @@ -33,7 +33,7 @@ d[:x2] = [9:12;] d[:x3] = [13:16;] d[:x4] = [17:20;] -f = y 
~ x1 * x2 +f = @formula(y ~ x1 * x2) m = fit(DummyMod, f, d) @test model_response(m) == Array(d[:y]) @@ -64,7 +64,7 @@ show(io, m) ## with categorical variables d[:x1p] = NullableCategoricalArray(d[:x1]) -f2 = y ~ x1p +f2 = @formula(y ~ x1p) m2 = fit(DummyMod, f2, d) @test coeftable(m2).rownms == ["(Intercept)", "x1p: 6", "x1p: 7", "x1p: 8"] @@ -80,7 +80,7 @@ d3[:x1p] = NullableCategoricalVector(d3[:x1]) ## fit with contrasts specified d[:x2p] = NullableCategoricalVector(d[:x2]) -f3 = y ~ x1p + x2p +f3 = @formula(y ~ x1p + x2p) m3 = fit(DummyMod, f3, d) fit(DummyMod, f3, d, contrasts = Dict(:x1p => EffectsCoding())) fit(DummyMod, f3, d, contrasts = Dict(:x1p => EffectsCoding(),