Commit: New version

lbittarello committed Dec 21, 2019
1 parent 403cfe2 · commit c0848e1
Showing 38 changed files with 652 additions and 873 deletions.
3 changes: 2 additions & 1 deletion .travis.yml
@@ -7,6 +7,7 @@ julia:
- 1.0
- 1.1
- 1.2
- 1.3
- nightly
matrix:
allow_failures:
@@ -20,7 +21,7 @@ notifications:
jobs:
include:
- stage: "Documentation"
julia: 1.2
julia: 1.3
os: linux
script:
- julia --project=docs/ -e 'using Pkg; Pkg.develop(PackageSpec(path=pwd())); Pkg.instantiate()'
8 changes: 4 additions & 4 deletions LICENSE.md
@@ -1,22 +1,22 @@
The Microeconometrics.jl package is licensed under the MIT "Expat" License:

> Copyright (c) 2017: Luca Bittarello.
>
>
> Permission is hereby granted, free of charge, to any person obtaining a copy
> of this software and associated documentation files (the "Software"), to deal
> in the Software without restriction, including without limitation the rights
> to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
> copies of the Software, and to permit persons to whom the Software is
> furnished to do so, subject to the following conditions:
>
>
> The above copyright notice and this permission notice shall be included in all
> copies or substantial portions of the Software.
>
>
> THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
> AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
> OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
> SOFTWARE.
>
>
18 changes: 9 additions & 9 deletions Project.toml
@@ -1,6 +1,6 @@
name = "Microeconometrics"
uuid = "5d8a72c5-1e5a-5861-b6ae-c8e9564f9d17"
version = "0.5.0"
version = "0.6.0"

[deps]
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
@@ -18,14 +18,14 @@ SuiteSparse = "4607b0f0-06f3-5cda-b6b1-a6196a1729e9"

[compat]
julia = "≥ 1.0.0"
CSV = "≥ 0.3.0"
DataFrames = "≥ 0.19.2"
Format = "0.7.0"
Optim = "≥ 0.17.0"
SpecialFunctions = "≥ 0.7.0"
StatsBase = "≥ 0.25.0"
StatsFuns = "≥ 0.7.0"
StatsModels = "≥ 0.6.3"
CSV = "≥ 0.5.18"
DataFrames = "≥ 0.19.4"
Format = "1.0.1"
Optim = "≥ 0.19.6"
SpecialFunctions = "≥ 0.9.0"
StatsBase = "≥ 0.32.0"
StatsFuns = "≥ 0.9.2"
StatsModels = "≥ 0.6.7"

[extras]
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
7 changes: 3 additions & 4 deletions docs/src/contributing.md
@@ -10,7 +10,7 @@ mutable struct OLS <: MLE

sample::Microdata # estimation sample
β::Vector{Float64} # coefficient vector
V::Matrix{Float64} # variance matrix
V::AbstractMatrix{Float64} # variance matrix

OLS() = new()
end
@@ -87,10 +87,9 @@ end

We do not need to extend `_vcov!`. The default method will call `score` and `jacobian` and construct the appropriate estimator, accounting for the correlation structure of the data and the type of weights.

We now overload `predict` and `fitted`. For OLS, these functions are equivalent.
We now overload `predict`:
```julia
predict(obj::OLS) = getmatrix(obj, :control) * obj.β
fitted(obj::OLS) = predict(obj)
predict(obj::OLS, MD::Microdata) = getmatrix(MD, :control) * obj.β
```
The next step is optional. We extend `jacobexp`, which computes the derivative of fitted values.
```julia
6 changes: 3 additions & 3 deletions docs/src/correlation_structures.md
@@ -2,16 +2,16 @@

Before fitting the model, you must specify the correlation between observations (a `CorrStructure`). It determines the calculation of covariance matrices. The default is always `Heteroscedastic`, i.e. independent but not identically distributed observations.

All constructors accept the Boolean keyword `adj` (omitted in the following), which defaults to `true`. If `true`, a finite-sample adjustment is applied to the covariance matrix. The adjustment factor is n / (n - 1), where n is the number of clusters for clustered data and the number of observations otherwise.
All constructors accept the Boolean keyword `corrected` (omitted in the following), which defaults to `true`. If `true`, a finite-sample adjustment is applied to the covariance matrix. The adjustment factor is n / (n - 1), where n is the number of clusters for clustered data and the number of observations otherwise.

Four subtypes are currently available: `Homoscedastic`, `Heteroscedastic`, `Clustered` and `CrossCorrelated`.

## `Homoscedastic`

```julia
Homoscedastic(method::String = "OIM")
Homoscedastic(; expected::Bool = false)
```
Observations are independent and identically distributed. The optional argument `method` is only relevant for maximum-likelihood estimators. It controls the estimation of the covariance matrix: `"OIM"` uses the observed information matrix, whereas `"OPG"` uses the outer product of the gradient. Only linear and maximum-likelihood estimators support homoscedastic errors.
Observations are independent and identically distributed. The optional keyword argument `expected` controls the estimation of the covariance matrix of maximum-likelihood estimators: `false` uses the observed information matrix, whereas `true` uses the outer product of the gradient. Only linear and maximum-likelihood estimators support homoscedastic errors.
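As a sketch (reusing the data table `S` and model specification `M` from the Getting Started guide), one might request the outer-product-of-the-gradient estimator for a logit model like so:
```julia
using Microeconometrics

# `expected = true` switches from the observed information matrix
# (the default) to the outer product of the gradient.
C = Homoscedastic(expected = true)
D = Microdata(S, M, corr = C)   # S and M as in Getting Started
E = fit(Logit, D)
```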

## `Heteroscedastic`

8 changes: 3 additions & 5 deletions docs/src/estimation_tables.md
@@ -19,14 +19,12 @@ etable(
args...;
digits::Int = 4,
aux::Union{Function, Nothing} = nothing,
stars::Matrix{Any} = [0.1 "*"; 0.05 "**"; 0.01 "***"],
titles::Vector{String} = [""]
stars::AbstractMatrix{Any} = [0.1 "*"; 0.05 "**"; 0.01 "***"]
)
```

This function displays a simple regression table. The keyword arguments are:

- `digits`: the number of digits on display.
- `aux`: an auxiliary statistic (e.g., stderr), displayed below each coefficient.
- `digits`: the number of digits on display;
- `aux`: an auxiliary statistic (e.g., `stderror`), displayed below each coefficient;
- `stars`: the star scheme.
- `titles`: the title of each regression (defaults to numbers).
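A minimal usage sketch, assuming `E₁` and `E₂` are fitted models as in the Getting Started guide:
```julia
# Two regressions side by side, three digits, standard errors displayed
# below the coefficients, and the default star scheme written out.
etable(E₁, E₂; digits = 3, aux = stderror, stars = [0.1 "*"; 0.05 "**"; 0.01 "***"])
```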
20 changes: 5 additions & 15 deletions docs/src/getting_started.md
@@ -49,15 +49,13 @@ julia> C = Homoscedastic() ;

We now construct the [estimation sample](model_specification.md):
```julia
julia> D = Microdata(S, M, vcov = C) ;
julia> D = Microdata(S, M, corr = C) ;
```

We can finally fit the model and visualize the results:
```julia
julia> E = fit(OLS, D)

OLS

Estimate St. Err. t-stat. p-value C.I. (95%)
gre 0.0004 0.0002 2.0384 0.0415 0.0000 0.0008
gpa 0.1555 0.0640 2.4317 0.0150 0.0302 0.2809
@@ -94,8 +92,6 @@ julia> M₁ = @micromodel(response => admit, control => gre + gpa + rank + 1) ;
julia> D₁ = Microdata(S, M₁) ;
julia> E₁ = fit(OLS, D₁)

OLS

Estimate St. Err. t-stat. p-value C.I. (95%)
gre 0.0004 0.0002 2.0501 0.0404 0.0000 0.0008
gpa 0.1555 0.0653 2.3833 0.0172 0.0276 0.2834
@@ -105,7 +101,7 @@ rank: 4 -0.3230 0.0780 -4.1408 0.0000 -0.4759 -0.1701
(Intercept) -0.2589 0.2110 -1.2268 0.2199 -0.6725 0.1547
```

Before we estimate the reduced model, we must redefine the control set.:
Before we estimate the reduced model, we must first redefine the control set:
```julia
julia> M₂ = @micromodel(response => admit, control => gre + gpa + 1) ;
julia> D₂ = Microdata(S, M₂) ;
@@ -114,8 +110,6 @@ We can now fit the reduced model:
```julia
julia> E₂ = fit(OLS, D₂)

OLS

Estimate St. Err. t-stat. p-value C.I. (95%)
gre 0.0005 0.0002 2.5642 0.0103 0.0001 0.0010
gpa 0.1542 0.0650 2.3737 0.0176 0.0269 0.2816
@@ -124,7 +118,7 @@ gpa 0.1542 0.0650 2.3737 0.0176 0.0269 0.2816

The coefficients on `gre` and `gpa` seem to be robust. For a formal equality test, we use a Hausman test:
```julia
julia> H = hausman_1s(E₁, E₂, ["gre", "gpa"]) ;
julia> H = hausman_test(E₁, E₂, ["gre", "gpa"]) ;
julia> tstat(H)

2-element Array{Float64,1}:
@@ -141,8 +135,6 @@ julia> I₁ = (S[:rank] .== 1) ;
julia> D₁ = Microdata(S, M, subset = I₁) ;
julia> E₁ = fit(OLS, D₁)

OLS

Estimate St. Err. t-stat. p-value C.I. (95%)
gre 0.0006 0.0006 1.0386 0.2990 -0.0005 0.0017
gpa 0.2508 0.1929 1.3000 0.1936 -0.1273 0.6290
@@ -152,18 +144,16 @@ julia> I₂ = (S[:rank] .== 2) ;
julia> D₂ = Microdata(S, M, subset = I₂) ;
julia> E₂ = fit(OLS, D₂)

OLS

Estimate St. Err. t-stat. p-value C.I. (95%)
gre 0.0004 0.0004 0.8794 0.3792 -0.0004 0.0011
gpa 0.1771 0.1028 1.7219 0.0851 -0.0245 0.3787
(Intercept) -0.4470 0.3641 -1.2277 0.2196 -1.1607 0.2666

julia> H = hausman_2s(E₁, E₂, ["gre", "gpa"]) ;
julia> H = chow_test(E₁, E₂, ["gre", "gpa"]) ;
julia> tstat(H)

2-element Array{Float64,1}:
0.334261
0.337304
```
We used the function `hausman_2s` because these estimates are based on different samples. The difference in the effect of `gre` between ranks 1 and 2 is not significant.
We used the function `chow_test` because these estimates are based on different samples. The difference in the effect of `gre` between ranks 1 and 2 is not significant.
24 changes: 15 additions & 9 deletions docs/src/hypothesis_tests.md
@@ -16,22 +16,26 @@ This function returns the *p*-value of a two-sided significance test.

## Hausman test

This procedure tests the difference in coefficients between two parametric models ([Hausman, 1984](http://jstor.org/stable/1913827)). It can be used in replacement of the Chow test and the Sobel test. Our implementation is based on the GMM representation of the joint estimation problem (see Subsection 8.3.2 of Cameron and Trivedi (2005)). Neither model need be efficient.

The optional argument `names` specifies the coefficients of interest as they appear on regression tables (be careful with categorical variables!). The output is a `ParEstimate`, which contains the vector of differences, their covariance matrix and labels.

```julia
hausman_1s(
hausman_test(
model₁::Union{GMM, ParModel, TwoStageModel},
model₂::Union{GMM, ParModel, TwoStageModel},
names::Vector{String} = intersect(coefnames(obj₁), coefnames(obj₂))
)
```

This function is appropriate when `model₁` and `model₂` were based on a single estimation sample.
This procedure tests the difference in coefficients between two parametric models ([Hausman, 1978](http://jstor.org/stable/1913827)). Both models must have been estimated on the same sample. It can be used in place of the Sobel test. This implementation is based on the GMM representation of the joint estimation problem (see Subsection 8.3.2 of Cameron and Trivedi (2005)). Neither model need be efficient.

The optional argument `names` specifies the coefficients of interest as they appear on regression tables (be careful with categorical variables!). The output is a `ParEstimate`, which contains the vector of differences, their covariance matrix and labels.
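For instance, with the fitted models `E₁` and `E₂` of the Getting Started guide, which share a single estimation sample:
```julia
# Test whether the coefficients on gre and gpa differ across specifications.
H = hausman_test(E₁, E₂, ["gre", "gpa"])
tstat(H)   # t-statistics of the coefficient differences
pval(H)    # corresponding p-values
```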

## Chow test

This procedure tests the difference in coefficients between two parametric models ([Hausman, 1978](http://jstor.org/stable/1913827)). The models need not have been estimated on the same sample. This implementation is based on the GMM representation of the joint estimation problem (see Subsection 8.3.2 of Cameron and Trivedi (2005)).

The optional argument `names` specifies the coefficients of interest as they appear on regression tables (be careful with categorical variables!). The output is a `ParEstimate`, which contains the vector of differences, their covariance matrix and labels.

```julia
hausman_2s(
chow_test(
model₁::Union{GMM, ParModel, TwoStageModel},
model₂::Union{GMM, ParModel, TwoStageModel},
names::Vector{String} = intersect(coefnames(obj₁), coefnames(obj₂))
@@ -41,12 +45,14 @@ This function is appropriate when `model₁` and `model₂` were based on a sing
This function is appropriate when `model₁` and `model₂` were based on independent samples. For example, the samples might consist of independent observations with no overlap.

```julia
hausman_2s(
chow_test(
model₁::Union{GMM, ParModel, TwoStageModel},
model₂::Union{GMM, ParModel, TwoStageModel},
corr::CorrStructure,
names::Vector{String} = intersect(coefnames(obj₁), coefnames(obj₂))
)
```

This function is appropriate when `model₁` and `model₂` were based on dependent samples. For example, the samples might consist of independent observations with some overlap or clustered observations with shared clusters. The correlation structure `corr` must specify the correlation between all observations of both estimation samples. For example, you could construct `corr` for the entire dataset and construct the samples via the `subset` keyword to `Microdata`.
This function is appropriate when `model₁` and `model₂` were based on dependent samples. For example, two clustered samples might have one or more clusters in common. We must then take the correlation across samples within shared clusters into account.

The correlation structure `corr` must specify the correlation between all observations of both estimation samples. For instance, you could construct `corr` for the entire dataset and construct the subsamples via the `subset` keyword of `Microdata`.
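A sketch of that workflow — the `Clustered` constructor call and the `school` column are assumptions for illustration:
```julia
# Build the correlation structure on the full dataset, then carve out two
# overlapping subsamples via the `subset` keyword.
C  = Clustered(S, :school)   # assumed constructor; see correlation_structures.md
D₁ = Microdata(S, M, corr = C, subset = (S[:rank] .== 1))
D₂ = Microdata(S, M, corr = C, subset = (S[:rank] .<= 2))
E₁ = fit(OLS, D₁)
E₂ = fit(OLS, D₂)
H  = chow_test(E₁, E₂, C, ["gre", "gpa"])
```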
4 changes: 2 additions & 2 deletions docs/src/methods.md
@@ -4,8 +4,8 @@ This package supports the methods for regression models of [*StatsBase.jl*](http

The following functions are available for all models: `nobs` and `response`.

The following functions are available for parametric models: `dof`, `dof_residual`, `coef`, `stderr`, `vcov`, `confint` and `coefnames`. Note that all methods refer to the second stage of two-stage models.
The following functions are available for parametric models: `dof`, `dof_residual`, `coef`, `stderror`, `vcov`, `confint` and `coefnames`. Note that all methods refer to the second stage of two-stage models.

The following functions are available for maximum-likelihood estimators: `deviance`, `nulldeviance`, `loglikelihood`, `nullloglikelihood`, `r2` and `adjr2`. There are also R² methods for OLS and IV. The following functions are available from *StatsBase.jl*: `aic`, `aicc` and `bic`.

Most models support `predict` and `fitted`. `predict` estimates the index of single-index models. `fitted` computes the conditional outcome expectation. For example, `predict` estimates the Xβ of a logit model and `fitted` computes logistic(Xβ). Support for `residuals` depends on the availability of `fitted`. Out-of-sample forecast is supported.
Most models support `predict` and its alias `fitted`. Support for `residuals` depends on the availability of `fitted`. Out-of-sample forecast is supported.
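By way of illustration, a sketch with a fitted parametric model `E` and a second `Microdata` `D₂` built from the same model specification:
```julia
coef(E)          # coefficient vector
stderror(E)      # standard errors
confint(E)       # confidence intervals
predict(E)       # in-sample predictions (alias: fitted)
residuals(E)     # available whenever fitted is
predict(E, D₂)   # out-of-sample forecast
```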
8 changes: 4 additions & 4 deletions docs/src/model_specification.md
@@ -7,9 +7,9 @@
DF::DataFrame,
model::Dict{Symbol, String};
hints::Dict{Symbol, TermOrTerms},
subset::AbstractVector{Bool} = trues(size(DF, 1)),
vcov::CorrStructure = Heteroscedastic(),
subset::AbstractVector{Bool},
weights::AbstractWeights = UnitWeights(size(DF, 1))
corr::CorrStructure,
)
```

Expand All @@ -25,6 +25,6 @@ All regression models need a `response`, but other requirements may vary. (Check

As for the keywords:
- `hints`: a dictionary from column labels to [schemas](https://juliastats.github.io/StatsModels.jl/latest/internals.html) or [contrasts](https://juliastats.github.io/StatsModels.jl/latest/contrasts/).
- `subset` determines the estimation sample. Set an entry to `true` if the corresponding row of `DF` should be included and `false` if it should be excluded. This keyword is useful if you are comparing subgroups and observations in different subgroups may correlate (e.g., they may belong to the same cluster). [`hausman_2s`](hypothesis_tests.md#hausman-test) will account for that correlation if the `Microdata` were constructed with `subset`.
- `subset` determines the estimation sample. Set an entry to `true` if the corresponding row of `DF` should be included and `false` if it should be excluded. This keyword is useful if you are comparing subgroups and observations in different subgroups may correlate (e.g., they may belong to the same cluster). [`chow_test`](hypothesis_tests.md#chow-test) will take that correlation into account if the `Microdata` were constructed with `subset`.
- `weights` is a [weight vector](http://juliastats.github.io/StatsBase.jl/stable/weights.html). Except for frequency weights, the weight vector is normalized to sum up to the number of observations in the sample.
- `vcov` is a [correlation structure](correlation_structures.md).
- `corr` is a [correlation structure](correlation_structures.md).
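Putting the keywords together (a sketch reusing `S`, `M` and the `rank` column from the Getting Started guide):
```julia
D = Microdata(
    S, M;
    subset = (S[:rank] .== 1),   # keep only rows with rank equal to 1
    corr   = Heteroscedastic(),  # the default correlation structure
)
```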
35 changes: 18 additions & 17 deletions src/Microeconometrics.jl
@@ -8,36 +8,37 @@ using Format: format
using LinearAlgebra
using Optim
using SparseArrays
using SpecialFunctions: lgamma
using SpecialFunctions: loggamma
using StatsFuns
using StatsModels
const SM = StatsModels
using SuiteSparse

using Base: @propagate_inbounds
using StatsBase: AbstractWeights, CoefTable, RegressionModel
using StatsBase: AnalyticWeights, FrequencyWeights, ProbabilityWeights, Weights
using StatsBase: mean, sum, pweights
using StatsBase: mean, sum

import Base: copy, isequal, show, sum
import Statistics: mean
import StatsBase: fit, coef, coefnames, coeftable, confint, stderror, vcov
import StatsBase: deviance, loglikelihood, nulldeviance, nullloglikelihood
import StatsBase: adjr2, dof, dof_residual, nobs, r2
import StatsBase: fitted, predict, residuals, response
import StatsModels: terms, termvars, schema, apply_schema, has_schema, modelcols
import LinearAlgebra: lmul!
import Statistics: mean
import StatsBase: varcorrection
import StatsBase: fit, coef, coefnames, coeftable, confint, stderror, vcov
import StatsBase: deviance, loglikelihood, nulldeviance, nullloglikelihood
import StatsBase: adjr2, dof, dof_residual, nobs, r2
import StatsBase: fitted, predict, residuals, response
import StatsModels: terms, termvars, schema, apply_schema, has_schema, modelcols

include("./inference/corr.jl")
include("./data/types.jl")
include("./data/weights.jl")
include("./general/types.jl")
include("./data/corrstructures.jl")
include("./data/microdata.jl")
include("./data/micromodel.jl")
include("./data/weights.jl")
include("./data/utils.jl")
include("./general/etable.jl")
include("./general/statsmodels.jl")
include("./general/utils.jl")
include("./inference/adjfactor.jl")
include("./inference/hausman.jl")
include("./inference/utils.jl")
include("./inference/vcov.jl")
include("./inference/covariance_matrix.jl")
include("./inference/model_comparison.jl")
include("./estimation/ols.jl")
include("./estimation/iv.jl")
include("./estimation/logit.jl")
@@ -65,6 +66,6 @@ export
adjr2, dof, dof_residual, nobs, r2,
fitted, predict, residuals, response,
coefnames, coeftable, etable,
hausman_1s, hausman_2s, pval, tstat
chow_test, hausman_test, pval, tstat

end # module