Merge pull request #2 from jmboehm/withDataFrameRegressionModels

With DataFrameRegressionModels to support GLM.jl
jmboehm · Dec 4, 2017 · 452290d · 452290d
2 parents d72c7f7 + 1ccbc98
commit 452290d
Show file tree

Hide file tree

Showing 14 changed files with 178 additions and 47 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -2,7 +2,6 @@
 language: julia
 julia:
   - 0.6
-  - nightly
 notifications:
   email: false
 git:

diff --git a/README.md b/README.md
@@ -2,7 +2,7 @@
 
 ## RegressionTables.jl
 
-This package provides publication-quality regression tables for use with [FixedEffectModels.jl](https://github.com/matthieugomez/FixedEffectModels.jl).
+This package provides publication-quality regression tables for use with [FixedEffectModels.jl](https://github.com/matthieugomez/FixedEffectModels.jl) and [GLM.jl](https://github.com/JuliaStats/GLM.jl).
 
 In its objective it is similar to (and heavily inspired by) the Stata command [`esttab`](http://repec.sowi.unibe.ch/stata/estout/esttab.html) and the R package [`stargazer`](https://cran.r-project.org/web/packages/stargazer/).
 
@@ -99,11 +99,48 @@ then use `\input` in LaTeX to include that file in your code. Be sure to use the
 
 \end{document}
 ```
+`regtable()` can also print `DataFrameRegressionModel`s from [GLM.jl](https://github.com/JuliaStats/GLM.jl):
+```julia
+dobson = DataFrame(Counts = [18.,17,15,20,10,20,25,13,12],
+    Outcome = pool(repeat(["A", "B", "C"], outer = 3)),
+    Treatment = pool(repeat(["a","b", "c"], inner = 3)))
+lm1 = fit(LinearModel, @formula(SepalLength ~ SepalWidth), df)
+gm1 = fit(GeneralizedLinearModel, @formula(Counts ~ 1 + Outcome + Treatment), dobson,
+                  Poisson())
+
+regtable(rr1,lm1,gm1; renderSettings = asciiOutput())
+```
+yields
+```
+---------------------------------------------
+                   SepalLength        Counts
+               -------------------   --------
+                    (1)        (2)        (3)
+---------------------------------------------
+(Intercept)    6.526***   6.526***   3.045***
+                (0.479)    (0.479)    (0.171)
+SepalWidth       -0.223     -0.223           
+                (0.155)    (0.155)           
+Outcome: B                             -0.454
+                                      (0.202)
+Outcome: C                             -0.293
+                                      (0.193)
+Treatment: b                            0.000
+                                      (0.200)
+Treatment: c                            0.000
+                                      (0.200)
+---------------------------------------------
+Estimator           OLS        OLS         NL
+---------------------------------------------
+N                   150        150          9
+R2                0.014      0.014           
+---------------------------------------------
+```
 
 ## Options
 
 ### Function Arguments
-* `rr::AbstractRegressionResult...` are the `AbstractRegressionResult`s from `FixedEffectModels.jl` that should be printed. Only required argument.
+* `rr::Union{AbstractRegressionResult,DataFrames.DataFrameRegressionModel}...` are the `AbstractRegressionResult`s from `FixedEffectModels.jl` (or `DataFrameRegressionModel`s from `GLM.jl`) that should be printed. Only required argument.
 * `regressors` is a `Vector` of regressor names (`String`s) that should be shown, in that order. Defaults to an empty vector, in which case all regressors will be shown.
 * `fixedeffects` is a `Vector` of FE names (`String`s) that should be shown, in that order. Defaults to an empty vector, in which case all FE's will be shown.
 * `labels` is a `Dict` that contains displayed labels for variables (strings) and other text in the table. If no label for a variable is found, it defaults to the variable name. See documentation for special values.
@@ -130,6 +167,7 @@ to change the label for the row showing the number of observations in each regre
 * `__LABEL_ESTIMATOR__` (default: "Estimator")
 * `__LABEL_ESTIMATOR_OLS__` (default: "OLS")
 * `__LABEL_ESTIMATOR_IV__` (default: "IV")
+* `__LABEL_ESTIMATOR_NL__` (default: "NL")
 
 * `__LABEL_FE_YES__` (default: "Yes")
 * `__LABEL_FE_NO__` (default: "")

diff --git a/REQUIRE b/REQUIRE
@@ -1,6 +1,7 @@
 julia 0.6.0
 FixedEffectModels 0.4.0
 StatsBase 0.7.1
+GLM 0.8.1
 RDatasets 0.2.0
 Distributions 0.4.6
 Formatting 0.3.0
diff --git a/src/RegressionTables.jl b/src/RegressionTables.jl
@@ -10,6 +10,7 @@ module RegressionTables
     #   - write more serious tests
     #   - allow custom ordering of blocks (e.g. [:estimates, :fe, :estimator, :statistics])
     #   - HTML or CSV output
+    #   - custom statistics
     #
     #   TECHNICAL:
     #   - Rewrite table cell/row formats using an encapsulating function instead
@@ -26,9 +27,13 @@ module RegressionTables
     ##
     ##############################################################################
 
+    using DataFrames
+
     import Distributions: ccdf, FDist
     import FixedEffectModels: AbstractRegressionResult, RegressionResult, RegressionResultIV, RegressionResultFE, RegressionResultFEIV
     import Formatting: sprintf1
+    import DataFrames: DataFrameRegressionModel, ModelFrame , coef, coefnames, vcov, nobs, dof_residual, r2
+    import GLM: LinearModel
 
 
     ##############################################################################
@@ -46,8 +51,8 @@ module RegressionTables
     ##############################################################################
 
     # main types
-    include("RenderSettings.jl")
-    include("RegressionTable.jl")
+    include("rendersettings.jl")
+    include("regressiontable.jl")
 
     # misc
     include("util/util.jl")

diff --git a/src/RegressionTable.jl → src/regressiontable.jl b/src/RegressionTable.jl → src/regressiontable.jl
diff --git a/src/regtable.jl b/src/regtable.jl
@@ -16,7 +16,7 @@ Produces a publication-quality regression table, similar to Stata's `esttab` and
 * `number_regressions` is a `Bool` that governs whether regressions should be numbered. Defaults to `true`.
 * `number_regressions_decoration` is a `Function` that governs the decorations to the regression numbers. Defaults to `s -> "(\$s)"`.
 * `print_fe_section` is a `Bool` that governs whether a section on fixed effects should be shown. Defaults to `true`.
-* `print_estimator_section`  is a `Bool` that governs whether to print a section on which estimator (OLS/IV) is used. Defaults to `true`.
+* `print_estimator_section`  is a `Bool` that governs whether to print a section on which estimator (OLS/IV/NL) is used. Defaults to `true`.
 * `renderSettings::RenderSettings` is a `RenderSettings` composite type that governs how the table should be rendered. Standard supported types are ASCII (via `asciiOutput(outfile::String)`) and LaTeX (via `latexOutput(outfile::String)`). If no argument is given to these two functions, the output is sent to STDOUT. Defaults to ASCII with STDOUT.
 
 ### Details
@@ -67,7 +67,7 @@ regtable(rr1,rr2,rr3,rr4; renderSettings = latexOutput("myoutfile.tex"))
 ```
 """
 
-function regtable(rr::AbstractRegressionResult...;
+function regtable(rr::Union{AbstractRegressionResult,DataFrameRegressionModel}...;
     regressors::Vector{String} = Vector{String}(),
     fixedeffects::Vector{String} = Vector{String}(),
     labels::Dict{String,String} = Dict{String,String}(),
@@ -84,8 +84,22 @@ function regtable(rr::AbstractRegressionResult...;
     renderSettings::RenderSettings = asciiOutput()
     )
 
+    # define some functions that make use of DataFrames' RegressionModels
+    coefnames(r::DataFrameRegressionModel) = DataFrames.coefnames(r.mf)
+    coefnames(r::AbstractRegressionResult) = r.coefnames
+    coef(r::AbstractRegressionResult) = r.coef
+    coef(r::DataFrameRegressionModel) = DataFrames.coef(r)
+    vcov(r::AbstractRegressionResult) = r.vcov
+    vcov(r::DataFrameRegressionModel) = DataFrames.vcov(r)
+    df_residual(r::AbstractRegressionResult) = r.df_residual
+    df_residual(r::DataFrameRegressionModel) = dof_residual(r)
+    yname(r::AbstractRegressionResult) = r.yname
+    yname(r::DataFrameRegressionModel) = r.mf.terms.eterms[1]
+    ther2(r::AbstractRegressionResult) = r.r2
+    ther2(r::DataFrameRegressionModel) = isa(r.model, LinearModel) ? r2(r) : NaN
+
+
     numberOfResults = size(rr,1)
-    #println("Found $numberOfResults regression results.")
 
     # Create a RegressionTable from the regression results
 
@@ -94,10 +108,11 @@ function regtable(rr::AbstractRegressionResult...;
         # construct default ordering: from ordering in regressions (like in Stata)
         regressorList = Vector{String}()
         for r in rr # AbstractRegressionResult
-            for regressorIndex = 1:length(r.coefnames)
-                if !(any(regressorList .== r.coefnames[regressorIndex]))
+            names = coefnames(r)
+            for regressorIndex = 1:length(names)
+                if !(any(regressorList .== names[regressorIndex]))
                     # add to list
-                    push!(regressorList, r.coefnames[regressorIndex])
+                    push!(regressorList, names[regressorIndex])
                 end
             end
         end
@@ -111,15 +126,19 @@ function regtable(rr::AbstractRegressionResult...;
     for regressor in regressorList
         estimateLine = fill("", 2, numberOfResults+1)
         for resultIndex = 1:numberOfResults
-            index = find(regressor .== rr[resultIndex].coefnames)
+            thiscnames = coefnames(rr[resultIndex])
+            thiscoef = coef(rr[resultIndex])
+            thisvcov = vcov(rr[resultIndex])
+            thisdf_residual = df_residual(rr[resultIndex])
+            index = find(regressor .== thiscnames)
             if !isempty(index)
-                pval = ccdf(FDist(1, rr[resultIndex].df_residual ), abs2(rr[resultIndex].coef[index[1]]/sqrt(rr[resultIndex].vcov[index[1],index[1]])))
-                estimateLine[1,resultIndex+1] = estim_decoration(sprintf1(estimformat,rr[resultIndex].coef[index[1]]),pval)
+                pval = ccdf(FDist(1, thisdf_residual ), abs2(thiscoef[index[1]]/sqrt(thisvcov[index[1],index[1]])))
+                estimateLine[1,resultIndex+1] = estim_decoration(sprintf1(estimformat,thiscoef[index[1]]),pval)
                 if below_statistic == :tstat
-                    s = sprintf1(statisticformat, rr[resultIndex].coef[index[1]]/sqrt(rr[resultIndex].vcov[index[1],index[1]]))
+                    s = sprintf1(statisticformat, thiscoef[index[1]]/sqrt(thisvcov[index[1],index[1]]))
                     estimateLine[2,resultIndex+1] = below_decoration(s)
                 elseif below_statistic == :se
-                    s = sprintf1(statisticformat, sqrt(rr[resultIndex].vcov[index[1],index[1]]))
+                    s = sprintf1(statisticformat, sqrt(thisvcov[index[1],index[1]]))
                     estimateLine[2,resultIndex+1] = below_decoration(s)
                 elseif below_statistic == :blank
                     estimateLine[2,resultIndex+1] = "" # for the sake of completeness
@@ -142,7 +161,7 @@ function regtable(rr::AbstractRegressionResult...;
     regressandBlock = fill("", 1, numberOfResults+1)
     for rIndex = 1:numberOfResults
         # keep in mind that yname is a Symbol
-        regressandBlock[1,rIndex+1] = haskey(labels,string(rr[rIndex].yname)) ? labels[string(rr[rIndex].yname)] : string(rr[rIndex].yname)
+        regressandBlock[1,rIndex+1] = haskey(labels,string(yname(rr[rIndex]))) ? labels[string(yname(rr[rIndex]))] : string(yname(rr[rIndex]))
     end
 
     # Regression numbering block (if we do it)
@@ -232,9 +251,13 @@ function regtable(rr::AbstractRegressionResult...;
         estimatorBlock = fill("", 1, numberOfResults+1)
         estimatorBlock[1,1] = haskey(labels, "__LABEL_ESTIMATOR__") ? labels["__LABEL_ESTIMATOR__"] : renderSettings.label_estimator
         for i = 1:numberOfResults
-            estimatorBlock[1,i+1] = isIVRegressionResult(rr[i]) ?
-                (haskey(labels, "__LABEL_ESTIMATOR_IV__") ? labels["__LABEL_ESTIMATOR_IV__"] : renderSettings.label_estimator_iv) :
-                (haskey(labels, "__LABEL_ESTIMATOR_OLS__") ? labels["__LABEL_ESTIMATOR_OLS__"] : renderSettings.label_estimator_ols)
+            if isOLSRegressionResult(rr[i])
+                estimatorBlock[1,i+1] =  haskey(labels, "__LABEL_ESTIMATOR_OLS__") ? labels["__LABEL_ESTIMATOR_OLS__"] : renderSettings.label_estimator_ols
+            elseif isIVRegressionResult(rr[i])
+                estimatorBlock[1,i+1] =  haskey(labels, "__LABEL_ESTIMATOR_IV__") ? labels["__LABEL_ESTIMATOR_IV__"] : renderSettings.label_estimator_iv
+            else
+                estimatorBlock[1,i+1] =  haskey(labels, "__LABEL_ESTIMATOR_NL__") ? labels["__LABEL_ESTIMATOR_NL__"] : renderSettings.label_estimator_nl
+            end
         end
     end
 
@@ -248,12 +271,12 @@ function regtable(rr::AbstractRegressionResult...;
             if regression_statistics[i] == :nobs
                 statisticBlock[i,1] = haskey(labels, "__LABEL_STATISTIC_N__") ? labels["__LABEL_STATISTIC_N__"] : renderSettings.label_statistic_n
                 for resultIndex = 1:numberOfResults
-                    statisticBlock[i,resultIndex+1] = sprintf1("%i",rr[resultIndex].nobs)
+                    statisticBlock[i,resultIndex+1] = sprintf1("%i",nobs(rr[resultIndex]))
                 end
             elseif regression_statistics[i] == :r2
                 statisticBlock[i,1] = haskey(labels, "__LABEL_STATISTIC_R2__") ? labels["__LABEL_STATISTIC_R2__"] : renderSettings.label_statistic_r2
                 for resultIndex = 1:numberOfResults
-                    statisticBlock[i,resultIndex+1] = sprintf1(statisticformat, rr[resultIndex].r2)
+                    statisticBlock[i,resultIndex+1] = isnan(ther2(rr[resultIndex])) ? "" : sprintf1(statisticformat, ther2(rr[resultIndex]))
                 end
             elseif regression_statistics[i] == :r2_a
                 statisticBlock[i,1] = haskey(labels, "__LABEL_STATISTIC_R2_A__") ? labels["__LABEL_STATISTIC_R2_A__"] : renderSettings.label_statistic_r2_a
@@ -330,7 +353,7 @@ function regtable(rr::AbstractRegressionResult...;
         try
             outstream = open(renderSettings.outfile, "w")
         catch ex
-            error("Error opening file $(renderSettings.outfile): $(ex.msg)")
+            error("Error opening file $(renderSettings.outfile): $(ex)")
         end
     end
 

diff --git a/src/RenderSettings.jl → src/rendersettings.jl b/src/RenderSettings.jl → src/rendersettings.jl
@@ -29,6 +29,7 @@ struct RenderSettings
     label_estimator::String # label for the Estimator block. Override with __LABEL_ESTIMATOR__
     label_estimator_ols::String # label for the Estimator block. Override with __LABEL_ESTIMATOR_OLS__
     label_estimator_iv::String # label for the Estimator block. Override with __LABEL_ESTIMATOR_IV__
+    label_estimator_nl::String # label for the Estimator block. Override with __LABEL_ESTIMATOR_NL__
 
     outfile::String    # file to print output into.
                        # if empty, print to STDOUT.

diff --git a/src/rendersettings/ascii.jl b/src/rendersettings/ascii.jl
@@ -26,6 +26,7 @@ function asciiOutput(outfile::String = "")
     label_estimator = "Estimator"
     label_estimator_ols = "OLS"
     label_estimator_iv = "IV"
+    label_estimator_nl = "NL"
 
     foutfile = outfile
     encapsulateRegressand = asciiRegressandTransform
@@ -35,6 +36,6 @@ function asciiOutput(outfile::String = "")
         label_fe_yes, label_fe_no,
         label_statistic_n, label_statistic_r2, label_statistic_r2_a, label_statistic_r2_within,
         label_statistic_f, label_statistic_p, label_statistic_f_kp, label_statistic_p_kp, label_statistic_dof,
-        label_estimator, label_estimator_ols, label_estimator_iv,
+        label_estimator, label_estimator_ols, label_estimator_iv, label_estimator_nl,
         foutfile, encapsulateRegressand, header, footer)
 end
diff --git a/src/rendersettings/latex.jl b/src/rendersettings/latex.jl
@@ -35,6 +35,8 @@ function latexOutput(outfile::String = "")
     label_estimator = "Estimator"
     label_estimator_ols = "OLS"
     label_estimator_iv = "IV"
+    label_estimator_nl = "NL"
+
 
     foutfile = outfile
     encapsulateRegressand = latexRegressandTransform
@@ -44,6 +46,6 @@ function latexOutput(outfile::String = "")
         label_fe_yes, label_fe_no,
         label_statistic_n, label_statistic_r2, label_statistic_r2_a, label_statistic_r2_within,
         label_statistic_f, label_statistic_p, label_statistic_f_kp, label_statistic_p_kp, label_statistic_dof,
-        label_estimator, label_estimator_ols, label_estimator_iv,
+        label_estimator, label_estimator_ols, label_estimator_iv, label_estimator_nl,
         foutfile, encapsulateRegressand, header, footer)
 end
diff --git a/src/util/util.jl b/src/util/util.jl
@@ -1,3 +1,9 @@
 # functions that classify regression results
 isFERegressionResult(r::AbstractRegressionResult) = isa(r,RegressionResultFE) || isa(r,RegressionResultFEIV)
 isIVRegressionResult(r::AbstractRegressionResult) = isa(r,RegressionResultIV) || isa(r,RegressionResultFEIV)
+isOLSRegressionResult(r::AbstractRegressionResult) = !isIVRegressionResult(r)
+
+# FE and IV regression not supported in GLM.jl
+isFERegressionResult(r::DataFrameRegressionModel) = false
+isIVRegressionResult(r::DataFrameRegressionModel) = false
+isOLSRegressionResult(r::DataFrameRegressionModel) = isa(r.model, LinearModel)
diff --git a/test/RegressionTables.jl b/test/RegressionTables.jl
@@ -1,27 +1,44 @@
-using RegressionTables, FixedEffectModels, RDatasets, Base.Test
+using RegressionTables, FixedEffectModels, GLM, RDatasets, Base.Test
 
 df = dataset("datasets", "iris")
 df[:SpeciesDummy] = pool(df[:Species])
 df[:isSmall] = pool(df[:SepalWidth] .< 2.9)
 
+# FixedEffectModels.jl
 rr1 = reg(df, @model(SepalLength ~ SepalWidth))
 rr2 = reg(df, @model(SepalLength ~ SepalWidth + PetalLength   , fe = SpeciesDummy))
 rr3 = reg(df, @model(SepalLength ~ SepalWidth + PetalLength + PetalWidth  , fe = SpeciesDummy  + isSmall))
 rr4 = reg(df, @model(SepalWidth ~ SepalLength + PetalLength + PetalWidth  , fe = SpeciesDummy))
 rr5 = reg(df, @model(SepalWidth ~ SepalLength + (PetalLength ~ PetalWidth)  , fe = SpeciesDummy))
 
+# GLM.jl
+dobson = DataFrame(Counts = [18.,17,15,20,10,20,25,13,12],
+    Outcome = pool(repeat(["A", "B", "C"], outer = 3)),
+    Treatment = pool(repeat(["a","b", "c"], inner = 3)))
+
+lm1 = fit(LinearModel, @formula(SepalLength ~ SepalWidth), df)
+lm2 = fit(LinearModel, @formula(SepalLength ~ SepalWidth + PetalWidth), df)
+gm1 = fit(GeneralizedLinearModel, @formula(Counts ~ 1 + Outcome), dobson,
+              Poisson())
 
 function checkfilesarethesame(file1::String, file2::String)
 
-    f1 = open(file1)
-    f2 = open(file2)
+    f1 = open(file1, "r")
+    f2 = open(file2, "r")
 
     s1 = readstring(f1)
     s2 = readstring(f2)
 
     close(f1)
     close(f2)
 
+    # Character-by-character comparison
+    for i=1:length(s1)
+        if s1[i]!=s2[i]
+            println("Character $(i) different: $(s1[i]) $(s2[i])")
+        end
+    end
+
     if s1 == s2
         return true
     else
@@ -63,6 +80,11 @@ end
 regtable(rr1,rr2,rr3,rr5; renderSettings = asciiOutput(joinpath(dirname(@__FILE__), "tables", "test1.txt")), regression_statistics = [:nobs, :r2, :r2_a, :r2_within, :f, :p, :f_kp, :p_kp, :dof])
 @test checkfilesarethesame(joinpath(dirname(@__FILE__), "tables", "test1.txt"), joinpath(dirname(@__FILE__), "tables", "test1_reference.txt"))
 
+regtable(lm1, lm2, gm1; renderSettings = asciiOutput(joinpath(dirname(@__FILE__), "tables", "test3.txt")), regression_statistics = [:nobs, :r2])
+@test checkfilesarethesame(joinpath(dirname(@__FILE__), "tables", "test3.txt"), joinpath(dirname(@__FILE__), "tables", "test3_reference.txt"))
+
+
+
 # LATEX TABLES
 
 # # default
@@ -97,6 +119,12 @@ regtable(rr1,rr2,rr3,rr5; renderSettings = asciiOutput(joinpath(dirname(@__FILE_
 regtable(rr1,rr2,rr3,rr5; renderSettings = latexOutput(joinpath(dirname(@__FILE__), "tables", "test2.tex")), regression_statistics = [:nobs, :r2, :r2_a, :r2_within, :f, :p, :f_kp, :p_kp, :dof])
 @test checkfilesarethesame(joinpath(dirname(@__FILE__), "tables", "test2.tex"), joinpath(dirname(@__FILE__), "tables", "test2_reference.tex"))
 
+regtable(lm1, lm2, gm1; renderSettings = latexOutput(joinpath(dirname(@__FILE__), "tables", "test4.tex")), regression_statistics = [:nobs, :r2])
+@test checkfilesarethesame(joinpath(dirname(@__FILE__), "tables", "test4.tex"), joinpath(dirname(@__FILE__), "tables", "test4_reference.tex"))
+
+
 # clean up
 rm(joinpath(dirname(@__FILE__), "tables", "test1.txt"))
 rm(joinpath(dirname(@__FILE__), "tables", "test2.tex"))
+rm(joinpath(dirname(@__FILE__), "tables", "test3.txt"))
+rm(joinpath(dirname(@__FILE__), "tables", "test4.tex"))
-Original file line number
+Diff line change
@@ Expand Up / @@ -2,7 +2,6 @@ @@
     language: julia
     julia:
       - 0.6
-      - nightly
     notifications:
       email: false
     git:
@@ Expand Down @@