Use arrow format for datasets [ci skip] #382

Merged
4 commits merged on Oct 5, 2020
10 changes: 5 additions & 5 deletions Artifacts.toml
@@ -5,21 +5,21 @@
[TestData]
# compute this using
# using Tar, Inflate, SHA
# filename = "download?version=4" # I just used wget for the URL below and this is how it saved it
# filename = "download?version=2" # I just used wget for the URL below and this is how it saved it
# println("sha256: ", bytes2hex(open(sha256, filename)))
# println("git-tree-sha1: ", Tar.tree_hash(IOBuffer(inflate_gzip(filename))))
# from https://julialang.github.io/Pkg.jl/dev/artifacts/
git-tree-sha1 = "4d1410cd290622e426411273ef379fe82b749ca4"
git-tree-sha1 = "a3955a5f747d01e628944b1031b44e31f027ebef"
lazy = true

[[TestData.download]]
# this is the SHA from https://osf.io/pcjk6/?show=revision
sha256 = "b66369456c0ec9d1490d61d0c0686999e6422051295aeb9e14ab27693ccaec54"
# this is the SHA from https://osf.io/djaqb/download?version=2
sha256 = "b6273f0cfeb5b12e2afede33de6d68a8d926e7b684cf071c7622f1e6ef7aa64a"
# when updating this, make sure to change the version number,
# because if the version number isn't included, it will always point to the
# latest version, which means it will break existing users when we update
# between releases.
url = "https://osf.io/pcjk6/download?version=4"
url = "https://osf.io/djaqb/download?version=2"

# for future work on using xz-compressed data:
# Julia invokes wget without using HTTP metadata, so we need the link
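For reference, the hash recipe in the comments above assembles into a short script; it assumes the new tarball has already been fetched with wget and saved under the name shown (`download?version=2`):

    using Tar, Inflate, SHA

    filename = "download?version=2"   # how wget saves https://osf.io/djaqb/download?version=2
    println("sha256: ", bytes2hex(open(sha256, filename)))
    println("git-tree-sha1: ", Tar.tree_hash(IOBuffer(inflate_gzip(filename))))

The printed values are what goes into the `sha256` and `git-tree-sha1` fields of the `[TestData]` stanza.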
4 changes: 2 additions & 2 deletions Project.toml
@@ -4,9 +4,9 @@ author = ["Phillip Alday <[email protected]>", "Douglas Bates <dmbates@gmail.
version = "3.0.0-DEV"

[deps]
Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45"
BlockArrays = "8e7c35d0-a365-5155-bbbb-fb81a777f24e"
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
Feather = "becb17da-46f6-5d3c-ad1b-1c5fe96bc73c"
GLM = "38e38edf-8417-5370-95a0-9cbb8c7f171a"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
NLopt = "76087f3c-5699-56af-9a33-bf431cd00edd"
@@ -24,9 +24,9 @@ StatsModels = "3eaba693-59b7-5ba5-a881-562e759f1c8d"
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"

[compat]
Arrow = "0.3"
BlockArrays = "0.11, 0.12"
Distributions = "0.21, 0.22, 0.23"
Feather = "0.5"
GLM = "1"
NLopt = "0.5, 0.6"
NamedArrays = "0.9"
2 changes: 1 addition & 1 deletion src/MixedModels.jl
@@ -1,8 +1,8 @@
module MixedModels

using Arrow
using BlockArrays
using Distributions
using Feather
using GLM
using LinearAlgebra
using NamedArrays
3 changes: 2 additions & 1 deletion src/linearmixedmodel.jl
@@ -68,7 +68,8 @@ function LinearMixedModel(
y, Xs = modelcols(form, tbl)

y = reshape(float(y), (:, 1)) # y as a floating-point matrix
T = eltype(y)
T = promote_type(Float64, eltype(y)) # ensure that eltype of model matrices is at least Float64

Review comment (Member):
Should we mention this in the docs anywhere? In some areas, there is a tendency to use half precision to speed things up (although that matters less for x86-64 and modern ARM, it can make a difference on GPU, which we of course don't use).

Review comment (Member):
(this is my last comment, then I'm happy to merge)

y = convert(Matrix{T}, y)

reterms = ReMat{T}[]
feterms = FeMat{T}[]
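To make the promotion above concrete, here is a minimal sketch; the `Float16` response is hypothetical, echoing the reviewer's half-precision scenario:

    y16 = Float16[1.0, 2.5, 3.0]             # hypothetical half-precision response
    y = reshape(float(y16), (:, 1))          # y as a floating-point matrix, still Float16
    T = promote_type(Float64, eltype(y))     # Float64: model matrices are at least Float64
    y = convert(Matrix{T}, y)                # promoted copy used to build the model
    @assert eltype(y) === Float64

With the old `T = eltype(y)`, `T` would have stayed `Float16` here.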
8 changes: 4 additions & 4 deletions src/utilities.jl
@@ -138,20 +138,20 @@ function replicate(f::Function, n::Integer; use_threads=false)
results
end

cacheddatasets = Dict{String,Any}()
cacheddatasets = Dict{String, Arrow.Table}()
"""
dataset(nm)

Return the data frame of test data set named `nm`, which can be a `String` or `Symbol`
"""
function dataset(nm::AbstractString)
get!(cacheddatasets, nm) do
path = joinpath(TestData, nm * ".feather")
path = joinpath(TestData, nm * ".arrow")
if !isfile(path)
throw(ArgumentError(
"Dataset \"$nm\" is not available.\nUse MixedModels.datasets() for available names."))
end
Feather.read(path)
Arrow.Table(path)
end
end
dataset(nm::Symbol) = dataset(string(nm))
@@ -161,7 +161,7 @@ dataset(nm::Symbol) = dataset(string(nm))

Return a vector of names of the available test data sets
"""
datasets() = first.(Base.Filesystem.splitext.(filter(Base.Fix2(endswith, ".feather"), readdir(TestData))))
datasets() = first.(Base.Filesystem.splitext.(filter(endswith(".arrow"), readdir(TestData))))


"""
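As the test changes below exercise, `dataset` now returns a cached `Arrow.Table` rather than a data frame. A small usage sketch (the column names are those of the bundled `dyestuff` data; the `DataFrame` wrap is only needed when a test wants to add or replace columns):

    using DataFrames, MixedModels

    MixedModels.datasets()               # names of the available test data sets
    ds = MixedModels.dataset(:dyestuff)  # Arrow.Table; cached, so repeated calls reuse it
    keys(ds)                             # [:batch, :yield]
    ds.yield                             # column by property access ...
    ds[:yield]                           # ... or by Symbol indexing
    float(ds[:yield])                    # materialize as a Vector{Float64}, e.g. for refit!
    df = DataFrame(ds)                   # wrap in a DataFrame to add or replace columns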
14 changes: 8 additions & 6 deletions test/FactorReTerm.jl
@@ -1,3 +1,4 @@
using DataFrames
using LinearAlgebra
using MixedModels
using Random
@@ -98,8 +99,9 @@ end
end

@testset "Categorical Blocking Variable" begin
# deepcopy because we're going to modify it
slp = deepcopy(dataset("sleepstudy"))
# deepcopy because we're going to modify it. Don't need the copy if dataset returns an Arrow.Table
#slp = deepcopy(DataFrame(dataset("sleepstudy")))
slp = DataFrame(dataset("sleepstudy"))
contrasts = Dict{Symbol,Any}()
f = @formula(reaction ~ 1 + (1|subj))

@@ -203,13 +205,13 @@ end
f1 = @formula(rt_trunc ~ 1 + (1 + prec + load | spkr))
ff1 = apply_schema(f1, sch, MixedModel)

retrm = ff1.rhs[end]
@test retrm.lhs.terms[end].contrasts.contrasts isa DummyCoding
retrm = last(ff1.rhs)
@test last(retrm.lhs.terms).contrasts.contrasts isa DummyCoding

f2 = @formula(rt_trunc ~ 1 + (1 + prec | spkr) + (0 + load | spkr))
ff2 = apply_schema(f2, sch, MixedModel)

retrm2 = ff2.rhs[end]
@test retrm2.lhs.terms[end].contrasts.contrasts isa DummyCoding
retrm2 = last(ff2.rhs)
@test last(retrm2.lhs.terms).contrasts.contrasts isa DummyCoding
end
end
2 changes: 1 addition & 1 deletion test/bootstrap.jl
@@ -65,4 +65,4 @@ end
@test sort(columntable(bsamp_threaded.β).β) == sort(columntable(bsamp.β).β)
@test sum(issingular(bsamp)) == sum(issingular(bsamp_threaded))
end
end
end
10 changes: 6 additions & 4 deletions test/missing.jl
@@ -1,8 +1,10 @@
using MixedModels, Test
using DataFrames
using MixedModels
using Test

# deepcopy because we're going to modify it
slp = deepcopy(MixedModels.dataset(:sleepstudy))
slp[!,:days] = Array{Union{Missing, Float64},1}(slp[!,:days])
# convert to DataFrame to modify it
slp = DataFrame(MixedModels.dataset(:sleepstudy))
allowmissing!(slp, :days)
slp[1,:days] = missing

# TODO: re-enable this test when better missing support has landed in StatsModels
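A quick sketch of what the `allowmissing!` call above buys, on a toy column rather than the real sleepstudy table:

    using DataFrames

    df = DataFrame(days = [0.0, 1.0, 2.0])   # eltype(df.days) == Float64
    # df[1, :days] = missing                 # would error: the column cannot hold missing
    allowmissing!(df, :days)                 # widen the column to Union{Missing, Float64}
    df[1, :days] = missing                   # now allowed

This replaces the old pattern of manually converting the column to `Array{Union{Missing, Float64}, 1}`.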
6 changes: 4 additions & 2 deletions test/pirls.jl
@@ -1,4 +1,6 @@
using MixedModels, Tables
using DataFrames
using MixedModels
using Tables
using Test

using MixedModels: dataset
@@ -81,7 +83,7 @@ end

@testset "grouseticks" begin
center(v::AbstractVector) = v .- (sum(v) / length(v))
grouseticks = dataset(:grouseticks)
grouseticks = DataFrame(dataset(:grouseticks))
grouseticks.ch = center(grouseticks.height)
gm4 = fit(MixedModel, only(gfms[:grouseticks]), grouseticks, Poisson(), fast=true) # fails in pirls! with fast=false
@test isapprox(deviance(gm4), 851.4046, atol=0.001)
9 changes: 5 additions & 4 deletions test/pls.jl
@@ -70,15 +70,15 @@ include("modelcache.jl")
@test fm1.σ ≈ 49.510099986291145 atol=1.e-5
@test fm1.X == ones(30,1)
ds = MixedModels.dataset(:dyestuff)
@test fm1.y == ds[!, :yield]
@test fm1.y == ds[:yield]
@test cond(fm1) == ones(1)
@test first(leverage(fm1)) ≈ 0.15650534392640486 rtol=1.e-5
@test sum(leverage(fm1)) ≈ 4.695160317792145 rtol=1.e-5
cm = coeftable(fm1)
@test length(cm.rownms) == 1
@test length(cm.colnms) == 4
@test fnames(fm1) == (:batch,)
@test response(fm1) == ds[!, :yield]
@test response(fm1) == ds[:yield]
rfu = ranef(fm1, uscale = true)
rfb = ranef(fm1)
@test abs(sum(rfu[1])) < 1.e-5
@@ -128,7 +128,7 @@ end
@test coef(fm) ≈ [5.6656]
@test logdet(fm) ≈ 0.0
@test issingular(fm)
refit!(fm, float(MixedModels.dataset(:dyestuff)[!, :yield]))
refit!(fm, float(MixedModels.dataset(:dyestuff)[:yield]))
@test objective(fm) ≈ 327.3270598811428 atol=0.001
end

@@ -336,7 +336,7 @@ end
@test logdet(fm_ind) ≈ logdet(fmnc)

# combining [ReMat{T,S1}, ReMat{T,S2}] for S1 ≠ S2
slpcat = categorical!(deepcopy(slp), [:days])
slpcat = categorical!(DataFrame(slp), [:days])
fm_cat = fit(MixedModel, @formula(reaction ~ 1+days+(1|subj)+(0+days|subj)),slpcat)
@test fm_cat isa LinearMixedModel
σρ = fm_cat.σρs
@@ -394,6 +394,7 @@ end
end

@testset "kb07" begin
global io
pca = last(models(:kb07)).PCA
@test keys(pca) == (:subj, :item)
show(io, models(:kb07)[2])
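The `slpcat` line in the hunk above converts the numeric `days` column to a categorical one so the model combines random-effects terms of different sizes. A sketch using the same call as the diff; the `categorical!` form is the DataFrames API in use at the time of this PR, and the fit line is left commented because it is only there to show where the converted table goes:

    using DataFrames, MixedModels

    slp = DataFrame(MixedModels.dataset(:sleepstudy))   # mutable copy of the Arrow.Table
    slpcat = categorical!(slp, [:days])                  # days becomes a categorical column
    # fit(MixedModel, @formula(reaction ~ 1 + days + (1|subj) + (0 + days|subj)), slpcat)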
7 changes: 5 additions & 2 deletions test/utilities.jl
@@ -48,7 +48,10 @@ end

@testset "datasets" begin
@test isa(MixedModels.datasets(), Vector{String})
@test size(MixedModels.dataset(:dyestuff)) == (30, 2)
@test size(MixedModels.dataset("dyestuff")) == (30, 2)
@test length(MixedModels.dataset(:dyestuff)) == 2
@test length(MixedModels.dataset("dyestuff")) == 2
dyestuff = MixedModels.dataset(:dyestuff);
@test keys(dyestuff) == [:batch, :yield]
@test length(dyestuff.batch) == 30
@test_throws ArgumentError MixedModels.dataset(:foo)
end