Skip to content

Commit

Permalink
Enable color for Stat.qq (#1434)
Browse files Browse the repository at this point in the history
  • Loading branch information
Mattriks authored May 3, 2020
1 parent b231f1b commit 40c708b
Show file tree
Hide file tree
Showing 7 changed files with 103 additions and 69 deletions.
5 changes: 3 additions & 2 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,9 @@ Each release typically has a number of minor bug fixes beyond what is listed her

# Version 1.x

* Add `Geom.bar(position=:identity)` + alpha enabled (#1428)
* Enable stacked guides (#1423)
* Enable `color` aesthetic for `Stat.qq` (#1434)
* Add `Geom.bar(position=:identity)` + alpha enabled (#1428)
* Enable stacked guides (#1423)



Expand Down
19 changes: 14 additions & 5 deletions docs/src/gallery/statistics.md
Original file line number Diff line number Diff line change
Expand Up @@ -84,12 +84,21 @@ plot(y=[sigmoid, x->sigmoid(x+2)], xmin=[-10], xmax=[10],
## [`Stat.qq`](@ref)

```@example
using Gadfly, Distributions, Random
using Distributions, Gadfly, RDatasets
set_default_plot_size(21cm, 8cm)
Random.seed!(1234)
p1 = plot(x=rand(Normal(), 100), y=rand(Normal(), 100), Stat.qq, Geom.point)
p2 = plot(x=rand(Normal(), 100), y=Normal(), Stat.qq, Geom.point)
hstack(p1,p2)
iris, geyser = dataset.("datasets", ["iris", "faithful"])
df = by(iris, :Species, d=:SepalLength=>x->fit(Normal, x))
ds2 = fit.([Normal, Uniform], [geyser.Eruptions])
yeqx(x=4:6) = layer(x=x, Geom.abline(color="gray80"))
xylabs = [Guide.xlabel("Theoretical q"), Guide.ylabel("Sample q")]
p1 = plot(df, x=:d, y=iris[:,1], color=:Species, Stat.qq, yeqx(4:8),
xylabs..., Guide.title("3 Samples, 1 Distribution"))
p2 = plot(geyser, x=ds2, y=:Eruptions, color=["Normal","Uniform"], Stat.qq,
yeqx(0:6), xylabs..., Guide.title("1 Sample, 2 Distributions"),
Theme(discrete_highlight_color=c->nothing, alphas=[0.5], point_size=2pt)
)
hstack(p1, p2)
```

## [`Stat.smooth`](@ref)
Expand Down
1 change: 1 addition & 0 deletions src/Gadfly.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1220,6 +1220,7 @@ classify_data(data::CategoricalArray) = :categorical
classify_data(data::T) where {T <: Base.Callable} = :functional
classify_data(data::AbstractArray) = :numerical
classify_data(data::Distribution) = :distribution
classify_data(data::Vector{<:Distribution}) = :distribution

function classify_data(data::AbstractArray{Any})
for val in data
Expand Down
2 changes: 1 addition & 1 deletion src/mapping.jl
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ end
# Evaluate one mapping.
evalmapping(source, arg::AbstractArray) = arg
evalmapping(source, arg::Function) = arg
evalmapping(source, arg::Distribution) = arg
evalmapping(source, arg::Distribution) = [arg]

evalmapping(source::MeltedData, arg::Integer) = source.melted_data[:,source.colmap[arg]]
evalmapping(source::MeltedData, arg::Col.GroupedColumn) = source.col_indicators[:,source.colmap[arg]]
Expand Down
2 changes: 1 addition & 1 deletion src/scale.jl
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,7 @@ function apply_scale(scale::ContinuousScale,

# special case for Distribution values bound to :x or :y. wait for
# scale to be re-applied by Stat.qq
if in(var, [:x, :y]) && typeof(vals) <: Distribution
if in(var, [:x, :y]) && eltype(vals) <: Distribution
setfield!(aes, var, vals)
continue
end
Expand Down
115 changes: 65 additions & 50 deletions src/statistics.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1540,16 +1540,16 @@ struct QQStatistic <: Gadfly.StatisticElement end

input_aesthetics(::QQStatistic) = [:x, :y]
output_aesthetics(::QQStatistic) = [:x, :y]
default_scales(::QQStatistic) =
[Gadfly.Scale.x_continuous(), Gadfly.Scale.y_continuous]
default_scales(::QQStatistic) = [Scale.x_distribution(), Scale.y_distribution()]

"""
Stat.qq
Transform $(aes2str(input_aesthetics(qq()))) into cumulative distributions.
Transform $(aes2str(input_aesthetics(qq()))) into quantiles.
If each is a numeric vector, their sample quantiles will be compared. If one
is a `Distribution`, then its theoretical quantiles will be compared with the
sample quantiles of the other.
sample quantiles of the other. Optionally group using the `color` aesthetic.
`Stat.qq` uses function `qqbuild` from Distributions.jl.
"""
const qq = QQStatistic

Expand All @@ -1559,54 +1559,69 @@ function apply_statistic(stat::QQStatistic,
aes::Gadfly.Aesthetics)

Gadfly.assert_aesthetics_defined("Stat.qq", aes, :x, :y)
Gadfly.assert_aesthetics_undefined("State.qq", aes, :color)

# NOTES:
#
# apply_scales happens before apply_statistics, so we need to handle in
# apply_scales the Distributions that might be bound to x and y... By
# analogy with Stat.func, we can add a check in apply_statistic which defers
# application. Stat.func though requires an ARRAY of Functions, and doesn't
# work on naked functions bound to aes.y. If we want to bind Distributions,
# we'd need to extend the types that are allowed for aes.y/.x (e.g. change
# type of Aesthetics fields x and y). Right now these are of type
# NumericalOrCategoricalAesthetic. The .x and .y fields are the _only_
# place where this type is used, but I'm not sure if there's a reason that
# changing this typealias would be a bad idea...for now I've just used a
# direct `@compat(Union{NumericalOrCategoricalAesthetic, Distribution})`.
#
# TODO:
#
# Grouping by color etc.?

# a little helper function to convert either numeric or distribution
# variables to a format suitable to input to qqbuild.
toVecOrDist = v -> typeof(v) <: Distribution ? v : convert(Vector{Float64}, v)

# check and convert :x and :y to proper types for input to qqbuild
local xs, ys
try
(xs, ys) = map(toVecOrDist, (aes.x, aes.y))
catch e
error("Stat.qq requires that x and y be bound to either a Distribution or to arrays of plain numbers.")
end

qqq = qqbuild(xs, ys)
local n_distributions::Int

aes.x = qqq.qx
aes.y = qqq.qy

# apply_scale to Distribution-bound aesthetics is deferred, so re-apply here
# (but only for Distribution, numeric data is already scaled). Only one of
# :x or :y can be a Distribution since qqbuild will throw an error for two
# Distributions.
data = Gadfly.Data()
if typeof(xs) <: Distribution
data.x = aes.x
Scale.apply_scale(scales[:x], [aes], data)
elseif typeof(ys) <: Distribution
data.y = aes.y
Scale.apply_scale(scales[:y], [aes], data)
XT, YT = eltype(aes.x), eltype(aes.y)
for elt in (XT, YT)
if elt<:Distribution
elt<:UnivariateDistribution && continue
error("For Stat.qq, if using distributions, only UnivariateDistributions are supported (see Distributions.jl).")
end
end

colorflag = aes.color !== nothing
aes_color = colorflag ? aes.color : [nothing]
uc = unique(aes_color)
lx, ly = length(aes.x), length(aes.y)

n_distributions, dv, sv = if XT<:Distribution && lx>1
2, :x, :y
elseif YT<:Distribution && ly>1
2, :y, :x
elseif XT<:Distribution
1, :x, :y
elseif YT<:Distribution
1, :y, :x
else
0, :x, :y
end

dvar1, svar1 = getfield(aes, dv), getfield(aes, sv)
!colorflag && n_distributions<2 && (aes_color=fill(nothing, length(svar1)))

CT = eltype(aes_color)
dvar2, svar2, colorv = Float64[], Float64[], CT[]
if n_distributions==2
for (d,c) in Compose.cyclezip(dvar1, aes_color)
qqq = qqbuild(d, svar1)
append!(dvar2, qqq.qx)
append!(svar2, qqq.qy)
append!(colorv, fill(c, length(qqq.qx)))
end
elseif n_distributions==1
for c in uc
qqq = qqbuild(dvar1[1], svar1[aes_color.==c])
append!(dvar2, qqq.qx)
append!(svar2, qqq.qy)
append!(colorv, fill(c, length(qqq.qx)))
end
else
for c in uc
qqq = qqbuild(dvar1[aes_color.==c], svar1[aes_color.==c])
append!(dvar2, qqq.qx)
append!(svar2, qqq.qy)
append!(colorv, fill(c, length(qqq.qx)))
end
end

setfield!(aes, dv, dvar2)
setfield!(aes, sv, svar2)
colorflag && (aes.color = colorv)
if XT <: Distribution
Scale.apply_scale(scales[:x], [aes], Gadfly.Data(x=aes.x))
elseif YT <: Distribution
Scale.apply_scale(scales[:y], [aes], Gadfly.Data(y=aes.y))
end
end

Expand Down
28 changes: 18 additions & 10 deletions test/testscripts/stat_qq.jl
Original file line number Diff line number Diff line change
@@ -1,25 +1,33 @@
using Gadfly, Distributions, Random
using DataFrames, Gadfly, Distributions, Random

set_default_plot_size(6inch, 16inch)
set_default_plot_size(3.5inch, 16inch)

Random.seed!(1234)

x = rand(Normal(), 100)
y = rand(Normal(10), 100)

xd = Normal()
yd = Normal(10)

# two numeric vectors
pl1 = plot(x=x, y=y, Stat.qq, Geom.point)

# one numeric and one Distribution
pl2 = plot(x=x, y=yd, Stat.qq, Geom.point)
pl3 = plot(x=xd, y=y, Stat.qq, Geom.point)

# Apply different scales to x and y
pl4 = plot(x=x, y=exp.(y), Stat.qq, Geom.point, Scale.y_log10)
pl5= plot(x=exp.(x), y=y, Stat.qq, Geom.point, Scale.x_log10)
y = randn(100).+5
ds = fit.([Normal, LogNormal], [y])
df = [DataFrame(y=randn(100), g=g) for g in ["Sample1", "Sample2"]]

theme = Theme(discrete_highlight_color=c->nothing, alphas=[0.5],
point_size=2pt, key_position=:inside)
yeqx(x=-3:3) = layer(x=x, Geom.abline(color="gray80"))
gck = Guide.colorkey(title="")

# Plot title describes plots
pl2 = plot(x=Normal(), y=randn(100), Stat.qq, yeqx, theme,
Guide.title("1 sample, 1 Distribution"))
pl3 = plot(vcat(df...), x=Normal(), y=:y, Stat.qq, color=:g, yeqx,
gck, theme, Guide.title("2 samples, 1 Distribution"))
pl4 = plot(x=ds, y=y, color=["Normal", "LogNormal"], Stat.qq, yeqx(3:8),
gck, theme, Guide.title("1 sample, 2 Distributions"))

# Apply scales to Distributions
z = rand(Exponential(), 100)
Expand Down

0 comments on commit 40c708b

Please sign in to comment.