-
Notifications
You must be signed in to change notification settings - Fork 44
/
Copy pathdata.jl
407 lines (324 loc) · 13.3 KB
/
data.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
# SPLITTING DATA SETS
# Helper function for partitioning in the non-stratified case
function _partition(rows, fractions, ::Nothing)
    # Cut `rows` into `length(fractions) + 1` consecutive chunks. Chunk `k`
    # (for each supplied fraction) holds `round(fractions[k] * length(rows))`
    # entries, bumped up to one entry (with a warning) if that rounds to zero;
    # the last chunk runs from wherever the previous one stopped to the end.
    n_rows = length(rows)
    n_splits = length(fractions) + 1
    # bounds[k] == (first position, last position) of the k-th chunk
    bounds = Vector{Tuple{Int,Int}}(undef, n_splits)
    first_pos = 1
    for (k, frac) in enumerate(fractions)
        n_take = round(Int, frac * n_rows)
        if iszero(n_take)
            @warn "A split has only one element."
            n_take = 1
        end
        last_pos = first_pos + n_take - 1
        bounds[k] = (first_pos, last_pos)
        # the next chunk begins right after this one
        first_pos = last_pos + 1
    end
    # nothing left over for the final chunk: fall back to the very last row
    if first_pos > n_rows
        @warn "Last vector in the split has only one element."
        first_pos = n_rows
    end
    bounds[end] = (first_pos, n_rows)
    return Tuple(rows[lo:hi] for (lo, hi) in bounds)
end
# Bring a stratification vector into a form whose entries can be compared
# directly when grouping rows by class.
#
# - Fallback: any element type not handled below is rejected with an
#   `ArgumentError`.
# - Real-valued vectors (optionally containing `missing`) are returned as-is.
# - Categorical vectors (optionally containing `missing`) are mapped through
#   `int` element-wise (presumably yielding the integer codes of the
#   categorical values; `int` is defined outside this block).
# - All-`missing` vectors match both of the two previous signatures, which
#   would otherwise raise a method-ambiguity error; they get their own method
#   and are returned as-is.
_make_numerical(::AbstractVector) =
    throw(ArgumentError("`stratify` must have `Count`, `Continuous` "*
                        "or `Finite` element scitype. Consider "*
                        "`coerce(stratify, Finite)`. "))
_make_numerical(v::AbstractVector{<:Union{Missing,Real}}) = v
_make_numerical(v::AbstractVector{<:Union{Missing,CategoricalValue}}) =
    int.(v)
_make_numerical(v::AbstractVector{<:Missing}) = v
# Helper function for partitioning in the stratified case
function _partition(rows, fractions, raw_stratify::AbstractVector)
    # Stratified split: each class found in the stratification vector is
    # distributed over the splits according to `fractions`, and the final
    # split collects whatever remains of every class.
    stratify = _make_numerical(raw_stratify)
    n_rows = length(rows)
    if length(stratify) != n_rows
        throw(ArgumentError("The stratification vector must "*
                            "have as many entries as " *
                            "the rows to partition."))
    end
    classes = unique(stratify)
    # One index vector per class. `===` rather than `==` so that `missing`
    # compares equal to itself and forms a class of its own.
    # NOTE(review): the inner loop iterates over the *values* of `rows` and
    # uses them to index `rows` again, which looks like it assumes `rows` is a
    # permutation of `1:length(rows)` -- confirm before relying on other inputs.
    class_idxs = [[i for i in rows if stratify[rows[i]] === c] for c in classes]
    # size of each class and its share of all rows
    class_counts = length.(class_idxs)
    class_props = class_counts ./ n_rows
    # alloc[r, k]: how many members of class k go to split r; the appended
    # last row is the per-class remainder
    alloc = round.(Int, n_rows * fractions * class_props')
    alloc = vcat(alloc, class_counts' .- sum(alloc, dims=1))
    # warn if any (split, class) cell holds one element or fewer
    if any(e -> e <= 1, alloc)
        @warn "Some splits have a single or no representative of some class."
    end
    split_rows = []
    # heads[k]: next unused position within class_idxs[k]
    heads = ones(Int, length(classes))
    for r in axes(alloc, 1)
        tails = heads .+ alloc[r, :] .- 1
        # gather this split's share of every class ...
        picked = vcat((class_idxs[k][heads[k]:tails[k]] for k in eachindex(classes))...)
        # ... and restore ascending index order before selecting the rows
        push!(split_rows, rows[sort(picked)])
        heads .= tails .+ 1
    end
    # second check, this time on the total length of each split
    if any(s -> length(s) <= 1, split_rows)
        @warn "Some splits have a single or no representative of some class."
    end
    return Tuple(split_rows)
end
"""
partition(X, fractions...;
shuffle=nothing,
rng=Random.GLOBAL_RNG,
stratify=nothing)
Splits the vector or matrix `X` into a tuple of vectors or matrices
whose vertical concatenation is `X`. The number of rows in each
component of the return value is determined by the corresponding
`fractions` of `nrows(X)`, where valid fractions are in (0,1)
and sum up to less than one. The last fraction is not provided, as it
is inferred from the preceding ones.
`X` can also be any object which implements the `Tables.jl` interface
according to `Tables.istable`.
So, for example,
julia> partition(1:1000, 0.8)
([1,...,800], [801,...,1000])
julia> partition(1:1000, 0.2, 0.7)
([1,...,200], [201,...,900], [901,...,1000])
julia> partition(reshape(1:10, 5, 2), 0.2, 0.4)
([1 6], [2 7; 3 8], [4 9; 5 10])
X, _ = make_regression() # a table
Xtrain, Xtest = partition(X, 0.8) # the table split on rows
## Keywords
* `shuffle=nothing`: if set to `true`, shuffles the rows before taking fractions.
* `rng=Random.GLOBAL_RNG`: specifies the random number generator to be used, can be an integer
seed. If `rng` is specified and `shuffle === nothing`, then `shuffle` is interpreted as `true`.
* `stratify=nothing`: if a vector is specified, the partition will match the stratification
of the given vector. In that case, `shuffle` cannot be `false`.
"""
function partition(X, fractions...; kwargs...)
    # Generic method for matrices and tables; anything else is rejected.
    if !(X isa AbstractMatrix || Tables.istable(X))
        throw(ArgumentError("Function `partition` only supports AbstractVector, " *
                            "AbstractMatrix or containers implementing the Tables interface."))
    end
    # Partition the row indices first, then pull the matching rows out of `X`
    # with `selectrows`, one piece per index set.
    row_splits = partition(1:nrows(X), fractions...; kwargs...)
    return Tuple(selectrows(X, p) for p in row_splits)
end
function partition(rows::AbstractVector, fractions::Real...;
                   shuffle::Union{Nothing,Bool}=nothing, rng=Random.GLOBAL_RNG,
                   stratify::Union{Nothing,AbstractVector}=nothing)
    # Row-vector method: validate `fractions`, settle the rng/shuffle options,
    # optionally shuffle, then hand over to `_partition`.

    # work on a materialised copy (ranges are collected), so that shuffling
    # never touches the caller's object
    row_vec = collect(rows)

    # each fraction must lie strictly inside (0, 1) and their total must
    # leave room for the final, implicit split
    fractions_ok = all(e -> 0 < e < 1, fractions) && sum(fractions) < 1
    fractions_ok ||
        throw(DomainError(fractions, "Fractions must be in (0, 1) with sum < 1."))

    # an integer `rng` is a seed for a fresh Mersenne twister
    generator = rng isa Integer ? MersenneTwister(rng) : rng

    # a non-default generator implies shuffling unless `shuffle` was set explicitly
    if generator != Random.GLOBAL_RNG && shuffle === nothing
        shuffle = true
    end
    if shuffle === true
        shuffle!(generator, row_vec)
    end

    return _partition(row_vec, collect(fractions), stratify)
end
"""
t1, t2, ...., tk = unpack(table, f1, f2, ... fk;
wrap_singles=false,
shuffle=false,
rng::Union{AbstractRNG,Int,Nothing}=nothing)
Horizontally split any Tables.jl compatible `table` into smaller
tables (or vectors) `t1, t2, ..., tk` by making column selections
**without replacement** by successively applying the column name
filters `f1`, `f2`, ..., `fk`. A *filter* is any object `f` such that
`f(name)` is `true` or `false` for each column `name::Symbol` of
`table`. For example, use the filter `_ -> true` to pick up all
remaining columns of the table.
Whenever a returned table contains a single column, it is converted to
a vector unless `wrap_singles=true`.
Scientific type conversions can be optionally specified (note
semicolon):
unpack(table, t...; col1=>scitype1, col2=>scitype2, ... )
If `shuffle=true` then the rows of `table` are first shuffled, using
the global RNG, unless `rng` is specified; if `rng` is an integer, it
specifies the seed of an automatically generated Mersenne twister. If
`rng` is specified then `shuffle=true` is implicit.
### Example
```
julia> table = DataFrame(x=[1,2], y=['a', 'b'], z=[10.0, 20.0], w=["A", "B"])
julia> Z, XY = unpack(table, ==(:z), !=(:w);
:x=>Continuous, :y=>Multiclass)
julia> XY
2×2 DataFrame
│ Row │ x │ y │
│ │ Float64 │ Categorical… │
├─────┼─────────┼──────────────┤
│ 1 │ 1.0 │ 'a' │
│ 2 │ 2.0 │ 'b' │
julia> Z
2-element Array{Float64,1}:
10.0
20.0
```
"""
function unpack(X, tests...;
                wrap_singles=false,
                shuffle=nothing,
                rng=nothing, pairs...)
    # settle the shuffle/rng options (the rules live in `shuffle_and_rng`)
    shuffle, rng = shuffle_and_rng(shuffle, rng)
    if shuffle
        X = selectrows(X, Random.shuffle(rng, 1:nrows(X)))
    end

    # apply any `column => scitype` coercions before selecting columns
    Xfixed = isempty(pairs) ? X : coerce(X, pairs...)

    unpacked = Any[]
    names_left = collect(schema(Xfixed).names)
    # running record of every selection, included in the error message below
    history = ""
    for (counter, c) in enumerate(tests)
        names = filter(c, names_left)
        # selection is without replacement: picked names are no longer on offer
        filter!(!in(names), names_left)
        history *= "selection $counter: $names\n remaining: $names_left\n"
        if isempty(names)
            error("Empty column selection encountered at selection $counter"*
                  "\n$history")
        end
        # a lone column is requested by bare name unless wrapping was asked for
        selection = (length(names) == 1 && !wrap_singles) ? names[1] : names
        push!(unpacked, selectcols(Xfixed, selection))
    end
    return Tuple(unpacked)
end
## RESTRICTING TO A FOLD
# Callable object wrapping a tuple `f` of `N` index vectors. The type
# parameter `i` records which of them is handed to `selectrows` when the
# object is applied to data.
struct FoldRestrictor{i,N}
    f::NTuple{N,Vector{Int}}
end
function (r::FoldRestrictor{i})(X) where i
    return selectrows(X, r.f[i])
end
"""
restrict(X, folds, i)
The restriction of `X`, a vector, matrix or table, to the `i`th fold
of `folds`, where `folds` is a tuple of vectors of row indices.
The method is curried, so that `restrict(folds, i)` is the operator
on data defined by `restrict(folds, i)(X) = restrict(X, folds, i)`.
### Example
folds = ([1, 2], [3, 4, 5], [6,])
restrict([:x1, :x2, :x3, :x4, :x5, :x6], folds, 2) # [:x3, :x4, :x5]
See also [`corestrict`](@ref)
"""
function restrict(f::NTuple{N}, i) where N
    # curried form: lift `i` and the number of folds into the type parameters
    return FoldRestrictor{i,N}(f)
end
function restrict(X, f, i)
    # build the curried operator, then apply it to the data
    return restrict(f, i)(X)
end
## RESTRICTING TO A FOLD COMPLEMENT
"""
complement(folds, i)
The complement of the `i`th fold of `folds` in the concatenation of
all elements of `folds`. Here `folds` is a vector or tuple of integer
vectors, typically representing row indices of a vector, matrix or
table.
complement(([1,2], [3,], [4, 5]), 2) # [1 ,2, 4, 5]
"""
function complement(f, i)
    # materialise `f` so that it can be indexed with `Not(i)`, i.e. every
    # entry except the `i`th, then concatenate what is left
    fold_list = collect(f)
    return reduce(vcat, fold_list[Not(i)])
end
# Callable object wrapping a tuple `f` of `N` index vectors. When applied to
# data it passes `complement(f, i)` to `selectrows`, where `i` is the first
# type parameter.
struct FoldComplementRestrictor{i,N}
    f::NTuple{N,Vector{Int}}
end
function (r::FoldComplementRestrictor{i})(X) where i
    return selectrows(X, complement(r.f, i))
end
"""
corestrict(X, folds, i)
The restriction of `X`, a vector, matrix or table, to the *complement*
of the `i`th fold of `folds`, where `folds` is a tuple of vectors of
row indices.
The method is curried, so that `corestrict(folds, i)` is the operator
on data defined by `corestrict(folds, i)(X) = corestrict(X, folds, i)`.
### Example
folds = ([1, 2], [3, 4, 5], [6,])
corestrict([:x1, :x2, :x3, :x4, :x5, :x6], folds, 2) # [:x1, :x2, :x6]
"""
function corestrict(f::NTuple{N}, i) where N
    # curried form: lift `i` and the number of folds into the type parameters
    return FoldComplementRestrictor{i,N}(f)
end
function corestrict(X, f, i)
    # build the curried operator, then apply it to the data
    return corestrict(f, i)(X)
end
## to be replaced (not used anywhere):
## ACCESSORS FOR JULIA NDSPARSE ARRAYS (N=2)
# nrows(::Val{:sparse}, X) = maximum([r[1] for r in keys(X)])
# function select(::Val{:sparse}, X, r::Integer, c::Symbol)
# try
# X[r,c][1]
# catch exception
# exception isa KeyError || throw(exception)
# missing
# end
# end
# select(::Val{:sparse}, X, r::AbstractVector{<:Integer}, c::Symbol) = [select(X, s, c) for s in r]
# select(::Val{:sparse}, X, ::Colon, c::Symbol) = [select(X, s, c) for s in 1:nrows(X)]
# selectrows(::Val{:sparse}, X, r::Integer) = X[r:r,:]
# selectrows(::Val{:sparse}, X, r) = X[r,:]
# selectcols(::Val{:sparse}, X, c::Symbol) = select(X, :, c)
# selectcols(::Val{:sparse}, X, c::AbstractVector{Symbol}) = X[:,sort(c)]
# selectcols(::Val{:sparse}, X, ::Colon) = X
# select(::Val{:sparse}, X, r::Integer, c::AbstractVector{Symbol}) = X[r,sort(c)]
# select(::Val{:sparse}, X, r::Integer, ::Colon) = X[r,:]
# select(::Val{:sparse}, X, r, c) = X[r,sort(c)]
## TRANSFORMING BETWEEN CATEGORICAL ELEMENTS AND RAW VALUES
# Throw a `DomainError` reporting that the raw value `c` is not in the pool.
# The offending value goes in the `val` slot and the explanation in `msg`,
# following the two-argument `DomainError(val, msg)` constructor.
_err_missing_class(c) = throw(DomainError(
    c, "Value `$c` not in pool"))
# Scalar case: `missing` passes straight through; any other value must be
# one of `levels(pool)` (otherwise `_err_missing_class` throws) and is
# replaced by the pool entry at the index returned by `get(pool, x)`.
function transform_(pool, x)
    if ismissing(x)
        return missing
    end
    if !(x in levels(pool))
        _err_missing_class(x)
    end
    return pool[get(pool, x)]
end

# Array case: apply the scalar method to every element.
function transform_(pool, X::AbstractArray)
    return broadcast(X) do x
        transform_(pool, x)
    end
end
"""
transform(e::Union{CategoricalValue,CategoricalArray,CategoricalPool}, X)
Transform the specified object `X` into a categorical version, using
the pool contained in `e`. Here `X` is a raw value (an element of
`levels(e)`) or an `AbstractArray` of such values.
```julia
v = categorical(["x", "y", "y", "x", "x"])
julia> transform(v, "x")
CategoricalValue{String,UInt32} "x"
julia> transform(v[1], ["x" "x"; missing "y"])
2×2 CategoricalArray{Union{Missing, Symbol},2,UInt32}:
"x" "x"
missing "y"
```
"""
function MLJModelInterface.transform(
    e::Union{CategoricalArray, CategoricalValue}, arg
)
    # look up the pool behind the categorical array/value and delegate
    return transform_(CategoricalArrays.pool(e), arg)
end
function MLJModelInterface.transform(e::CategoricalPool, arg)
    # `e` already is the pool
    return transform_(e, arg)
end
## SKIPPING MISSING AND NAN: skipinvalid
# `isnan` that is safe on arbitrary values: only numbers can be NaN
_isnan(::Any) = false
_isnan(x::Number) = isnan(x)

# Lazy iterator over `x` that drops the NaN entries
function skipnan(x)
    return Iterators.filter(!_isnan, x)
end

"""
    skipinvalid(itr)

Return an iterator over the elements in `itr` skipping `missing` and
`NaN` values. Behaviour is similar to [`skipmissing`](@ref).

    skipinvalid(A, B)

For vectors `A` and `B` of the same length, return a tuple of vectors
`(A[mask], B[mask])` where `mask[i]` is `true` if and only if `A[i]`
and `B[i]` are both valid (non-`missing` and non-`NaN`). Can also be
called on other iterators of matching length, such as arrays, but
always returns a vector. Does not remove `Missing` from the element
types if present in the original iterators.

"""
skipinvalid(v) = skipnan(skipmissing(v))

# A value is invalid when it is `missing` or NaN
isinvalid(x) = ismissing(x) ? true : _isnan(x)

function skipinvalid(yhat, y)
    # keep only the positions where both entries are valid
    keep = (!isinvalid).(yhat) .& (!isinvalid).(y)
    return yhat[keep], y[keep]
end
# TODO: refactor balanced accuracy to get rid of these:
# Three-argument variants that also carry a weight argument `w` along.
# When `w` is an `Arr` it is filtered with the same mask as `yhat` and `y`.
function _skipinvalid(yhat, y, w::Arr)
    keep = (!isinvalid).(yhat) .& (!isinvalid).(y)
    return yhat[keep], y[keep], w[keep]
end

# When `w` is `nothing` or a dictionary it is returned untouched.
function _skipinvalid(yhat, y, w::Union{Nothing,AbstractDict})
    keep = (!isinvalid).(yhat) .& (!isinvalid).(y)
    return yhat[keep], y[keep], w
end