From 0f5ee45248ef574119b693ebf735b3aa750acdd7 Mon Sep 17 00:00:00 2001 From: Douglas Bates Date: Fri, 29 Jan 2021 15:53:24 -0600 Subject: [PATCH 1/3] Add signed and compress to constructor args --- src/PooledArrays.jl | 14 ++++++++++---- test/runtests.jl | 4 ++++ 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/src/PooledArrays.jl b/src/PooledArrays.jl index b524b1d..2aa1ed3 100644 --- a/src/PooledArrays.jl +++ b/src/PooledArrays.jl @@ -11,6 +11,7 @@ export PooledArray, PooledVector, PooledMatrix ############################################################################## const DEFAULT_POOLED_REF_TYPE = UInt32 +const DEFAULT_SIGNED_REF_TYPE = Int32 # This is used as a wrapper during PooledArray construction only, to distinguish # arrays of pool indices from normal arrays @@ -98,7 +99,9 @@ end _widen(::Type{UInt8}) = UInt16 _widen(::Type{UInt16}) = UInt32 _widen(::Type{UInt32}) = UInt64 - +_widen(::Type{Int8}) = Int16 +_widen(::Type{Int16}) = Int32 +_widen(::Type{Int32}) = Int64 # Constructor from array, invpool, and ref type """ @@ -123,13 +126,16 @@ function PooledArray{T}(d::AbstractArray, r::Type{R}) where {T,R<:Integer} PooledArray(RefArray(refs::Vector{R}), invpool::Dict{T,R}, pool) end -function PooledArray{T}(d::AbstractArray) where T - refs, invpool, pool = _label(d, T) +function PooledArray{T}(d::AbstractArray; signed::Bool=false, compress::Bool=false) where {T} + R = signed ? (compress ? Int8 : DEFAULT_SIGNED_REF_TYPE) : (compress ? 
UInt8 : DEFAULT_POOLED_REF_TYPE) + refs, invpool, pool = _label(d, T, R) PooledArray(RefArray(refs), invpool, pool) end PooledArray(d::AbstractArray{T}, r::Type) where {T} = PooledArray{T}(d, r) -PooledArray(d::AbstractArray{T}) where {T} = PooledArray{T}(d) +function PooledArray(d::AbstractArray{T}; signed::Bool=false, compress::Bool=false) where {T} + PooledArray{T}(d, signed=signed, compress=compress) +end # Construct an empty PooledVector of a specific type PooledArray(t::Type) = PooledArray(Array(t,0)) diff --git a/test/runtests.jl b/test/runtests.jl index af967b5..9c0267d 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -77,6 +77,10 @@ using DataAPI: refarray, refvalue, refpool @test PooledMatrix == PooledArray{T, R, 2} where {T, R} s = PooledArray(["a", "a", "b"]) + @test eltype(PooledArray(s).refs) == UInt32 + @test eltype(PooledArray(s, signed=true).refs) == Int32 + @test eltype(PooledArray(s, compress=true).refs) == UInt8 + @test eltype(PooledArray(s, signed=true, compress=true).refs) == Int8 @test all(refarray(s) .== [1, 1, 2]) for i in 1:3 @test refvalue(s, refarray(s)[i]) == s[i] From 8f702aecc18c949bc4eb08c620c02583e834177b Mon Sep 17 00:00:00 2001 From: Douglas Bates Date: Fri, 29 Jan 2021 16:54:13 -0600 Subject: [PATCH 2/3] Add test for widening of signed refs --- test/runtests.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/test/runtests.jl b/test/runtests.jl index 9c0267d..d4d5637 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -81,6 +81,7 @@ using DataAPI: refarray, refvalue, refpool @test eltype(PooledArray(s, signed=true).refs) == Int32 @test eltype(PooledArray(s, compress=true).refs) == UInt8 @test eltype(PooledArray(s, signed=true, compress=true).refs) == Int8 + @test eltype(PooledArray(rand(300), signed=true, compress=true).refs) == Int16 @test all(refarray(s) .== [1, 1, 2]) for i in 1:3 @test refvalue(s, refarray(s)[i]) == s[i] From 0e0d38eaa81d18b5576682e5c59180d51ee77a1b Mon Sep 17 00:00:00 2001 From: Douglas Bates 
Date: Fri, 29 Jan 2021 17:05:08 -0600 Subject: [PATCH 3/3] Update doc string --- src/PooledArrays.jl | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/PooledArrays.jl b/src/PooledArrays.jl index 2aa1ed3..79c16f4 100644 --- a/src/PooledArrays.jl +++ b/src/PooledArrays.jl @@ -105,11 +105,17 @@ _widen(::Type{Int32}) = Int64 # Constructor from array, invpool, and ref type """ - PooledArray(array, [reftype]) + PooledArray(array, [reftype]; signed=false, compress=false) Freshly allocate `PooledArray` using the given array as a source where each element will be referenced as an integer of the given type. If no `reftype` is specified one is chosen automatically based on the number of unique elements. +The Boolean keyword arguments `signed` and `compress` determine the choice of `reftype`. +By default, unsigned integers are used, as they have a greater `typemax` than signed +integers of the same size. However, the Arrow standard at https://arrow.apache.org/, as implemented in +the Arrow package, requires signed integer types, which are provided when `signed` is `true`. +The `compress` argument controls whether the default size of 32 bits is used (`UInt32` for +unsigned, `Int32` for signed) or if smaller integer types are chosen when they can be used. If `array` is not a `PooledArray` then the order of elements in `refpool` in the resulting `PooledArray` is the order of first appereance of elements in `array`. """