Some errors with interpolated functions #174

Closed
mcabbott opened this issue Dec 20, 2020 · 18 comments

@mcabbott

I said I'd try to narrow down some of the errors from mcabbott/Tullio.jl#57. Here's a start:

A max gradient:

# @tullio (max) X[i] := D[i,j] verbose=1 # MethodError: no method matching instruction!(::LoopVectorization.LoopSet, ::typeof(zero))

@macroexpand @avx for i = 𝒶𝓍i # works fine!
        done = 0
        for j = 𝒶𝓍j
            peak = onlyone(ℛ[i, 1] == D[i, j], done)
            begin
                ℰ𝓍1 = zero(𝒯)
                ℰ𝓍2 = ifelse(𝓇𝒽𝓈⛰, 𝛥ℛ[i, 1], ℰ𝓍1)
                𝛥D[i, j] += ℰ𝓍2
            end
            done += anyone(peak)
        end
    end;

@macroexpand @avx for i = 𝒶𝓍i # LoadError: type Expr has no field value
        done = 0
        for j = 𝒶𝓍j
            peak = $(Tullio.onlyone)(ℛ[i, 1] == D[i, j], done)
            begin
                ℰ𝓍1 = $zero(𝒯)
                ℰ𝓍2 = $ifelse(𝓇𝒽𝓈⛰, 𝛥ℛ[i, 1], ℰ𝓍1)
                𝛥D[i, j] += ℰ𝓍2
            end
            done += $anyone(peak)
        end
    end;

# But those aren't the same error. Here's the exact expression: 

lex2 = quote
    local #= /Users/me/.julia/dev/Tullio/src/macro.jl:1087 =# @inline(function ∇𝒜𝒸𝓉!(::Type{<:Array{<:Union{Base.HWReal, Bool}}}, 𝛥D, 𝛥ℛ::AbstractArray{𝒯}, ℛ, D, 𝒶𝓍i, 𝒶𝓍j, ♻=nothing) where {𝒯}
                #= /Users/me/.julia/dev/Tullio/src/macro.jl:1091 =# LoopVectorization.@avx unroll = 0 for i = 𝒶𝓍i
                        begin
                            🆗𝓇𝒽𝓈 = 0
                            for j = 𝒶𝓍j
                                begin
                                    𝓇𝒽𝓈⛰ = Tullio.onlyone(ℛ[i] == D[i, j], 🆗𝓇𝒽𝓈)
                                    begin
                                        ℰ𝓍1 = $(zero)(𝒯)
                                        ℰ𝓍2 = $(ifelse)(𝓇𝒽𝓈⛰, 𝛥ℛ[i], ℰ𝓍1)
                                        𝛥D[i, j] += ℰ𝓍2
                                    end
                                    🆗𝓇𝒽𝓈 += Tullio.anyone(𝓇𝒽𝓈⛰)
                                end
                            end
                            nothing
                        end
                    end
                nothing
            end)
end;

macroexpand(Main, lex2) # LoadError: MethodError: no method matching instruction!(::LoopVectorization.LoopSet, ::typeof(zero))

And a convolution:

# @tullio y[i] := x[pad(i-a, 3)] * k[a] verbose=1 # LoadError: "Don't know how to handle expression."

@macroexpand @avx for i  in 1:100 # fine!
        acc = 0.0
        for a in 1:7
            acc = acc + (
                    (i - a >= first(axes(x, 1))) & (i - a <= last(axes(x, 1))) ? 
                    x[i - a] : 
                    zero(eltype(x))
                ) * k[a]
        end
        y[i] = acc
    end;

@macroexpand @avx for i  in 1:100 # fails "LoadError: type Symbol has no field value"
        acc = 0.0
        for a in 1:7
            acc = acc + (
                    (i - a >= $first(axes(x, 1))) & (i - a <= last(axes(x, 1))) ? 
                    x[i - a] : 
                    zero(eltype(x))
                ) * k[a]
        end
        y[i] = acc
    end;

@macroexpand @avx for i  in 1:100 # fails
        acc = 0.0
        for a in 1:7
            acc = acc + (
                    (i - a >= $first($axes(x, 1))) & (i - a <= $last($axes(x, 1))) ? 
                    x[i - a] : 
                    $zero($eltype(x))
                ) * k[a]
        end
        y[i] = acc
    end;

# But the real error comes from the gradient.
# And this is a little different, perhaps no wonder it complains:

lex1 = quote
    local #= /Users/me/.julia/dev/Tullio/src/macro.jl:1087 =# @inline(function ∇𝒜𝒸𝓉!(::Type{<:Array{<:Union{Base.HWReal, Bool}}}, 𝛥x, 𝛥k, 𝛥ℛ::AbstractArray{𝒯}, ℛ, x, k, 𝒶𝓍a, 𝒶𝓍i, ♻=nothing) where {𝒯}
                #= /Users/me/.julia/dev/Tullio/src/macro.jl:1091 =# LoopVectorization.@avx unroll = 0 for a = 𝒶𝓍a
                        begin
                            nothing
                            for i = 𝒶𝓍i
                                begin
                                    ℰ𝓍1 = conj(k[a])
                                    ℰ𝓍2 = 𝛥ℛ[i] * ℰ𝓍1
                                    ℰ𝓍3 = conj(if (i - a >= first(axes(x, 1))) & (i - a <= Base.last(axes(x, 1)))
                                                x[i - a]
                                            else
                                                zero(eltype(x))
                                            end)
                                    ℰ𝓍4 = 𝛥ℛ[i] * ℰ𝓍3
                                    if (i - a >= first(axes(𝛥x, 1))) & (i - a <= last(axes(𝛥x, 1)))
                                        # NB it writes inside the if statement.
                                        𝛥x[i - a] = 𝛥x[i - a] + ℰ𝓍2
                                    else
                                        zero(eltype(𝛥x))
                                    end
                                    𝛥k[a] = 𝛥k[a] + ℰ𝓍4
                                end
                            end
                            nothing
                        end
                    end
                nothing
            end)
end;

macroexpand(Main, lex1) # LoadError: "Don't know how to handle expression."
@chriselrod
Member

chriselrod commented Dec 20, 2020

I fixed most of these on master, and will probably tag another release soon.
I still get a warning here:

julia> using Tullio, LoopVectorization, OffsetArrays

julia> x = rand(100); k = rand(5);

julia> @tullio y[i] := x[pad(i-a, 3)] * k[a] verbose=1;
┌ Info: symbolic gradients
│   inbody =
│    2-element Vector{Any}:
│     :(𝛥x[pad(i - a, 3)] = 𝛥x[pad(i - a, 3)] + 𝛥ℛ[i] * conj(k[a]))
└     :(𝛥k[a] = 𝛥k[a] + 𝛥ℛ[i] * conj(x[pad(i - a, 3)]))
┌ Warning: LoopVectorization failed (symbolic gradient)
│   err =
│    LoadError: Don't know how to handle expression.
│    if i - a ∈ (axes)(𝛥x, 1)
│        𝛥x[i - a] = 𝛥x[i - a] + ℰ𝓍2
│    else
│        (zero)((eltype)(𝛥x))
│    end
│    in expression starting at /home/chriselrod/.julia/packages/Tullio/u3myB/src/macro.jl:1090
└ @ Tullio ~/.julia/packages/Tullio/u3myB/src/macro.jl:1116
┌ Info: threading threshold (from cost = 11)
└   block = 23832
┌ Info: left index ranges
└   i = 3:104
┌ Info: reduction index ranges
└   a = Base.OneTo(5)
[ Info: running LoopVectorization actor
[ Info: running LoopVectorization actor

If it's easy for you to turn this into a conditional store:

i - a ∈ (axes)(𝛥x, 1) && (𝛥x[i - a] = 𝛥x[i - a] + ℰ𝓍2)

Then that should work, but it should be simple to fix in LoopVectorization, too.
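
Written out against the lex1 body above, a minimal sketch of that conditional store looks like this (it is the same shape as the grad! functions that appear later in this thread, not LoopVectorization's own output):

# replaces the if/else above; the unneeded zero(eltype(𝛥x)) branch is simply dropped
(i - a >= first(axes(𝛥x, 1))) & (i - a <= last(axes(𝛥x, 1))) &&
    (𝛥x[i - a] = 𝛥x[i - a] + ℰ𝓍2)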

@mcabbott
Author

Great, thank you. And yes, easy to chop the unneeded zero(T) as you suggest, which fixes it.

mcabbott pushed a commit to mcabbott/Tullio.jl that referenced this issue Dec 21, 2020
@mcabbott
Author

Looking further at things like the first example above, I'm getting some wrong answers and am not sure why. I really thought I had this working with LoopVectorization, but my tests were not adequate to catch this. Today I can't find a version on which this works; it's possible that I somehow only checked one-dimensional cases.

This is for the gradient of maximum(x; dims). My scheme is to check whether a given input attains the maximum; if several points do, only the first should count (or any one, but just one). I keep track of this with done here, plus a function _onlyone in case two points attain the maximum within the same Mask.
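
To make the scheme concrete, here is a toy scalar version for a single row (hypothetical data, not Tullio's generated code): both 5s attain the maximum, but done ensures only the first one receives the gradient.

function toy_max_grad(x, ℛ, 𝛥ℛ)
    g = zeros(length(x))
    done = 0
    for j in eachindex(x)
        peak = (ℛ == x[j]) && iszero(done)   # true only at the first maximum
        g[j] += ifelse(peak, 𝛥ℛ, 0.0)
        done += peak
    end
    g
end

toy_max_grad([3, 5, 5, 1], 5, 1.0)  # [0.0, 1.0, 0.0, 0.0]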

My guess here is that @avx is re-ordering loops so that done applies globally, not per reduction. But this may well be my fault in that I'm reaching into Vec objects, and perhaps this breaks some assumption. Does any way to fix this jump out at you?

It's also possible I should change the scheme used here entirely, to store the indices at which it attains the maximum on the forward pass. There I also had difficulties but that's another story.

using LoopVectorization, Random
# if VERSION < v"1.5"
#   using VectorizationBase: SVec, Mask, prevpow2
#   const Vec = SVec
# else
    using VectorizationBase: Vec, Mask, prevpow2 #, vsum, vany
# end

function grad!(𝛥x, 𝛥ℛ, ℛ, x, 𝒶𝓍i, 𝒶𝓍j)
    for i in 𝒶𝓍i
        done = 0
        for j in 𝒶𝓍j
            peak = (ℛ[i] == x[i, j]) && iszero(done)
            begin
                ℰ𝓍1 = 0.0
                ℰ𝓍2 = ifelse(peak, 𝛥ℛ[i], ℰ𝓍1)
                𝛥x[i, j] += ℰ𝓍2
            end
            done += peak
        end
    end
    𝛥x
end

# _onlyone(cond::Bool, seen) = cond && iszero(seen)
_onlyone(cond::Mask{W}) where {W} = Mask{W}(prevpow2(cond.u))
_onlyone(cond::Mask, seen) = _allzero(seen) ? _onlyone(cond) : zero(cond)

# _allzero(seen) = iszero(seen)
_allzero(seen::Vec) = iszero((!iszero(seen)).u) # iszero(vsum(seen))

# _anyone(cond::Bool) = cond
_anyone(cond::Mask) = !iszero(cond.u)

function grad_avx_new!(𝛥x, 𝛥ℛ, ℛ, x, 𝒶𝓍i, 𝒶𝓍j)
    @avx for i in 𝒶𝓍i
        done = 0
        for j in 𝒶𝓍j
            peak = _onlyone(ℛ[i] == x[i, j], done)
            begin
                ℰ𝓍1 = 0.0
                ℰ𝓍2 = ifelse(peak, 𝛥ℛ[i], ℰ𝓍1)
                𝛥x[i, j] += ℰ𝓍2
            end
            done += _anyone(peak)
        end
    end
    𝛥x
end

m2 = reshape(shuffle(1:20), 4,5); # no repeats
m2max = vec(maximum(m2, dims=2))

grad!(zeros(4,5), ones(4), m2max, m2, 1:4, 1:5)
sum(ans, dims=2) # [1, 1, 1, 1]

grad_avx_new!(zeros(4,5), ones(4), m2max, m2, 1:4, 1:5)
sum(ans, dims=2) # [0, 0, 1, 0]


# (jl_HftcGW) pkg> st
# Status `/private/var/folders/d0/hrtbsjqj66qf8zm88ycw9_580000gp/T/jl_HftcGW/Project.toml`
#   [bdcacae8] LoopVectorization v0.9.9
#   [3d5dd08c] VectorizationBase v0.14.4

@chriselrod
Member

chriselrod commented Dec 21, 2020

Have you noticed:

julia> function grad_avx_new_v2!(𝛥x, 𝛥ℛ, ℛ, x, 𝒶𝓍i, 𝒶𝓍j)
           @avx for i in 𝒶𝓍i
               for j in 𝒶𝓍j
                   peak = ℛ[i] == x[i, j]
                   ℰ𝓍1 = 0.0
                   ℰ𝓍2 = ifelse(peak, 𝛥ℛ[i], ℰ𝓍1)
                   𝛥x[i, j] += ℰ𝓍2
               end
           end
           𝛥x
       end
grad_avx_new_v2! (generic function with 1 method)

julia> grad_avx_new_v2!(zeros(4,5), ones(4), m2max, m2, 1:4, 1:5)
4×5 Matrix{Float64}:
 0.0  1.0  0.0  0.0  0.0
 1.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  1.0
 1.0  0.0  0.0  0.0  0.0

julia> grad!(zeros(4,5), ones(4), m2max, m2, 1:4, 1:5)
4×5 Matrix{Float64}:
 0.0  1.0  0.0  0.0  0.0
 1.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  1.0
 1.0  0.0  0.0  0.0  0.0

Of course, we'd get duplicate answers, which you were trying to avoid.
The problem is that it is SIMD-ing across i, so the mask corresponds to multiple rows; _onlyone then lets only one of those rows register a maximum value.
But of course, each of the rows is allowed to have a maximum.

I'd use this definition of _onlyone:

@inline _onlyone(m::Mask{W}) where {W} = Mask{W}(one(m.u) << trailing_zeros(m.u))

The other definition could place the set bit off the edge of the array. The one above places the set bit as close to the front of the mask as possible to avoid this.
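
A tiny bit-level illustration of the difference, using Base's prevpow(2, u) as a stand-in for VectorizationBase.prevpow2:

u = 0b0110                     # two lanes of the mask are set

prevpow(2, u)                  # 0b0100: keeps only the highest set bit
one(u) << trailing_zeros(u)    # 0b0010: keeps only the lowest set bit, as in the definition above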

With the rewrite, I'll have LoopVectorization properly track and handle loop-carried dependencies, but I can't make any promises about when that may happen.

@chriselrod
Member

chriselrod commented Dec 21, 2020

Oh, and you can make LoopVectorization swap the iteration order by transposing the matrices:

julia> grad_avx_new!(zeros(5,4)', ones(4), m2max, permutedims(m2)', 1:4, 1:5)
4×5 adjoint(::Matrix{Float64}) with eltype Float64:
 0.0  1.0  0.0  0.0  0.0
 1.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  1.0
 1.0  0.0  0.0  0.0  0.0

And the answers are of course suddenly correct, because now the mask goes across columns of the array, where we really do want _onlyone.

@chriselrod
Member

chriselrod commented Dec 21, 2020

If you want a workaround until LoopVectorization can eventually handle the loop-carried dependencies itself, you could pass in something that would be a Vec vs. a scalar depending on which loop is vectorized, and use that to determine whether or not _onlyone does anything.

@mcabbott
Author

OK, thanks for your thoughts. Indeed the transposed version is fine, this may be why I missed the bug at first!

Will think about the done::Vec idea, that's a good point.

@chriselrod
Member

Notice:

julia> _onlyone(cond::Mask, seen, dummy) = (@show typeof(dummy); _allzero(seen) ? _onlyone(cond) : zero(cond))
_onlyone (generic function with 5 methods)

julia> function grad_avx_new!(𝛥x, 𝛥ℛ, ℛ, x, 𝒶𝓍i, 𝒶𝓍j)
           @avx unroll=1 for i in 𝒶𝓍i
               done = 0
               for j in 𝒶𝓍j
                   peak = _onlyone(ℛ[i] == x[i, j], done, i)
                   ℰ𝓍1 = 0.0
                   ℰ𝓍2 = ifelse(peak, 𝛥ℛ[i], ℰ𝓍1)
                   𝛥x[i, j] += ℰ𝓍2
                   done += _anyone(peak)
               end
           end
           𝛥x
       end
grad_avx_new! (generic function with 1 method)

julia> grad_avx_new!(zeros(4,5), ones(4), m2max, m2, 1:4, 1:5)
typeof(dummy) = VectorizationBase.MM{8, 1, Int64}
typeof(dummy) = VectorizationBase.MM{8, 1, Int64}
typeof(dummy) = VectorizationBase.MM{8, 1, Int64}
typeof(dummy) = VectorizationBase.MM{8, 1, Int64}
typeof(dummy) = VectorizationBase.MM{8, 1, Int64}
4×5 Matrix{Float64}:
 0.0  0.0  0.0  0.0  0.0
 1.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0

julia> grad_avx_new!(zeros(5,4)', ones(4), m2max, permutedims(m2)', 1:4, 1:5)
typeof(dummy) = Int64
typeof(dummy) = Int64
typeof(dummy) = Int64
typeof(dummy) = Int64
4×5 adjoint(::Matrix{Float64}) with eltype Float64:
 0.0  1.0  0.0  0.0  0.0
 1.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  1.0
 1.0  0.0  0.0  0.0  0.0

So, this should work:

julia> _anyone(cond::Mask, ::Integer) = !iszero(cond.u)
_anyone (generic function with 3 methods)

julia> _anyone(cond::Mask, ::LoopVectorization.VectorizationBase.AbstractSIMD) = !iszero(cond)
_anyone (generic function with 3 methods)

julia> _onlyone(cond::Mask, seen, ::Integer) = _allzero(seen) ? _onlyone(cond) : zero(cond)
_onlyone (generic function with 7 methods)

julia> _onlyone(cond::Mask, seen, ::LoopVectorization.VectorizationBase.AbstractSIMD) = iszero(seen) & cond
_onlyone (generic function with 7 methods)

julia> function grad_avx_new!(𝛥x, 𝛥ℛ, ℛ, x, 𝒶𝓍i, 𝒶𝓍j)
           @avx unroll=1 for i in 𝒶𝓍i
               done = 0
               for j in 𝒶𝓍j
                   peak = _onlyone(ℛ[i] == x[i, j], done, i)
                   ℰ𝓍1 = 0.0
                   ℰ𝓍2 = ifelse(peak, 𝛥ℛ[i], ℰ𝓍1)
                   𝛥x[i, j] += ℰ𝓍2
                   done += _anyone(peak, i)
               end
           end
           𝛥x
       end
grad_avx_new! (generic function with 1 method)

julia> grad_avx_new!(zeros(4,5), ones(4), m2max, m2, 1:4, 1:5)
4×5 Matrix{Float64}:
 0.0  1.0  0.0  0.0  0.0
 1.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  1.0
 1.0  0.0  0.0  0.0  0.0

julia> grad_avx_new!(zeros(5,4)', ones(4), m2max, permutedims(m2)', 1:4, 1:5)
4×5 adjoint(::Matrix{Float64}) with eltype Float64:
 0.0  1.0  0.0  0.0  0.0
 1.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  1.0
 1.0  0.0  0.0  0.0  0.0

@mcabbott
Author

Oh nice, thanks. That's perfect!

@mcabbott
Author

mcabbott commented Dec 21, 2020

Also, here (at last) is an isolation of the weird bug from this comment: mcabbott/Tullio.jl#57 (comment)

# Tracker.gradient(x -> sum(@tullio y[i] := x[pad(i,0,2)] avx=true verbose=2), rand(3))[1]

function grad!(𝛥x, 𝛥ℛ, x, 𝒶𝓍i=eachindex(x))
    for i = 𝒶𝓍i
        (i >= first(axes(𝛥x, 1))) & (i <= last(axes(𝛥x, 1))) && (𝛥x[i] = 𝛥x[i] + 𝛥ℛ[i])
    end
    𝛥x
end

function grad_avx!(𝛥x, 𝛥ℛ, x, 𝒶𝓍i=eachindex(x))
    @avx for i = 𝒶𝓍i
        (i >= first(axes(𝛥x, 1))) & (i <= last(axes(𝛥x, 1))) && (𝛥x[i] = 𝛥x[i] + 𝛥ℛ[i])
    end
    𝛥x
end

function grad_avx_base!(𝛥x, 𝛥ℛ, x, 𝒶𝓍i=eachindex(x))
    @avx for i = 𝒶𝓍i
        (i >= first(axes(𝛥x, 1))) & (i <= Base.last(axes(𝛥x, 1))) && (𝛥x[i] = 𝛥x[i] + 𝛥ℛ[i])
    end
    𝛥x
end

@eval function grad_avx_eval!(𝛥x, 𝛥ℛ, x, 𝒶𝓍i=eachindex(x))
    @avx for i = 𝒶𝓍i
        (i >= $first($axes(𝛥x, 1))) & (i <= $last($axes(𝛥x, 1))) && (𝛥x[i] = 𝛥x[i] + 𝛥ℛ[i])
    end
    𝛥x
end # LoadError: KeyError: key typeof(first) not found

grad!(zeros(5), ones(5), ones(3)) # [1,1,1,0,0]
grad_avx!(zeros(5), ones(5), ones(3)) # UndefVarError: last not defined
grad_avx_base!(zeros(5), ones(5), ones(3)) # ok
grad_avx_eval!(zeros(5), ones(5), ones(3)) # not defined

@chriselrod
Member

# UndefVarError: last not defined

Re that bizarre error, here is a snippet of the generated code:

        var"##i_loopstop#1398" = last(#= /home/chriselrod/.julia/dev/LoopVectorization/src/reconstruct_loopset.jl:41 =# @inbounds(lb[1]))
        LoopVectorization.VectorizationBase.assume(var"##i_loopstop#1398" > 0)
        var"##vptr##_𝛥x" = #= /home/chriselrod/.julia/dev/LoopVectorization/src/reconstruct_loopset.jl:120 =# @inbounds(vargs[1])
        var"##vptr##_𝛥ℛ" = #= /home/chriselrod/.julia/dev/LoopVectorization/src/reconstruct_loopset.jl:120 =# @inbounds(vargs[2])
        var"##vptr##_𝛥x" = #= /home/chriselrod/.julia/dev/LoopVectorization/src/reconstruct_loopset.jl:120 =# @inbounds(vargs[3])
        var"##symlicm#1413" = #= /home/chriselrod/.julia/dev/LoopVectorization/src/reconstruct_loopset.jl:194 =# @inbounds(vargs[4])
        axes = #= /home/chriselrod/.julia/dev/LoopVectorization/src/reconstruct_loopset.jl:390 =# @inbounds(vargs[5])
        last = #= /home/chriselrod/.julia/dev/LoopVectorization/src/reconstruct_loopset.jl:390 =# @inbounds(vargs[6])

LoopVectorization doesn't "know" last, so it passes it in as an argument and calls it last.
But it also uses last itself in the generated code. Hence the error: its own use of last comes before the later assignment.

Maybe I should mangle the names of functions like this.
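
A minimal standalone illustration of that clash (a toy function, not the actual generated code): assigning to last anywhere in the body makes it a local variable for the whole function, so the earlier call hits the not-yet-assigned local instead of Base.last.

function clash(x)
    stop = last(x)      # ERROR: UndefVarError: last not defined
    last = identity     # this later assignment shadows Base.last for the entire body
    stop
end

clash(1:3)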

I should also use a similar strategy of passing in arguments for interpolated functions. Currently, I'm just looking them up in a dictionary, so you can only interpolate ones I happened to put in there.

@chriselrod
Member

I added them to the tests.

Handling of interpolated functions is much more robust now. Tagging a new version.

@mcabbott
Author

My next problems here are these:

julia> vn = Vec(ntuple(_ -> rand(1:33), VectorizationBase.pick_vector_width_val(Int64))...)
Vec{4,Int64}<24, 21, 9, 14>

julia> mod(vn, 10)
ERROR: mod not defined for Vec{4, Int64}

julia> mod(vn, 1:10)
ERROR: MethodError: no method matching mod(::Vec{4, Int64}, ::UnitRange{Int64})

julia> clamp(vn, 1, 10)
ERROR: TypeError: non-boolean (Mask{4, UInt8}) used in boolean context

julia> clamp(vn, 1:10)
ERROR: MethodError: no method matching clamp(::Vec{4, Int64}, ::UnitRange{Int64})

How do these look as solutions? I'm trying to copy Base, except for fld, which is made up:

using VectorizationBase, IfElse
# clamp(x::X, lo::L, hi::H) where {X, L, H} in Base.Math at math.jl:65
Base.clamp(x::X, lo::L, hi::H) where {X<:Vec, L,H} =
    IfElse.ifelse(x > hi, convert(promote_type(X,L,H), hi),
           IfElse.ifelse(x < lo,
                  convert(promote_type(X,L,H), lo),
                  convert(promote_type(X,L,H), x)))

# clamp(x::Integer, r::AbstractUnitRange{var"#s830"} where var"#s830"<:Integer) in Base.Math at math.jl:110
Base.clamp(x::Vec{<:Any,<:Integer}, r::AbstractUnitRange{<:Integer}) = clamp(x, first(r), last(r))

# div(x::T, y::T, ::RoundingMode{:Down}) where T<:Integer in Base at div.jl:288
# function div(x::T, y::T, ::typeof(RoundDown)) where T<:Integer
#     d = div(x, y, RoundToZero)
#     return d - (signbit(x ⊻ y) & (d * y != x))
# end
Base.fld(x::Vec{W1,T}, y::Vec{W2,T}) where {W1,W2,T<:Integer} = div(x-2*signbit(x),y)

# mod(x::T, y::T) where T<:Integer in Base at int.jl:266
function Base.mod(x::Vec{W1,T}, y::Vec{W2,T}) where {W1,W2,T<:Integer}
    # y == -1 && return T(0)   # avoid potential overflow in fld
    # return x - fld(x, y) * y
    IfElse.ifelse(y == -1, zero(x), x - fld(x, y) * y)
end

# mod(i::Integer, r::AbstractUnitRange{var"#s78"} where var"#s78"<:Integer) in Base at range.jl:1153
Base.mod(i::Vec{<:Any,<:Integer}, r::AbstractUnitRange{<:Integer}) = mod(i-first(r), length(r)) + first(r)

@chriselrod
Member

My next problem...

That's why I wait for you to close your issues. lol

Also, note that LoopVectorization will eventually move toward using VecUnrolls in place of Vecs much of the time, so it's better to start allowing AbstractSIMDs.
I'm also paranoid / pretty aggressive about sprinkling @inline everywhere, partly because llvmcall is marked as considerably more expensive to the inliner than many of these functions actually are (the inliner has to pick some value, and can't introspect into what the llvmcall is actually doing).

promote is also probably a more convenient way of writing some of these:

julia> using VectorizationBase

julia> vx = Vec(ntuple(i -> 3i-2.0, VectorizationBase.pick_vector_width_val(Float64))...)
Vec{8,Float64}<1.0, 4.0, 7.0, 10.0, 13.0, 16.0, 19.0, 22.0>

julia> promote(vx, 2.3, 1, 4f0)
(Vec{8,Float64}<1.0, 4.0, 7.0, 10.0, 13.0, 16.0, 19.0, 22.0>, Vec{8,Float64}<2.3, 2.3, 2.3, 2.3, 2.3, 2.3, 2.3, 2.3>, Vec{8,Float64}<1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0>, Vec{8,Float64}<4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0>)

I.e.:

@inline function Base.clamp(_x::AbstractSIMD, _lo, _hi)
    x, lo, hi = promote(_x, _lo, _hi)
    IfElse.ifelse(x > hi, hi, IfElse.ifelse(x < lo, lo, x))
end
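
For instance, on the vx above this should clip only the out-of-range lanes (assuming the promote-based definition dispatches as intended):

clamp(vx, 2.0, 15.0)   # expected: Vec{8,Float64}<2.0, 4.0, 7.0, 10.0, 13.0, 15.0, 15.0, 15.0>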

I'd also watch the type of the integer literal 2 in fld.

Please make a PR to VectorizationBase!

@mcabbott
Author

OK, will tidy up and make a PR. At last!

It is weird that Base.clamp doesn't use promote; maybe it's ancient? Will fix, and make a tidier fld.

@mcabbott
Author

Am not sure I understand what VecUnroll is, but the tests use it... should this work?

julia> m1
4 x Vec{4, Int64}
Vec{4,Int64}<7, 8, 9, 10>
Vec{4,Int64}<1, 2, 3, 4>
Vec{4,Int64}<13, 14, 15, 16>
Vec{4,Int64}<32, 33, 34, 35>

julia> promote_type(typeof(m1), Int)
Real

julia> promote(m1, 2, 3)
ERROR: promotion of types VectorizationBase.VecUnroll{3, 4, Int64, MM{4, 1, Int64}}, Int64 and Int64 failed to change any arguments

@chriselrod
Member

Yes, it's supposed to work. I just fixed it on VectorizationBase master.
I haven't switched LoopVectorization over to using them yet, and I need to improve VectorizationBase's test coverage, so there are probably more bugs.

It represents a group of vectors, e.g. from unrolling a loop. It can improve out-of-order execution, will make some aspects of code generation simpler, and will also make it easier to special-case and optimize some operations, e.g. offset strided loads/stores, such as when loading/storing from an array of structs.
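
A rough mental model (the tuple constructor used here is my assumption, not documented usage):

using VectorizationBase: Vec, VecUnroll

vu = VecUnroll((Vec(7, 8, 9, 10), Vec(1, 2, 3, 4)))   # bundles two Vec{4,Int64}s, printed as "2 x Vec{4, Int64}"
vu + 1   # with working promotion, the scalar should broadcast across every lane of both vectors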

@chriselrod
Member

It's been a month, so I'll close this. We can use a new issue for the next flurry of issues.
