-
Notifications
You must be signed in to change notification settings - Fork 68
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Slowdown with views in 0.12.119 #428
Comments
Hmm, I cannot reproduce... julia> @benchmark reflectorApply!($x, $τ, $y)
BenchmarkTools.Trial: 10000 samples with 171 evaluations.
Range (min … max): 630.281 ns … 855.865 ns ┊ GC (min … max): 0.00% … 0.00%
Time (median): 635.731 ns ┊ GC (median): 0.00%
Time (mean ± σ): 637.356 ns ± 5.920 ns ┊ GC (mean ± σ): 0.00% ± 0.00%
▃▃▂█▄▄▂▁
▂▂▃▃▃▄▄▅▇█████████▇▇▅▄▄▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▃▃▃▃▃▃▃▂ ▃
630 ns Histogram: frequency by time 654 ns <
Memory estimate: 0 bytes, allocs estimate: 0.
(lvdev) pkg> st -m LoopVectorization
Status `~/Documents/progwork/julia/env/lvdev/Manifest.toml`
[bdcacae8] LoopVectorization v0.12.125 `~/.julia/dev/LoopVectorization`
julia> versioninfo()
Julia Version 1.9.0-DEV.1130
Commit c80316e125* (2022-08-15 13:05 UTC)
Platform Info:
OS: Linux (x86_64-redhat-linux)
CPU: 36 × Intel(R) Core(TM) i9-7980XE CPU @ 2.60GHz
WORD_SIZE: 64
LIBM: libopenlibm
LLVM: libLLVM-14.0.5 (ORCJIT, skylake-avx512)
Threads: 18 on 36 virtual cores |
Mind showing me the `@code_typed reflectorApply!(x, τ, y)` and `@code_native debuginfo = :none syntax = :intel reflectorApply!(x, τ, y)`?
	.text
.file "reflectorApply!"
.globl "julia_reflectorApply!_4509" # -- Begin function julia_reflectorApply!_4509
.p2align 4, 0x90
.type "julia_reflectorApply!_4509",@function
"julia_reflectorApply!_4509": # @"julia_reflectorApply!_4509"
.cfi_startproc
# %bb.0: # %top
push rbp
.cfi_def_cfa_offset 16
push r15
.cfi_def_cfa_offset 24
push r14
.cfi_def_cfa_offset 32
push r13
.cfi_def_cfa_offset 40
push r12
.cfi_def_cfa_offset 48
push rbx
.cfi_def_cfa_offset 56
.cfi_offset rbx, -56
.cfi_offset r12, -48
.cfi_offset r13, -40
.cfi_offset r14, -32
.cfi_offset r15, -24
.cfi_offset rbp, -16
mov qword ptr [rsp - 56], rsi # 8-byte Spill
mov qword ptr [rsp - 48], rdi # 8-byte Spill
mov r11, qword ptr [rcx + 32]
mov qword ptr [rsp - 80], rcx # 8-byte Spill
mov rcx, qword ptr [rcx + 24]
mov rax, rcx
mov qword ptr [rsp - 64], rcx # 8-byte Spill
sub r11, rcx
movabs rax, 9223372036854775806
cmp r11, rax
jbe .LBB0_1
.LBB0_26: # %L694
mov rcx, qword ptr [rsp - 80] # 8-byte Reload
mov rax, qword ptr [rcx]
mov rdx, qword ptr [rsp - 56] # 8-byte Reload
mov qword ptr [rdx], rax
vmovups ymm0, ymmword ptr [rcx]
vmovups ymm1, ymmword ptr [rcx + 24]
mov rax, qword ptr [rsp - 48] # 8-byte Reload
vmovups ymmword ptr [rax + 24], ymm1
vmovups ymmword ptr [rax], ymm0
pop rbx
.cfi_def_cfa_offset 48
pop r12
.cfi_def_cfa_offset 40
pop r13
.cfi_def_cfa_offset 32
pop r14
.cfi_def_cfa_offset 24
pop r15
.cfi_def_cfa_offset 16
pop rbp
.cfi_def_cfa_offset 8
vzeroupper
ret
.LBB0_1: # %L34.preheader
.cfi_def_cfa_offset 56
mov rdi, qword ptr [rsp - 80] # 8-byte Reload
mov r9, qword ptr [rdi + 8]
mov rax, qword ptr [rdi + 16]
sub rax, r9
inc rax
mov rsi, rax
sar rsi, 63
andn rcx, rsi, rax
inc r11
mov rax, qword ptr [rdi]
lea rsi, [r9 - 1]
mov qword ptr [rsp - 8], rsi # 8-byte Spill
mov r12, qword ptr [rax]
mov r13, qword ptr [rax + 24]
mov rax, qword ptr [rdx]
mov r8, qword ptr [rax]
mov r10, qword ptr [rdx + 8]
mov rsi, qword ptr [rdx + 24]
dec rsi
imul rsi, qword ptr [rax + 24]
lea rax, [r10 + rsi]
lea rdx, [r8 + 8*rax]
mov rax, qword ptr [rsp - 64] # 8-byte Reload
lea rbp, [rax - 1]
imul rbp, r13
lea rax, [r9 + rbp]
dec rax
lea rax, [r12 + 8*rax + 8]
mov qword ptr [rsp - 16], rax # 8-byte Spill
lea rax, [rcx - 1]
mov rdi, rax
and rdi, -32
and rax, -16
lea ebx, [rcx - 1]
and ebx, 7
mov r14d, 8
cmovne r14d, ebx
lea rbx, [rcx - 2]
shl r9, 3
lea r15, [r9 + 8*rbp]
lea rbp, [rcx - 10]
mov qword ptr [rsp - 72], rbp # 8-byte Spill
shl r10, 3
lea rsi, [r10 + 8*rsi]
lea rbp, [rcx - 18]
mov qword ptr [rsp - 32], rbp # 8-byte Spill
add rsi, r8
add rsi, 192
mov ebp, -1
bzhi ebp, ebp, r14d
mov dword ptr [rsp - 84], ebp # 4-byte Spill
mov r9, rcx
add rcx, -26
mov qword ptr [rsp - 40], rcx # 8-byte Spill
lea r10, [r12 + r15 + 192]
lea rcx, [8*r13]
mov qword ptr [rsp - 24], rcx # 8-byte Spill
mov r14d, 1
jmp .LBB0_2
.p2align 4, 0x90
.LBB0_19: # %L602
# in Loop: Header=BB0_2 Depth=1
mov ecx, dword ptr [rsp - 84] # 4-byte Reload
kmovd k1, ecx
vmovupd zmm2 {k1} {z}, zmmword ptr [rdx + 8*rdi]
vmovupd zmm3 {k1} {z}, zmmword ptr [r8 + 8*rdi]
vbroadcastsd zmm1, xmm1
vfnmadd213pd zmm1, zmm2, zmm3 # zmm1 = -(zmm2 * zmm1) + zmm3
vmovupd zmmword ptr [r8 + 8*rdi] {k1}, zmm1
.LBB0_25: # %L682
# in Loop: Header=BB0_2 Depth=1
lea rbp, [r14 + 1]
add r10, qword ptr [rsp - 24] # 8-byte Folded Reload
cmp r14, r11
mov r14, rbp
je .LBB0_26
.LBB0_2: # %L34
# =>This Loop Header: Depth=1
# Child Loop BB0_5 Depth 2
# Child Loop BB0_16 Depth 2
lea r8, [8*r14 - 8]
imul r8, r13
cmp r9, 33
jae .LBB0_4
# %bb.3: # in Loop: Header=BB0_2 Depth=1
vxorpd xmm1, xmm1, xmm1
xor r15d, r15d
vxorpd xmm2, xmm2, xmm2
jmp .LBB0_7
.p2align 4, 0x90
.LBB0_4: # %L194.preheader
# in Loop: Header=BB0_2 Depth=1
vxorpd xmm2, xmm2, xmm2
vxorpd xmm1, xmm1, xmm1
vxorpd xmm3, xmm3, xmm3
vxorpd xmm4, xmm4, xmm4
xor ebp, ebp
.p2align 4, 0x90
.LBB0_5: # %L205
# Parent Loop BB0_2 Depth=1
# => This Inner Loop Header: Depth=2
vmovupd zmm5, zmmword ptr [rsi + 8*rbp - 192]
vmovupd zmm6, zmmword ptr [rsi + 8*rbp - 128]
vmovupd zmm7, zmmword ptr [rsi + 8*rbp - 64]
vmovupd zmm8, zmmword ptr [rsi + 8*rbp]
vfmadd231pd zmm4, zmm5, zmmword ptr [r10 + 8*rbp - 192] # zmm4 = (zmm5 * mem) + zmm4
vfmadd231pd zmm3, zmm6, zmmword ptr [r10 + 8*rbp - 128] # zmm3 = (zmm6 * mem) + zmm3
vfmadd231pd zmm1, zmm7, zmmword ptr [r10 + 8*rbp - 64] # zmm1 = (zmm7 * mem) + zmm1
vfmadd231pd zmm2, zmm8, zmmword ptr [r10 + 8*rbp] # zmm2 = (zmm8 * mem) + zmm2
add rbp, 32
cmp rdi, rbp
jne .LBB0_5
# %bb.6: # %L232
# in Loop: Header=BB0_2 Depth=1
vaddpd zmm1, zmm4, zmm1
vaddpd zmm2, zmm3, zmm2
mov r15, rdi
.LBB0_7: # %L237
# in Loop: Header=BB0_2 Depth=1
mov rbp, qword ptr [rsp - 64] # 8-byte Reload
add rbp, r14
add rbp, -2
imul rbp, r13
add r8, qword ptr [rsp - 16] # 8-byte Folded Reload
cmp r15, rax
jne .LBB0_9
# %bb.8: # in Loop: Header=BB0_2 Depth=1
mov r15, rax
add rbp, qword ptr [rsp - 8] # 8-byte Folded Reload
cmp r15, rbx
jg .LBB0_14
jmp .LBB0_11
.p2align 4, 0x90
.LBB0_9: # %L260
# in Loop: Header=BB0_2 Depth=1
vmovupd zmm3, zmmword ptr [rdx + 8*r15]
vmovupd zmm4, zmmword ptr [rdx + 8*r15 + 64]
vfmadd231pd zmm1, zmm3, zmmword ptr [r8 + 8*r15] # zmm1 = (zmm3 * mem) + zmm1
vfmadd231pd zmm2, zmm4, zmmword ptr [r8 + 8*r15 + 64] # zmm2 = (zmm4 * mem) + zmm2
or r15, 16
add rbp, qword ptr [rsp - 8] # 8-byte Folded Reload
cmp r15, rbx
jg .LBB0_14
.LBB0_11: # %L280
# in Loop: Header=BB0_2 Depth=1
cmp qword ptr [rsp - 72], r15 # 8-byte Folded Reload
jge .LBB0_13
# %bb.12: # %L291
# in Loop: Header=BB0_2 Depth=1
mov ecx, dword ptr [rsp - 84] # 4-byte Reload
kmovd k1, ecx
vmovupd zmm3 {k1} {z}, zmmword ptr [rdx + 8*r15]
vmovupd zmm4 {k1} {z}, zmmword ptr [r8 + 8*r15]
vfmadd231pd zmm1 {k1}, zmm3, zmm4 # zmm1 {%k1} = (zmm3 * zmm4) + zmm1
jmp .LBB0_14
.p2align 4, 0x90
.LBB0_13: # %L299
# in Loop: Header=BB0_2 Depth=1
vmovupd zmm3, zmmword ptr [rdx + 8*r15]
mov ecx, dword ptr [rsp - 84] # 4-byte Reload
kmovd k1, ecx
vmovupd zmm4 {k1} {z}, zmmword ptr [rdx + 8*r15 + 64]
vmovupd zmm5 {k1} {z}, zmmword ptr [r8 + 8*r15 + 64]
vfmadd231pd zmm1, zmm3, zmmword ptr [r8 + 8*r15] # zmm1 = (zmm3 * mem) + zmm1
vfmadd231pd zmm2 {k1}, zmm4, zmm5 # zmm2 {%k1} = (zmm4 * zmm5) + zmm2
.LBB0_14: # %L309
# in Loop: Header=BB0_2 Depth=1
vaddpd zmm1, zmm1, zmm2
vmovsd xmm2, qword ptr [r12 + 8*rbp] # xmm2 = mem[0],zero
vextractf64x4 ymm3, zmm1, 1
vaddpd zmm1, zmm1, zmm3
vextractf128 xmm3, ymm1, 1
vaddpd xmm1, xmm1, xmm3
vpermilpd xmm3, xmm1, 1 # xmm3 = xmm1[1,0]
vaddsd xmm1, xmm1, xmm3
vaddsd xmm1, xmm2, xmm1
vmulsd xmm1, xmm1, xmm0
vsubsd xmm2, xmm2, xmm1
vmovsd qword ptr [r12 + 8*rbp], xmm2
test rdi, rdi
je .LBB0_17
# %bb.15: # %L563.lr.ph
# in Loop: Header=BB0_2 Depth=1
vbroadcastsd zmm2, xmm1
xor ebp, ebp
.p2align 4, 0x90
.LBB0_16: # %L563
# Parent Loop BB0_2 Depth=1
# => This Inner Loop Header: Depth=2
vmovupd zmm3, zmmword ptr [rsi + 8*rbp - 192]
vmovupd zmm4, zmmword ptr [rsi + 8*rbp - 128]
vmovupd zmm5, zmmword ptr [rsi + 8*rbp - 64]
vmovupd zmm6, zmmword ptr [rsi + 8*rbp]
vfnmadd213pd zmm3, zmm2, zmmword ptr [r10 + 8*rbp - 192] # zmm3 = -(zmm2 * zmm3) + mem
vfnmadd213pd zmm4, zmm2, zmmword ptr [r10 + 8*rbp - 128] # zmm4 = -(zmm2 * zmm4) + mem
vfnmadd213pd zmm5, zmm2, zmmword ptr [r10 + 8*rbp - 64] # zmm5 = -(zmm2 * zmm5) + mem
vfnmadd213pd zmm6, zmm2, zmmword ptr [r10 + 8*rbp] # zmm6 = -(zmm2 * zmm6) + mem
vmovupd zmmword ptr [r10 + 8*rbp - 192], zmm3
vmovupd zmmword ptr [r10 + 8*rbp - 128], zmm4
vmovupd zmmword ptr [r10 + 8*rbp - 64], zmm5
vmovupd zmmword ptr [r10 + 8*rbp], zmm6
add rbp, 32
cmp rdi, rbp
jne .LBB0_16
.LBB0_17: # %L589
# in Loop: Header=BB0_2 Depth=1
cmp rdi, rbx
jg .LBB0_25
# %bb.18: # %L591
# in Loop: Header=BB0_2 Depth=1
cmp qword ptr [rsp - 72], rdi # 8-byte Folded Reload
jl .LBB0_19
# %bb.20: # %L613
# in Loop: Header=BB0_2 Depth=1
cmp qword ptr [rsp - 32], rdi # 8-byte Folded Reload
jge .LBB0_22
# %bb.21: # %L616
# in Loop: Header=BB0_2 Depth=1
vmovupd zmm2, zmmword ptr [rdx + 8*rdi]
mov ecx, dword ptr [rsp - 84] # 4-byte Reload
kmovd k1, ecx
vmovupd zmm3 {k1} {z}, zmmword ptr [rdx + 8*rdi + 64]
vmovupd zmm4 {k1} {z}, zmmword ptr [r8 + 8*rdi + 64]
vbroadcastsd zmm1, xmm1
vfnmadd213pd zmm2, zmm1, zmmword ptr [r8 + 8*rdi] # zmm2 = -(zmm1 * zmm2) + mem
vfnmadd231pd zmm4, zmm1, zmm3 # zmm4 = -(zmm1 * zmm3) + zmm4
vmovupd zmmword ptr [r8 + 8*rdi], zmm2
vmovupd zmmword ptr [r8 + 8*rdi + 64] {k1}, zmm4
jmp .LBB0_25
.LBB0_22: # %L632
# in Loop: Header=BB0_2 Depth=1
vmovupd zmm3, zmmword ptr [rdx + 8*rdi]
vmovupd zmm2, zmmword ptr [rdx + 8*rdi + 64]
cmp qword ptr [rsp - 40], rdi # 8-byte Folded Reload
jge .LBB0_24
# %bb.23: # %L635
# in Loop: Header=BB0_2 Depth=1
lea rbp, [rdx + 8*rdi]
add rbp, 128
mov ecx, dword ptr [rsp - 84] # 4-byte Reload
kmovd k1, ecx
vmovupd zmm4 {k1} {z}, zmmword ptr [rbp]
vmovupd zmm5 {k1} {z}, zmmword ptr [r8 + 8*rdi + 128]
vbroadcastsd zmm1, xmm1
vfnmadd213pd zmm3, zmm1, zmmword ptr [r8 + 8*rdi] # zmm3 = -(zmm1 * zmm3) + mem
vfnmadd213pd zmm2, zmm1, zmmword ptr [r8 + 8*rdi + 64] # zmm2 = -(zmm1 * zmm2) + mem
vfnmadd231pd zmm5, zmm1, zmm4 # zmm5 = -(zmm1 * zmm4) + zmm5
vmovupd zmmword ptr [r8 + 8*rdi], zmm3
vmovupd zmmword ptr [r8 + 8*rdi + 64], zmm2
vmovupd zmmword ptr [r8 + 8*rdi + 128] {k1}, zmm5
jmp .LBB0_25
.LBB0_24: # %L656
# in Loop: Header=BB0_2 Depth=1
lea rbp, [rdx + 8*rdi]
vmovupd zmm4, zmmword ptr [rbp + 128]
mov ecx, dword ptr [rsp - 84] # 4-byte Reload
kmovd k1, ecx
vmovupd zmm5 {k1} {z}, zmmword ptr [rbp + 192]
vmovupd zmm6 {k1} {z}, zmmword ptr [r8 + 8*rdi + 192]
vbroadcastsd zmm1, xmm1
vfnmadd213pd zmm3, zmm1, zmmword ptr [r8 + 8*rdi] # zmm3 = -(zmm1 * zmm3) + mem
vfnmadd213pd zmm2, zmm1, zmmword ptr [r8 + 8*rdi + 64] # zmm2 = -(zmm1 * zmm2) + mem
vfnmadd213pd zmm4, zmm1, zmmword ptr [r8 + 8*rdi + 128] # zmm4 = -(zmm1 * zmm4) + mem
vfnmadd231pd zmm6, zmm1, zmm5 # zmm6 = -(zmm1 * zmm5) + zmm6
vmovupd zmmword ptr [r8 + 8*rdi], zmm3
vmovupd zmmword ptr [r8 + 8*rdi + 64], zmm2
vmovupd zmmword ptr [r8 + 8*rdi + 128], zmm4
vmovupd zmmword ptr [r8 + 8*rdi + 192] {k1}, zmm6
jmp .LBB0_25
.Lfunc_end0:
.size "julia_reflectorApply!_4509", .Lfunc_end0-"julia_reflectorApply!_4509"
.cfi_endproc
# -- End function
.type .L_j_const1,@object # @_j_const1
.section .rodata.cst8,"aM",@progbits,8
.p2align 3
.L_j_const1:
.quad 1 # 0x1
.size .L_j_const1, 8
.section ".note.GNU-stack","",@progbits |
There's basically only one change between 0.12.118 and 0.12.119: v0.12.118...v0.12.119 However, I cannot reproduce a performance problem with either AVX512 or AVX2. Additionally, that change should be irrelevant here. |
Versioninfo: Julia Version 1.7.0
Commit 3bf9d17731 (2021-11-30 12:12 UTC)
Platform Info:
OS: Linux (x86_64-pc-linux-gnu)
CPU: Intel(R) Core(TM) i5-6600 CPU @ 3.30GHz
WORD_SIZE: 64
LIBM: libopenlibm
LLVM: libLLVM-12.0.1 (ORCJIT, skylake)
I put code_typed and native in a gist since they're quite long. I checked this on the login node of a cluster now as well, and I don't get a slowdown there. So maybe it's specific to Julia 1.7 or my CPU?
julia> versioninfo()
Julia Version 1.7.1
Commit ac5cc99908* (2021-12-22 19:35 UTC)
Platform Info:
OS: Linux (x86_64-pc-linux-gnu)
CPU: Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
WORD_SIZE: 64
LIBM: libopenlibm
LLVM: libLLVM-12.0.1 (ORCJIT, skylake-avx512)
# with LoopVectorization 0.12.125
julia> @benchmark reflectorApply!($x, $τ, $y)
BenchmarkTools.Trial: 10000 samples with 165 evaluations.
Range (min … max): 651.345 ns … 2.410 μs ┊ GC (min … max): 0.00% … 0.00%
Time (median): 692.697 ns ┊ GC (median): 0.00%
Time (mean ± σ): 703.326 ns ± 64.135 ns ┊ GC (mean ± σ): 0.00% ± 0.00%
▅█ ▁▂▁ ▁▄▆ ▁ ▃▅▆ ▁▂ ▁ ▃ ▁
██▃████▆████▆▇██▆▇████▆▇██▇▇███▇▇▇▇▇▆▅▆▄▆▆▅▄▄▂▄▃▂▃▂▃▄█▅▂▂▅▆▄ █
651 ns Histogram: log(frequency) by time 914 ns <
Memory estimate: 0 bytes, allocs estimate: 0. |
From your code: movabs rax, offset StrideIndex
vzeroupper
call rax
mov r13, qword ptr [r13]
mov rax, qword ptr [rsp + 96]
add rax, r13
mov qword ptr [rsp + 104], rax
mov rdi, r12
mov rsi, qword ptr [rsp + 32]
movabs rax, offset StrideIndex
call rax or │ invoke LayoutPointers.StrideIndex(x::SubArray{Float64, 1, Matrix{Float64}, Tuple{UnitRange{Int64}, Int64}, true})::ArrayInterface.StrideIndex{1, (1,), 1, Tuple{Static.StaticInt{1}}, Tuple{Static.StaticInt{1}}}
│ %136 = Base.getfield(A, :parent)::Matrix{Float64}
│ %137 = $(Expr(:foreigncall, :(:jl_array_ptr), Ptr{Float64}, svec(Any), 0, :(:ccall), :(%136)))::Ptr{Float64}
│ %138 = Base.getfield(A, :parent)::Matrix{Float64}
│ %139 = Base.getfield(A, :indices)::Tuple{UnitRange{Int64}, UnitRange{Int64}}
│ %140 = LayoutPointers.getfield(%139, 1, false)::UnitRange{Int64}
│ %141 = Base.getfield(%140, :start)::Int64
│ %142 = Base.sub_int(%141, 1)::Int64
│ %143 = Core.getfield(%139, 2)::UnitRange{Int64}
│ %144 = Base.getfield(%143, :start)::Int64
│ %145 = Base.sub_int(%144, 1)::Int64
│ %146 = Base.arraysize(%138, 1)::Int64
│ Base.arraysize(%138, 2)::Int64
│ %148 = Base.mul_int(1, %146)::Int64
│ %149 = Base.mul_int(%142, 1)::Int64
│ %150 = Base.mul_int(%145, %148)::Int64
│ %151 = Base.add_int(%149, %150)::Int64
│ %152 = Base.mul_int(8, %151)::Int64
│ %153 = Core.bitcast(Core.UInt, %137)::UInt64
│ %154 = Base.bitcast(UInt64, %152)::UInt64
│ %155 = Base.add_ptr(%153, %154)::UInt64
│ %156 = Core.bitcast(Ptr{Float64}, %155)::Ptr{Float64}
│ %157 = invoke LayoutPointers.StrideIndex(A::SubArray{Float64, 2, Matrix{Float64}, Tuple{UnitRange{Int64}, UnitRange{Int64}}, false})::ArrayInterface.StrideIndex{2, (1, 2), 1, Tuple{Static.StaticInt{1}, Int64}, Tuple{Static.StaticInt{1}, Static.StaticInt{1}}} That is bad. |
What do you get for |
(@v1.7) pkg> st -m ArrayInterface
Status `~/.julia/environments/v1.7/Manifest.toml`
[4fba245c] ArrayInterface v6.0.22 |
And no slowdown in 1.8.0 |
I'm guessing it is because |
I also tried (@v1.7) pkg> activate .
Activating new project at `~/Documents/julia source`
(julia source) pkg> add LoopVectorization BenchmarkTools and I still get a slowdown in 1.7. The package version match, except for some packages getting version numbers in 1.8 while not having any in 1.7 (ArgTools, Downloads, ...) |
With JuliaArrays/ArrayInterface.jl#343: https://gist.github.com/ffreyer/d63a95c21857678e8b76f40fff457a4a I'm fine with switching to the view-less version/upgrading Julia too |
Could you share the code typed? That looks really bad, but I don't know what functions are being called. |
I updated the gist |
@Tokazama do you want to make sure |
I can confirm that I see the regression on 1.7.3: julia> @benchmark reflectorApply!($x, $τ, $y)
BenchmarkTools.Trial: 10000 samples with 1 evaluation.
Range (min … max): 77.337 μs … 142.382 μs ┊ GC (min … max): 0.00% … 0.00%
Time (median): 127.234 μs ┊ GC (median): 0.00%
Time (mean ± σ): 127.364 μs ± 2.679 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
█▂
▂▂▂▁▂▁▂▂▂▂▂▂▂▂▁▂▂▂▁▂▂▂▂▂▂▂▂▂▁▂▂▂▂▁▁▂▂▂▂▂▂▁▂▂▂▁▁▂▂▁▂▁▂▂▂▄██▄▄▃ ▂
77.3 μs Histogram: frequency by time 131 μs <
Memory estimate: 0 bytes, allocs estimate: 0.
julia> versioninfo()
Julia Version 1.7.3
Commit 742b9abb4d (2022-05-06 12:58 UTC)
Platform Info:
OS: Linux (x86_64-redhat-linux)
CPU: Intel(R) Core(TM) i9-7980XE CPU @ 2.60GHz |
35 ──││││││ %87 = ArrayInterfaceCore.is_splat_index::Core.Const(ArrayInterfaceCore.is_splat_index)
│ ││││││┌ @ /home/chriselrod/.julia/dev/ArrayInterface/lib/ArrayInterfaceCore/src/ArrayInterfaceCore.jl:39 within `map_tuple_type`
│ │││││││ %88 = %new(ArrayInterfaceCore.var"#1#2"{typeof(ArrayInterfaceCore.is_splat_index), DataType}, %87, Tuple{UnitRange{Int64}, Int64})::Core.Const(ArrayInterfaceCore.var"#1#2"{typeof(ArrayInterfaceCore.is_splat_index), DataType}(ArrayInterfaceCore.is_splat_index, Tuple{UnitRange{Int64}, Int64}))
│ │││││││┌ @ ntuple.jl:49 within `ntuple`
│ ││││││││ invoke %88(1::Int64)
│ ││││││││ invoke %88(2::Int64) and %89 = invoke #1(::Int64)::Core.Const(false)
%90 = invoke #1(::Int64)::Core.Const(false)
%96 = invoke #1(::Int64)::Core.Const(false)
%97 = invoke #1(::Int64)::Core.Const(false)
%101 = invoke #1(::Int64)::Core.Const(false)
%102 = invoke #1(::Int64)::Core.Const(false)
%132 = invoke #1(::Int64)::Core.Const(false)
%133 = invoke #1(::Int64)::Core.Const(false)
%139 = invoke #1(::Int64)::Core.Const(false)
%140 = invoke #1(::Int64)::Core.Const(false)
%144 = invoke #1(::Int64)::Core.Const(false)
%145 = invoke #1(::Int64)::Core.Const(false)
%378 = invoke #1(::Int64)::Core.Const(false)
%379 = invoke #1(::Int64)::Core.Const(false)
%385 = invoke #1(::Int64)::Core.Const(false)
%386 = invoke #1(::Int64)::Core.Const(false)
%390 = invoke #1(::Int64)::Core.Const(false)
%391 = invoke #1(::Int64)::Core.Const(false)
%431 = invoke #1(::Int64)::Core.Const(false)
%432 = invoke #1(::Int64)::Core.Const(false)
%438 = invoke #1(::Int64)::Core.Const(false)
%439 = invoke #1(::Int64)::Core.Const(false)
%443 = invoke #1(::Int64)::Core.Const(false)
%444 = invoke #1(::Int64)::Core.Const(false) Look at all these useless function calls known to return |
@ffreyer Try the ArrayInterfaceCore version from that PR, with the |
With that I get julia> @benchmark reflectorApply!($x, $τj, $y)
BenchmarkTools.Trial: 10000 samples with 71 evaluations.
Range (min … max): 847.521 ns … 3.534 μs ┊ GC (min … max): 0.00% … 0.00%
Time (median): 921.204 ns ┊ GC (median): 0.00%
Time (mean ± σ): 972.610 ns ± 190.748 ns ┊ GC (mean ± σ): 0.00% ± 0.00%
█ ▇▃▂▁
▃█▇████▆▃▂▁▁▁▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁ ▂
848 ns Histogram: frequency by time 1.84 μs <
Memory estimate: 0 bytes, allocs estimate: 0. which is much better but still a bit slower than it used to be |
Yeah. The 35 ── %87 = ArrayInterfaceCore.fieldtype(Tuple{UnitRange{Int64}, Int64}, 1)::Union{Type{UnitRange{Int64}}, Type{Int64}} ││╻╷╷╷╷╷╷╷ bytestrideindex
│ %88 = (isa)(%87, Type{UnitRange{Int64}})::Bool │││┃││││││ StrideIndex
└──── goto #37 if not %88 ││││┃│││││ stride_rank
36 ── goto #40 │││││┃││││ to_parent_dims
37 ── %91 = (isa)(%87, Type{Int64})::Bool ││││││┃│││ IndicesInfo
└──── goto #39 if not %91 │││││││┃││ map_tuple_type
38 ── goto #40 ││││││││┃│ ntuple
39 ── Core.throw(ErrorException("fatal error in type inference (type bound)"))::Union{} │││││││││┃ #36
└──── unreachable ││││││││││
40 ┄─ goto #41 ││││││││││
41 ── %97 = ArrayInterfaceCore.fieldtype(Tuple{UnitRange{Int64}, Int64}, 2)::Union{Type{UnitRange{Int64}}, Type{Int64}} ││││││││││
│ %98 = (isa)(%97, Type{UnitRange{Int64}})::Bool ││││││││││
└──── goto #43 if not %98 ││││││││││
42 ── goto #46 ││││││││││
43 ── %101 = (isa)(%97, Type{Int64})::Bool ││││││││││
└──── goto #45 if not %101 ││││││││││
44 ── goto #46 ││││││││││
45 ── Core.throw(ErrorException("fatal error in type inference (type bound)"))::Union{} ││││││││││
└──── unreachable ││││││││││
46 ┄─ goto #47 ││││││││││
47 ── goto #48 │││││││││
48 ── goto #49 ││││││││
49 ── %109 = ArrayInterfaceCore.fieldtype(Tuple{UnitRange{Int64}, Int64}, 1)::Union{Type{UnitRange{Int64}}, Type{Int64}} ││││││││╻╷ ntuple
│ %110 = (isa)(%109, Type{UnitRange{Int64}})::Bool │││││││││┃ #36
└──── goto #51 if not %110 ││││││││││
50 ── goto #54 ││││││││││
51 ── %113 = (isa)(%109, Type{Int64})::Bool ││││││││││
└──── goto #53 if not %113 ││││││││││
52 ── goto #54 ││││││││││
53 ── Core.throw(ErrorException("fatal error in type inference (type bound)"))::Union{} ││││││││││
└──── unreachable ││││││││││
54 ┄─ goto #55 ││││││││││
55 ── %119 = ArrayInterfaceCore.fieldtype(Tuple{UnitRange{Int64}, Int64}, 2)::Union{Type{UnitRange{Int64}}, Type{Int64}} ││││││││││
│ %120 = (isa)(%119, Type{UnitRange{Int64}})::Bool ││││││││││
└──── goto #57 if not %120 ││││││││││
56 ── goto #60 ││││││││││
57 ── %123 = (isa)(%119, Type{Int64})::Bool ││││││││││
└──── goto #59 if not %123 ││││││││││
58 ── goto #60 ││││││││││
59 ── Core.throw(ErrorException("fatal error in type inference (type bound)"))::Union{} ││││││││││
└──── unreachable ││││││││││
60 ┄─ goto #61 ││││││││││
61 ── goto #62 │││││││││
62 ── goto #63 ││││││││
63 ── goto #64 │││││││
64 ── goto #65 ││││││
65 ── goto #66 │││││
66 ── nothing::Nothing │
67 ── nothing::Nothing │
68 ── %136 = ArrayInterfaceCore.fieldtype(Tuple{UnitRange{Int64}, Int64}, 1)::Union{Type{UnitRange{Int64}}, Type{Int64}} │││││╻╷╷╷╷ from_parent_dims
│ %137 = (isa)(%136, Type{UnitRange{Int64}})::Bool ││││││┃│││ IndicesInfo
└──── goto #70 if not %137 │││││││┃││ map_tuple_type
69 ── goto #73 ││││││││┃│ ntuple
70 ── %140 = (isa)(%136, Type{Int64})::Bool │││││││││┃ #36
└──── goto #72 if not %140 ││││││││││
71 ── goto #73 ││││││││││
72 ── Core.throw(ErrorException("fatal error in type inference (type bound)"))::Union{} ││││││││││
└──── unreachable ││││││││││
73 ┄─ goto #74 ││││││││││
74 ── %146 = ArrayInterfaceCore.fieldtype(Tuple{UnitRange{Int64}, Int64}, 2)::Union{Type{UnitRange{Int64}}, Type{Int64}} ││││││││││
│ %147 = (isa)(%146, Type{UnitRange{Int64}})::Bool ││││││││││
└──── goto #76 if not %147 ││││││││││
75 ── goto #79 ││││││││││
76 ── %150 = (isa)(%146, Type{Int64})::Bool ││││││││││
└──── goto #78 if not %150 ││││││││││
77 ── goto #79 ││││││││││
78 ── Core.throw(ErrorException("fatal error in type inference (type bound)"))::Union{} ││││││││││
└──── unreachable ││││││││││
79 ┄─ goto #80 ││││││││││
80 ── goto #81 │││││││││
81 ── goto #82 ││││││││
82 ── %158 = ArrayInterfaceCore.fieldtype(Tuple{UnitRange{Int64}, Int64}, 1)::Union{Type{UnitRange{Int64}}, Type{Int64}} ││││││││╻╷ ntuple
│ %159 = (isa)(%158, Type{UnitRange{Int64}})::Bool │││││││││┃ #36
└──── goto #84 if not %159 ││││││││││
83 ── goto #87 ││││││││││
84 ── %162 = (isa)(%158, Type{Int64})::Bool ││││││││││
└──── goto #86 if not %162 ││││││││││
85 ── goto #87 ││││││││││
86 ── Core.throw(ErrorException("fatal error in type inference (type bound)"))::Union{} ││││││││││
└──── unreachable ││││││││││
87 ┄─ goto #88 ││││││││││
88 ── %168 = ArrayInterfaceCore.fieldtype(Tuple{UnitRange{Int64}, Int64}, 2)::Union{Type{UnitRange{Int64}}, Type{Int64}} ││││││││││
│ %169 = (isa)(%168, Type{UnitRange{Int64}})::Bool ││││││││││
└──── goto #90 if not %169 ││││││││││
89 ── goto #93 ││││││││││
90 ── %172 = (isa)(%168, Type{Int64})::Bool ││││││││││
└──── goto #92 if not %172 ││││││││││
91 ── goto #93 ││││││││││
92 ── Core.throw(ErrorException("fatal error in type inference (type bound)"))::Union{} ││││││││││
└──── unreachable ││││││││││
93 ┄─ goto #94 ││││││││││
94 ── goto #95 │││││││││
95 ── goto #96 ││││││││
96 ── goto #97 │││││││
97 ── goto #98 ││││││
98 ── goto #99 │││││
99 ── %183 = ArrayInterfaceCore.fieldtype(Tuple{UnitRange{Int64}, Int64}, 1)::Union{Type{UnitRange{Int64}}, Type{Int64}} │││││╻╷╷╷╷╷ strides
│ %184 = (isa)(%183, Type{UnitRange{Int64}})::Bool ││││││┃││││ to_parent_dims
└──── goto #101 if not %184 │││││││┃│││ IndicesInfo
100 ─ goto #104 ││││││││┃││ map_tuple_type
101 ─ %187 = (isa)(%183, Type{Int64})::Bool │││││││││┃│ ntuple
└──── goto #103 if not %187 ││││││││││┃ #36
102 ─ goto #104 │││││││││││
103 ─ Core.throw(ErrorException("fatal error in type inference (type bound)"))::Union{} │││││││││││
└──── unreachable │││││││││││
104 ┄ goto #105 │││││││││││
105 ─ %193 = ArrayInterfaceCore.fieldtype(Tuple{UnitRange{Int64}, Int64}, 2)::Union{Type{UnitRange{Int64}}, Type{Int64}} │││││││││││
│ %194 = (isa)(%193, Type{UnitRange{Int64}})::Bool │││││││││││
└──── goto #107 if not %194 │││││││││││
106 ─ goto #110 │││││││││││
107 ─ %197 = (isa)(%193, Type{Int64})::Bool │││││││││││
└──── goto #109 if not %197 │││││││││││
108 ─ goto #110 │││││││││││
109 ─ Core.throw(ErrorException("fatal error in type inference (type bound)"))::Union{} │││││││││││
└──── unreachable │││││││││││
110 ┄ goto #111 │││││││││││
111 ─ goto #112 ││││││││││
112 ─ goto #113 │││││││││
113 ─ %205 = ArrayInterfaceCore.fieldtype(Tuple{UnitRange{Int64}, Int64}, 1)::Union{Type{UnitRange{Int64}}, Type{Int64}} │││││││││╻╷ ntuple
│ %206 = (isa)(%205, Type{UnitRange{Int64}})::Bool ││││││││││┃ #36
└──── goto #115 if not %206 │││││││││││
114 ─ goto #118 │││││││││││
115 ─ %209 = (isa)(%205, Type{Int64})::Bool │││││││││││
└──── goto #117 if not %209 │││││││││││
116 ─ goto #118 │││││││││││
117 ─ Core.throw(ErrorException("fatal error in type inference (type bound)"))::Union{} │││││││││││
└──── unreachable │││││││││││
118 ┄ goto #119 │││││││││││
119 ─ %215 = ArrayInterfaceCore.fieldtype(Tuple{UnitRange{Int64}, Int64}, 2)::Union{Type{UnitRange{Int64}}, Type{Int64}} │││││││││││
│ %216 = (isa)(%215, Type{UnitRange{Int64}})::Bool │││││││││││
└──── goto #121 if not %216 │││││││││││
120 ─ goto #124 │││││││││││
121 ─ %219 = (isa)(%215, Type{Int64})::Bool │││││││││││
└──── goto #123 if not %219 │││││││││││
122 ─ goto #124 │││││││││││
123 ─ Core.throw(ErrorException("fatal error in type inference (type bound)"))::Union{} │││││││││││
└──── unreachable │││││││││││
124 ┄ goto #125 │││││││││││
125 ─ goto #126 ││││││││││ and I suspect this makes it into the generated code, but I'm not sure. |
We might need to revert the changes to |
This is probably one of those instances where I should see what comes out of this when we use |
FWIW, master seems fine. I haven't tried 1.8. So we could have different versions for >= 1.8 vs not. |
I haven't tested this particular issue, but I know from experience that pre 1.8 inference requires a lot more explicitly defined types |
This isn't even an inference failure. The compiler just isn't doing its job:
82 ── %158 = ArrayInterfaceCore.fieldtype(Tuple{UnitRange{Int64}, Int64}, 1)::Union{Type{UnitRange{Int64}}, Type{Int64}} ││││││││╻╷ ntuple
│ %159 = (isa)(%158, Type{UnitRange{Int64}})::Bool |
(1) With the new changes from the pr: julia> @benchmark reflectorApply!($x, $τj, $y)
BenchmarkTools.Trial: 10000 samples with 156 evaluations.
Range (min … max): 662.442 ns … 1.881 μs ┊ GC (min … max): 0.00% … 0.00%
Time (median): 739.295 ns ┊ GC (median): 0.00%
Time (mean ± σ): 754.946 ns ± 87.526 ns ┊ GC (mean ± σ): 0.00% ± 0.00%
█ ▁
▄▁█▅▄█▄██████▆▅▅▄▃▃▃▃▃▃▂▂▂▂▂▂▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂ ▃
662 ns Histogram: frequency by time 1.14 μs <
Memory estimate: 0 bytes, allocs estimate: 0. vs (2) [email protected]: @benchmark reflectorApply!($x, $τ, $y)
BenchmarkTools.Trial: 10000 samples with 151 evaluations.
Range (min … max): 677.364 ns … 1.560 μs ┊ GC (min … max): 0.00% … 0.00%
Time (median): 709.536 ns ┊ GC (median): 0.00%
Time (mean ± σ): 742.538 ns ± 103.227 ns ┊ GC (mean ± σ): 0.00% ± 0.00%
█ ▆▇▂▂ ▁ ▁▂▂▁▁▁▁ ▁
▅█▆███████▆█████████▇▇███▇███▇▇▇▇▇▇▆█▆▇▇▇█▇▇▆▅▅▆▇▆▇▇▇▇▆▄▄▄▅▅▆ █
677 ns Histogram: log(frequency) by time 1.17 μs <
Memory estimate: 0 bytes, allocs estimate: 0. Across multiple benchmarks (1) usually has lower min times, but higher median, mean and max times than (2). |
If you want to fix things, just |
I have a reimplementation of `LinearAlgebra.qr` in my codebase which uses `@turbo`. In it there is a `reflectorApply!` function, which is called with two views into the same array:
I've noticed that after triggering a package update things have been running more slowly, and I identified that the problem is here. Trying some different versions of LoopVectorization showed that v0.12.119 had made this much slower:
Rewriting this to use no views restores the performance from before 0.12.119.
The text was updated successfully, but these errors were encountered: