Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support vectorization of the gc.loaded intrinsics #56188

Draft
wants to merge 3 commits into
base: master
Choose a base branch
from

Conversation

vchuravy
Copy link
Member

@vchuravy vchuravy commented Oct 16, 2024

Motivated by #56145 (comment), I was interested in seeing whether we could teach LLVM
to vectorize our intrinsics that are removed before the final result,
e.g. @julia.gc_loaded.

This is an alternative to #51536 and uses the "vector-function-abi-variant" attribute.

This creates a lot of code duplication, so it might make sense to do #52945 first.

On nightly (54299d9)

; Function Signature: var"#2"(FixedSizeArrays.FixedSizeArray{Float64, 1, Memory{Float64}})
; NOTE(review): annotations marked "NOTE(review)" were added for readers and are not
; compiler output. In this listing the loop (%L30) is scalar: every iteration performs
; one 8-byte store of 4607182418800017408 (0x3FF0000000000000, the bit pattern of the
; Float64 value 1.0).
define void @"julia_#2_2050"(ptr nocapture noundef nonnull readonly align 8 dereferenceable(16) %"v::FixedSizeArray", ptr nocapture readonly %.roots.v) #0 {
top:
  ; NOTE(review): %.unbox is the i64 at byte offset 8 of the argument and serves as the
  ; loop bound (presumably the array length -- confirm against the FixedSizeArray layout).
  %0 = getelementptr inbounds i8, ptr %"v::FixedSizeArray", i64 8
  %.unbox = load i64, ptr %0, align 8
  ; NOTE(review): a bound below 1 skips the loop entirely.
  %1 = icmp slt i64 %.unbox, 1
  br i1 %1, label %L48, label %L13.preheader15

L13.preheader15:                                  ; preds = %top
  ; NOTE(review): the pointer in field 1 of the { i64, ptr } object reached through
  ; %.roots.v is loaded once here, outside the loop, and used as the store base.
  %memoryref_mem = load ptr, ptr %.roots.v, align 8
  %memory_data_ptr = getelementptr inbounds { i64, ptr }, ptr %memoryref_mem, i64 0, i32 1
  %memoryref_data.pre = load ptr, ptr %memory_data_ptr, align 8
  br label %L30

L30:                                              ; preds = %L30, %L13.preheader15
  ; NOTE(review): %value_phi3 is a counter starting at 1; the store address is
  ; base + (counter << 3) - 8, i.e. base + (counter - 1) * 8.
  %value_phi3 = phi i64 [ %3, %L30 ], [ 1, %L13.preheader15 ]
  %memoryref_offset = shl i64 %value_phi3, 3
  %2 = getelementptr i8, ptr %memoryref_data.pre, i64 %memoryref_offset
  %memoryref_data6 = getelementptr i8, ptr %2, i64 -8
  store i64 4607182418800017408, ptr %memoryref_data6, align 8
  %3 = add nuw i64 %value_phi3, 1
  ; NOTE(review): the exit test uses the pre-increment counter, so the loop body runs
  ; for counter values 1 through %.unbox.
  %4 = icmp ult i64 %value_phi3, %.unbox
  br i1 %4, label %L30, label %L48

L48:                                              ; preds = %L30, %top
  ret void
}

On this PR:

; Function Signature: var"#2"(FixedSizeArrays.FixedSizeArray{Float64, 1, Memory{Float64}})
; NOTE(review): annotations marked "NOTE(review)" were added for readers and are not
; compiler output. Compared with the nightly listing, this function has an 8-lane
; vector loop (%vector.body) in front of the original scalar loop (%L30), which now
; handles only the remainder or short inputs.
define void @"julia_#2_2053"(ptr nocapture noundef nonnull readonly align 8 dereferenceable(16) %"v::FixedSizeArray", ptr nocapture readonly %.roots.v) #0 {
top:
  ; NOTE(review): %.unbox (i64 at byte offset 8 of the argument) is the loop bound;
  ; a bound below 1 skips all loops.
  %0 = getelementptr inbounds i8, ptr %"v::FixedSizeArray", i64 8
  %.unbox = load i64, ptr %0, align 8
  %1 = icmp slt i64 %.unbox, 1
  br i1 %1, label %L48, label %L13.preheader15

L13.preheader15:                                  ; preds = %top
  %memoryref_mem = load ptr, ptr %.roots.v, align 8
  %memory_data_ptr = getelementptr inbounds { i64, ptr }, ptr %memoryref_mem, i64 0, i32 1
  ; NOTE(review): with fewer than 8 iterations the vector loop is bypassed.
  %min.iters.check = icmp ult i64 %.unbox, 8
  %memoryref_data.pre.pre = load ptr, ptr %memory_data_ptr, align 8
  br i1 %min.iters.check, label %scalar.ph, label %vector.ph

vector.ph:                                        ; preds = %L13.preheader15
  ; NOTE(review): 9223372036854775800 is 0x7FFFFFFFFFFFFFF8, so %n.vec is %.unbox with
  ; its low three bits (and the top bit) cleared: the iteration count rounded down to a
  ; multiple of 8. %ind.end = %n.vec | 1 is the counter value at which the scalar loop
  ; resumes.
  %n.vec = and i64 %.unbox, 9223372036854775800
  %ind.end = or disjoint i64 %n.vec, 1
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  ; NOTE(review): %vec.ind holds eight consecutive counter values, starting at <1..8>
  ; and advancing by 8 per iteration.
  %vec.ind = phi <8 x i64> [ <i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8>, %vector.ph ], [ %vec.ind.next, %vector.body ]
  ; NOTE(review): the base pointer is broadcast to all lanes and each lane computes
  ; base + (counter << 3) - 8, mirroring the scalar address computation.
  %broadcast.splatinsert20 = insertelement <8 x ptr> poison, ptr %memoryref_data.pre.pre, i64 0
  %broadcast.splat21 = shufflevector <8 x ptr> %broadcast.splatinsert20, <8 x ptr> poison, <8 x i32> zeroinitializer
  %2 = shl <8 x i64> %vec.ind, <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>
  %3 = getelementptr i8, <8 x ptr> %broadcast.splat21, <8 x i64> %2
  %4 = getelementptr i8, <8 x ptr> %3, i64 -8
  ; NOTE(review): the eight stores are emitted as a masked scatter with an all-true mask
  ; rather than as one contiguous vector store, even though the lane addresses are 8
  ; bytes apart (presumably what "Not pretty" in the PR text refers to -- confirm).
  call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> <i64 4607182418800017408, i64 4607182418800017408, i64 4607182418800017408, i64 4607182418800017408, i64 4607182418800017408, i64 4607182418800017408, i64 4607182418800017408, i64 4607182418800017408>, <8 x ptr> %4, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  %index.next = add nuw i64 %index, 8
  %vec.ind.next = add <8 x i64> %vec.ind, <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>
  %5 = icmp eq i64 %index.next, %n.vec
  br i1 %5, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  ; NOTE(review): if the bound was an exact multiple of 8 there is no remainder to run.
  %cmp.n = icmp eq i64 %.unbox, %n.vec
  br i1 %cmp.n, label %L48, label %scalar.ph

scalar.ph:                                        ; preds = %middle.block, %L13.preheader15
  ; NOTE(review): the scalar loop starts at 1 when the vector loop was bypassed, or at
  ; %ind.end when it ran.
  %bc.resume.val = phi i64 [ %ind.end, %middle.block ], [ 1, %L13.preheader15 ]
  br label %L30

L30:                                              ; preds = %L30, %scalar.ph
  ; NOTE(review): scalar remainder loop; one 8-byte store per iteration at
  ; base + (counter << 3) - 8.
  %value_phi3 = phi i64 [ %7, %L30 ], [ %bc.resume.val, %scalar.ph ]
  %memoryref_offset = shl i64 %value_phi3, 3
  %6 = getelementptr i8, ptr %memoryref_data.pre.pre, i64 %memoryref_offset
  %memoryref_data6 = getelementptr i8, ptr %6, i64 -8
  store i64 4607182418800017408, ptr %memoryref_data6, align 8
  %7 = add nuw i64 %value_phi3, 1
  %8 = icmp ult i64 %value_phi3, %.unbox
  br i1 %8, label %L30, label %L48

L48:                                              ; preds = %L30, %middle.block, %top
  ret void
}

Not pretty, but at least the intrinsic no longer blocks vectorization.

Running with JULIA_LLVM_ARGS="--debug-only=vfabi-demangler" prints the vector-variant mappings that get registered for the call:

VFABI: Adding mapping '_ZGV_LLVM_N2vv_julia.gc_loaded(julia.gc_loaded.v2)' for   %3 = call ptr addrspace(13) @julia.gc_loaded(ptr addrspace(10) %memoryref_mem, ptr %memoryref_data), !dbg !80
VFABI: Adding mapping '_ZGV_LLVM_N4vv_julia.gc_loaded(julia.gc_loaded.v4)' for   %3 = call ptr addrspace(13) @julia.gc_loaded(ptr addrspace(10) %memoryref_mem, ptr %memoryref_data), !dbg !80
VFABI: Adding mapping '_ZGV_LLVM_N8vv_julia.gc_loaded(julia.gc_loaded.v8)' for   %3 = call ptr addrspace(13) @julia.gc_loaded(ptr addrspace(10) %memoryref_mem, ptr %memoryref_data), !dbg !80
VFABI: Adding mapping '_ZGV_LLVM_N16vv_julia.gc_loaded(julia.gc_loaded.v16)' for   %3 = call ptr addrspace(13) @julia.gc_loaded(ptr addrspace(10) %memoryref_mem, ptr %memoryref_data), !dbg !80

and with JULIA_LLVM_ARGS="--print-after=loop-vectorize --pass-remarks-analysis=loop-vectorize"

remark: abstractarray.jl:699:0: the cost-model indicates that interleaving is not beneficial
; *** IR Dump After LoopVectorizePass on julia_#5_1744 ***
; NOTE(review): annotations marked "NOTE(review)" were added for readers and are not
; compiler output. This is the IR right after LoopVectorizePass: the scalar loop (%L30)
; still calls @julia.gc_loaded, and the new vector loop (%vector.body) calls the 8-lane
; variant @julia.gc_loaded.v8 instead.
define void @"julia_#5_1744"(ptr addrspace(11) nocapture noundef nonnull readonly align 8 dereferenceable(16) %"v::FixedSizeArray", ptr nocapture readonly %.roots.v) #0 !dbg !5 {
top:
  %pgcstack = call ptr @julia.get_pgcstack()
  %memoryref_mem = load ptr addrspace(10), ptr %.roots.v, align 8, !tbaa !24, !alias.scope !28, !noalias !31
  call void @llvm.dbg.declare(metadata ptr addrspace(11) %"v::FixedSizeArray", metadata !23, metadata !DIExpression()), !dbg !36
  ; NOTE(review): %.unbox (i64 at byte offset 8 of the argument) is the loop bound;
  ; a bound below 1 skips all loops.
  %0 = getelementptr inbounds i8, ptr addrspace(11) %"v::FixedSizeArray", i64 8, !dbg !37
  %.unbox = load i64, ptr addrspace(11) %0, align 8, !dbg !51, !tbaa !62, !invariant.load !10, !alias.scope !64, !noalias !65
  %1 = icmp slt i64 %.unbox, 1, !dbg !51
  br i1 %1, label %L48, label %L13.preheader15, !dbg !36

L13.preheader15:                                  ; preds = %top
  %2 = addrspacecast ptr addrspace(10) %memoryref_mem to ptr addrspace(11)
  %memory_data_ptr = getelementptr inbounds { i64, ptr }, ptr addrspace(11) %2, i64 0, i32 1
  ; NOTE(review): with fewer than 8 iterations the vector loop is bypassed.
  %min.iters.check = icmp ult i64 %.unbox, 8, !dbg !66
  br i1 %min.iters.check, label %scalar.ph, label %vector.ph, !dbg !66

vector.ph:                                        ; preds = %L13.preheader15
  ; NOTE(review): %n.vec is the bound rounded down to a multiple of 8; %ind.end is the
  ; counter value at which the scalar loop resumes.
  %n.mod.vf = urem i64 %.unbox, 8, !dbg !66
  %n.vec = sub i64 %.unbox, %n.mod.vf, !dbg !66
  %ind.end = add i64 1, %n.vec, !dbg !66
  ; NOTE(review): the addrspace(10) object pointer is broadcast to all 8 lanes so it can
  ; be passed as the first argument of the vector gc_loaded call.
  %broadcast.splatinsert = insertelement <8 x ptr addrspace(10)> poison, ptr addrspace(10) %memoryref_mem, i64 0, !dbg !66
  %broadcast.splat = shufflevector <8 x ptr addrspace(10)> %broadcast.splatinsert, <8 x ptr addrspace(10)> poison, <8 x i32> zeroinitializer, !dbg !66
  br label %vector.body, !dbg !66

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.ind = phi <8 x i64> [ <i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8>, %vector.ph ], [ %vec.ind.next, %vector.body ]
  ; NOTE(review): at this stage the data-pointer load (tagged !invariant.load) is still
  ; inside the loop body; it is then broadcast to all lanes.
  %3 = load ptr, ptr addrspace(11) %memory_data_ptr, align 8, !dbg !71, !tbaa !62, !invariant.load !10, !alias.scope !64, !noalias !65, !nonnull !10
  %broadcast.splatinsert20 = insertelement <8 x ptr> poison, ptr %3, i64 0, !dbg !74
  %broadcast.splat21 = shufflevector <8 x ptr> %broadcast.splatinsert20, <8 x ptr> poison, <8 x i32> zeroinitializer, !dbg !74
  %4 = shl <8 x i64> %vec.ind, <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>, !dbg !74
  ; NOTE(review): vector counterpart of the scalar @julia.gc_loaded call in %L30; it
  ; takes and returns 8-lane pointer vectors.
  %5 = call <8 x ptr addrspace(13)> @julia.gc_loaded.v8(<8 x ptr addrspace(10)> %broadcast.splat, <8 x ptr> %broadcast.splat21), !dbg !66
  ; NOTE(review): per-lane address is gc_loaded result + (counter << 3) - 8.
  %6 = getelementptr i8, <8 x ptr addrspace(13)> %5, <8 x i64> %4, !dbg !66
  %7 = getelementptr i8, <8 x ptr addrspace(13)> %6, i64 -8, !dbg !66
  ; NOTE(review): all-true-mask scatter of 4607182418800017408 (0x3FF0000000000000, the
  ; bit pattern of Float64 1.0) to the eight addrspace(13) addresses.
  call void @llvm.masked.scatter.v8i64.v8p13(<8 x i64> <i64 4607182418800017408, i64 4607182418800017408, i64 4607182418800017408, i64 4607182418800017408, i64 4607182418800017408, i64 4607182418800017408, i64 4607182418800017408, i64 4607182418800017408>, <8 x ptr addrspace(13)> %7, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>), !dbg !80, !tbaa !81, !alias.scope !83, !noalias !84
  %index.next = add nuw i64 %index, 8
  %vec.ind.next = add <8 x i64> %vec.ind, <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>
  %8 = icmp eq i64 %index.next, %n.vec
  br i1 %8, label %middle.block, label %vector.body, !llvm.loop !85

middle.block:                                     ; preds = %vector.body
  ; NOTE(review): if the bound was an exact multiple of 8 there is no remainder to run.
  %cmp.n = icmp eq i64 %.unbox, %n.vec, !dbg !88
  br i1 %cmp.n, label %L48.loopexit, label %scalar.ph, !dbg !88

scalar.ph:                                        ; preds = %L13.preheader15, %middle.block
  ; NOTE(review): the scalar loop starts at 1 when the vector loop was bypassed, or at
  ; %ind.end when it ran.
  %bc.resume.val = phi i64 [ %ind.end, %middle.block ], [ 1, %L13.preheader15 ]
  br label %L30, !dbg !66

L30:                                              ; preds = %L30, %scalar.ph
  ; NOTE(review): original scalar loop, now the remainder loop. It reloads the data
  ; pointer and calls the scalar @julia.gc_loaded on every iteration.
  %value_phi3 = phi i64 [ %11, %L30 ], [ %bc.resume.val, %scalar.ph ]
  %memoryref_data = load ptr, ptr addrspace(11) %memory_data_ptr, align 8, !dbg !71, !tbaa !62, !invariant.load !10, !alias.scope !64, !noalias !65, !nonnull !10
  %memoryref_offset = shl i64 %value_phi3, 3, !dbg !74
  %9 = call ptr addrspace(13) @julia.gc_loaded(ptr addrspace(10) %memoryref_mem, ptr %memoryref_data), !dbg !80
  %10 = getelementptr i8, ptr addrspace(13) %9, i64 %memoryref_offset, !dbg !80
  %memoryref_data6 = getelementptr i8, ptr addrspace(13) %10, i64 -8, !dbg !80
  store i64 4607182418800017408, ptr addrspace(13) %memoryref_data6, align 8, !dbg !80, !tbaa !81, !alias.scope !83, !noalias !84
  %11 = add nuw i64 %value_phi3, 1, !dbg !89
  %12 = icmp ult i64 %value_phi3, %.unbox, !dbg !88
  br i1 %12, label %L30, label %L48.loopexit, !dbg !88, !llvm.loop !90

L48.loopexit:                                     ; preds = %middle.block, %L30
  br label %L48, !dbg !88

L48:                                              ; preds = %L48.loopexit, %top
  ret void, !dbg !88
}

src/codegen.cpp Outdated Show resolved Hide resolved
@vchuravy vchuravy force-pushed the vc/vectorize_intrinsics branch from 4391248 to 8ad3fc9 Compare October 17, 2024 07:18
@vchuravy vchuravy force-pushed the vc/vectorize_intrinsics branch from 8ad3fc9 to 8ad66e5 Compare October 17, 2024 07:19
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

Successfully merging this pull request may close these issues.

2 participants