Skip to content

Commit

Permalink
[WebAssembly] Restore defaults for stores per memop
Browse files Browse the repository at this point in the history
Summary:
Large slowdowns were observed in Rust due to many small, constant
sized copies in conjunction with poorly-optimized memory.copy
implementations. Since memory.copy cannot be expected to be inlined
efficiently by engines at this time, stop using it for the smallest
copies. We continue to lower all memcpy intrinsics to memory.copy,
though.

Reviewers: aheejin, alexcrichton

Subscribers: dschuff, sbc100, jgravelle-google, hiraditya, JDevlieghere, sunfish, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D67639

llvm-svn: 372275
  • Loading branch information
tlively authored and alexcrichton committed Oct 2, 2019
1 parent 8adf9bd commit 94b07ab
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 30 deletions.
10 changes: 0 additions & 10 deletions llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -259,16 +259,6 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(

setMaxAtomicSizeInBitsSupported(64);

if (Subtarget->hasBulkMemory()) {
// Use memory.copy and friends over multiple loads and stores
MaxStoresPerMemcpy = 1;
MaxStoresPerMemcpyOptSize = 1;
MaxStoresPerMemmove = 1;
MaxStoresPerMemmoveOptSize = 1;
MaxStoresPerMemset = 1;
MaxStoresPerMemsetOptSize = 1;
}

// Override the __gnu_f2h_ieee/__gnu_h2f_ieee names so that the f32 name is
// consistent with the f64 and f128 names.
setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2");
Expand Down
40 changes: 20 additions & 20 deletions llvm/test/CodeGen/WebAssembly/bulk-memory.ll
Original file line number Diff line number Diff line change
Expand Up @@ -142,8 +142,8 @@ define void @memset_1024(i8* %dest, i8 %val) {
}

; The following tests check that frame index elimination works for
; bulk memory instructions. The stack pointer is bumped by 16 instead
; of 10 because the stack pointer in WebAssembly is currently always
; bulk memory instructions. The stack pointer is bumped by 112 instead
; of 100 because the stack pointer in WebAssembly is currently always
; 16-byte aligned, even in leaf functions, although it is not written
; back to the global in this case.

Expand All @@ -156,52 +156,52 @@ define void @memset_1024(i8* %dest, i8 %val) {
; NO-BULK-MEM-NOT: memory.copy
; BULK-MEM-NEXT: .functype memcpy_alloca_src (i32) -> ()
; BULK-MEM-NEXT: global.get $push[[L0:[0-9]+]]=, __stack_pointer
; BULK-MEM-NEXT: i32.const $push[[L1:[0-9]+]]=, 16
; BULK-MEM-NEXT: i32.const $push[[L1:[0-9]+]]=, 112
; BULK-MEM-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]
; BULK-MEM-NEXT: i32.const $push[[L3:[0-9]+]]=, 6
; BULK-MEM-NEXT: i32.const $push[[L3:[0-9]+]]=, 12
; BULK-MEM-NEXT: i32.add $push[[L4:[0-9]+]]=, $pop[[L2]], $pop[[L3]]
; BULK-MEM-NEXT: i32.const $push[[L5:[0-9]+]]=, 10
; BULK-MEM-NEXT: i32.const $push[[L5:[0-9]+]]=, 100
; BULK-MEM-NEXT: memory.copy 0, 0, $0, $pop[[L4]], $pop[[L5]]
; BULK-MEM-NEXT: return
define void @memcpy_alloca_src(i8* %dst) {
%a = alloca [10 x i8]
%p = bitcast [10 x i8]* %a to i8*
call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %p, i32 10, i1 false)
%a = alloca [100 x i8]
%p = bitcast [100 x i8]* %a to i8*
call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %p, i32 100, i1 false)
ret void
}

; CHECK-LABEL: memcpy_alloca_dst:
; NO-BULK-MEM-NOT: memory.copy
; BULK-MEM-NEXT: .functype memcpy_alloca_dst (i32) -> ()
; BULK-MEM-NEXT: global.get $push[[L0:[0-9]+]]=, __stack_pointer
; BULK-MEM-NEXT: i32.const $push[[L1:[0-9]+]]=, 16
; BULK-MEM-NEXT: i32.const $push[[L1:[0-9]+]]=, 112
; BULK-MEM-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]
; BULK-MEM-NEXT: i32.const $push[[L3:[0-9]+]]=, 6
; BULK-MEM-NEXT: i32.const $push[[L3:[0-9]+]]=, 12
; BULK-MEM-NEXT: i32.add $push[[L4:[0-9]+]]=, $pop[[L2]], $pop[[L3]]
; BULK-MEM-NEXT: i32.const $push[[L5:[0-9]+]]=, 10
; BULK-MEM-NEXT: i32.const $push[[L5:[0-9]+]]=, 100
; BULK-MEM-NEXT: memory.copy 0, 0, $pop[[L4]], $0, $pop[[L5]]
; BULK-MEM-NEXT: return
define void @memcpy_alloca_dst(i8* %src) {
%a = alloca [10 x i8]
%p = bitcast [10 x i8]* %a to i8*
call void @llvm.memcpy.p0i8.p0i8.i32(i8* %p, i8* %src, i32 10, i1 false)
%a = alloca [100 x i8]
%p = bitcast [100 x i8]* %a to i8*
call void @llvm.memcpy.p0i8.p0i8.i32(i8* %p, i8* %src, i32 100, i1 false)
ret void
}

; CHECK-LABEL: memset_alloca:
; NO-BULK-MEM-NOT: memory.fill
; BULK-MEM-NEXT: .functype memset_alloca (i32) -> ()
; BULK-MEM-NEXT: global.get $push[[L0:[0-9]+]]=, __stack_pointer
; BULK-MEM-NEXT: i32.const $push[[L1:[0-9]+]]=, 16
; BULK-MEM-NEXT: i32.const $push[[L1:[0-9]+]]=, 112
; BULK-MEM-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]
; BULK-MEM-NEXT: i32.const $push[[L3:[0-9]+]]=, 6
; BULK-MEM-NEXT: i32.const $push[[L3:[0-9]+]]=, 12
; BULK-MEM-NEXT: i32.add $push[[L4:[0-9]+]]=, $pop[[L2]], $pop[[L3]]
; BULK-MEM-NEXT: i32.const $push[[L5:[0-9]+]]=, 10
; BULK-MEM-NEXT: i32.const $push[[L5:[0-9]+]]=, 100
; BULK-MEM-NEXT: memory.fill 0, $pop[[L4]], $0, $pop[[L5]]
; BULK-MEM-NEXT: return
define void @memset_alloca(i8 %val) {
%a = alloca [10 x i8]
%p = bitcast [10 x i8]* %a to i8*
call void @llvm.memset.p0i8.i32(i8* %p, i8 %val, i32 10, i1 false)
%a = alloca [100 x i8]
%p = bitcast [100 x i8]* %a to i8*
call void @llvm.memset.p0i8.i32(i8* %p, i8 %val, i32 100, i1 false)
ret void
}

0 comments on commit 94b07ab

Please sign in to comment.