From b71a1b317e13ce45f9f3520a4b9cd4bc3355f879 Mon Sep 17 00:00:00 2001
From: Kuai Wei
Date: Mon, 29 Jan 2024 09:33:22 +0000
Subject: [PATCH] 8325821: [REDO] use "dmb.ishst+dmb.ishld" for release barrier

---
 src/hotspot/cpu/aarch64/aarch64.ad            |   7 +-
 src/hotspot/cpu/aarch64/globals_aarch64.hpp   |   2 +
 .../cpu/aarch64/macroAssembler_aarch64.cpp    |  31 ++-
 .../cpu/aarch64/macroAssembler_aarch64.hpp    |   1 +
 .../cpu/aarch64/vm_version_aarch64.cpp        |   5 +-
 src/hotspot/share/asm/codeBuffer.cpp          |  15 +-
 src/hotspot/share/asm/codeBuffer.hpp          |   8 +
 .../gtest/aarch64/test_assembler_aarch64.cpp  | 202 +++++++++++++++++-
 .../vm/compiler/FinalFieldInitialize.java     |  85 ++++++++
 9 files changed, 343 insertions(+), 13 deletions(-)
 create mode 100644 test/micro/org/openjdk/bench/vm/compiler/FinalFieldInitialize.java

diff --git a/src/hotspot/cpu/aarch64/aarch64.ad b/src/hotspot/cpu/aarch64/aarch64.ad
index a82a1b396e0a6..672397d6c6068 100644
--- a/src/hotspot/cpu/aarch64/aarch64.ad
+++ b/src/hotspot/cpu/aarch64/aarch64.ad
@@ -7780,7 +7780,7 @@ instruct membar_acquire() %{
   ins_cost(VOLATILE_REF_COST);

   format %{ "membar_acquire\n\t"
-            "dmb ish" %}
+            "dmb ishld" %}

   ins_encode %{
     __ block_comment("membar_acquire");
@@ -7834,11 +7834,12 @@ instruct membar_release() %{
   ins_cost(VOLATILE_REF_COST);

   format %{ "membar_release\n\t"
-            "dmb ish" %}
+            "dmb ishst\n\tdmb ishld" %}

   ins_encode %{
     __ block_comment("membar_release");
-    __ membar(Assembler::LoadStore|Assembler::StoreStore);
+    __ membar(Assembler::StoreStore);
+    __ membar(Assembler::LoadStore);
   %}
   ins_pipe(pipe_serial);
 %}

diff --git a/src/hotspot/cpu/aarch64/globals_aarch64.hpp b/src/hotspot/cpu/aarch64/globals_aarch64.hpp
index 2f83838fc0fd1..f5be35abadfaf 100644
--- a/src/hotspot/cpu/aarch64/globals_aarch64.hpp
+++ b/src/hotspot/cpu/aarch64/globals_aarch64.hpp
@@ -124,6 +124,8 @@ define_pd_global(intx, InlineSmallCode, 1000);
           range(1, 99)                                                  \
   product(ccstr, UseBranchProtection, "none",                           \
           "Branch Protection to use: none, standard, pac-ret")          \
+  product(bool, AlwaysMergeDMB, true, DIAGNOSTIC,                       \
+          "Always merge DMB instructions in code emission")             \

 // end of ARCH_FLAGS

diff --git a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp
index faba321afc7f3..bac5cdcf4c72a 100644
--- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp
+++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp
@@ -2350,14 +2350,31 @@ void MacroAssembler::membar(Membar_mask_bits order_constraint) {
   address last = code()->last_insn();
   if (last != nullptr && nativeInstruction_at(last)->is_Membar() && prev == last) {
     NativeMembar *bar = NativeMembar_at(prev);
-    // We are merging two memory barrier instructions. On AArch64 we
-    // can do this simply by ORing them together.
-    bar->set_kind(bar->get_kind() | order_constraint);
-    BLOCK_COMMENT("merged membar");
-  } else {
-    code()->set_last_insn(pc());
-    dmb(Assembler::barrier(order_constraint));
+    // Don't promote DMB ST|DMB LD to DMB (a full barrier) because
+    // doing so would introduce a StoreLoad which the caller did not
+    // intend.
+    if (AlwaysMergeDMB || bar->get_kind() == order_constraint
+        || bar->get_kind() == AnyAny
+        || order_constraint == AnyAny) {
+      // We are merging two memory barrier instructions. On AArch64 we
+      // can do this simply by ORing them together.
+      bar->set_kind(bar->get_kind() | order_constraint);
+      BLOCK_COMMENT("merged membar");
+      return;
+    } else if (!AlwaysMergeDMB) {
+      // A special case such as "DMB ST; DMB LD; DMB ST": the last DMB can be
+      // skipped, but we need to check the last two instructions first.
+      address prev2 = prev - NativeMembar::instruction_size;
+      if (last != code()->last_label() && nativeInstruction_at(prev2)->is_Membar()) {
+        NativeMembar *bar2 = NativeMembar_at(prev2);
+        assert(bar2->get_kind() == order_constraint, "it should have been merged before");
+        BLOCK_COMMENT("merged membar");
+        return;
+      }
+    }
   }
+  code()->set_last_insn(pc());
+  dmb(Assembler::barrier(order_constraint));
 }

 bool MacroAssembler::try_merge_ldst(Register rt, const Address &adr, size_t size_in_bytes, bool is_store) {

diff --git a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp
index 67fd7236842a7..62658d7b21469 100644
--- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp
+++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp
@@ -150,6 +150,7 @@ class MacroAssembler: public Assembler {
   void bind(Label& L) {
     Assembler::bind(L);
     code()->clear_last_insn();
+    code()->set_last_label(pc());
   }

   void membar(Membar_mask_bits order_constraint);

diff --git a/src/hotspot/cpu/aarch64/vm_version_aarch64.cpp b/src/hotspot/cpu/aarch64/vm_version_aarch64.cpp
index 18f310c746cd4..aa64f411dbfb9 100644
--- a/src/hotspot/cpu/aarch64/vm_version_aarch64.cpp
+++ b/src/hotspot/cpu/aarch64/vm_version_aarch64.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2023, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2024, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2015, 2020, Red Hat Inc. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
@@ -230,6 +230,9 @@ void VM_Version::initialize() {
     if (FLAG_IS_DEFAULT(OnSpinWaitInstCount)) {
       FLAG_SET_DEFAULT(OnSpinWaitInstCount, 1);
     }
+    if (FLAG_IS_DEFAULT(AlwaysMergeDMB)) {
+      FLAG_SET_DEFAULT(AlwaysMergeDMB, false);
+    }
   }

   if (_cpu == CPU_ARM) {

diff --git a/src/hotspot/share/asm/codeBuffer.cpp b/src/hotspot/share/asm/codeBuffer.cpp
index e818853870e55..10b7879e392cc 100644
--- a/src/hotspot/share/asm/codeBuffer.cpp
+++ b/src/hotspot/share/asm/codeBuffer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2023, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2024, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -930,6 +930,10 @@ void CodeBuffer::expand(CodeSection* which_cs, csize_t amount) {
   // Move all the code and relocations to the new blob:
   relocate_code_to(&cb);

+  // Some internal addresses (_last_insn, _last_label) are used during code
+  // emission; adjust them during expansion.
+  adjust_internal_address(insts_begin(), cb.insts_begin());
+
   // Copy the temporary code buffer into the current code buffer.
   // Basically, do {*this = cb}, except for some control information.
   this->take_over_code_from(&cb);
@@ -951,6 +955,15 @@ void CodeBuffer::expand(CodeSection* which_cs, csize_t amount) {
 #endif //PRODUCT
 }

+void CodeBuffer::adjust_internal_address(address from, address to) {
+  if (_last_insn != nullptr) {
+    _last_insn += to - from;
+  }
+  if (_last_label != nullptr) {
+    _last_label += to - from;
+  }
+}
+
 void CodeBuffer::take_over_code_from(CodeBuffer* cb) {
   // Must already have disposed of the old blob somehow.
   assert(blob() == nullptr, "must be empty");

diff --git a/src/hotspot/share/asm/codeBuffer.hpp b/src/hotspot/share/asm/codeBuffer.hpp
index 5cff78a96cc83..f7fdfb71bcf2a 100644
--- a/src/hotspot/share/asm/codeBuffer.hpp
+++ b/src/hotspot/share/asm/codeBuffer.hpp
@@ -436,6 +436,7 @@ class CodeBuffer: public StackObj DEBUG_ONLY(COMMA private Scrubber) {
   Arena* _overflow_arena;

   address _last_insn;   // used to merge consecutive memory barriers, loads or stores.
+  address _last_label;  // address of the last bound label; it is also the start of the current basic block

   SharedStubToInterpRequests* _shared_stub_to_interp_requests; // used to collect requests for shared interpreter stubs
   SharedTrampolineRequests*   _shared_trampoline_requests;     // used to collect requests for shared trampolines
@@ -460,6 +461,7 @@ class CodeBuffer: public StackObj DEBUG_ONLY(COMMA private Scrubber) {
     _oop_recorder    = nullptr;
     _overflow_arena  = nullptr;
     _last_insn       = nullptr;
+    _last_label      = nullptr;
     _main_code_size  = 0;
     _finalize_stubs  = false;
     _shared_stub_to_interp_requests = nullptr;
@@ -514,6 +516,9 @@ class CodeBuffer: public StackObj DEBUG_ONLY(COMMA private Scrubber) {
   // moves code sections to new buffer (assumes relocs are already in there)
   void relocate_code_to(CodeBuffer* cb) const;

+  // adjust internal addresses (_last_insn, _last_label) during expansion
+  void adjust_internal_address(address from, address to);
+
   // set up a model of the final layout of my contents
   void compute_final_layout(CodeBuffer* dest) const;

@@ -686,6 +691,9 @@ class CodeBuffer: public StackObj DEBUG_ONLY(COMMA private Scrubber) {
   void set_last_insn(address a)  { _last_insn = a; }
   void clear_last_insn()         { set_last_insn(nullptr); }

+  address last_label() const     { return _last_label; }
+  void set_last_label(address a) { _last_label = a; }
+
 #ifndef PRODUCT
   AsmRemarks &asm_remarks() { return _asm_remarks; }
   DbgStrings &dbg_strings() { return _dbg_strings; }

diff --git a/test/hotspot/gtest/aarch64/test_assembler_aarch64.cpp b/test/hotspot/gtest/aarch64/test_assembler_aarch64.cpp
index 1d75685475527..1c408f8b5696f 100644
--- a/test/hotspot/gtest/aarch64/test_assembler_aarch64.cpp
+++ b/test/hotspot/gtest/aarch64/test_assembler_aarch64.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2020, Red Hat Inc. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
@@ -28,6 +28,7 @@

 #include "asm/assembler.hpp"
 #include "asm/assembler.inline.hpp"
+#include "asm/macroAssembler.hpp"
 #include "compiler/disassembler.hpp"
 #include "memory/resourceArea.hpp"
 #include "unittest.hpp"
@@ -81,4 +82,203 @@ TEST_VM(AssemblerAArch64, validate) {
   BufferBlob::free(b);
 }

+static void asm_dump(address start, address end) {
+  ResourceMark rm;
+  stringStream ss;
+  ss.print_cr("Insns:");
+  Disassembler::decode(start, end, &ss);
+  printf("%s\n", ss.as_string());
+}
+
+TEST_VM(AssemblerAArch64, merge_dmb) {
+  BufferBlob* b = BufferBlob::create("aarch64Test", 400);
+  CodeBuffer code(b);
+  MacroAssembler _masm(&code);
+
+  {
+    // merge barriers of the same type
+    __ membar(Assembler::Membar_mask_bits::StoreStore);
+    __ membar(Assembler::Membar_mask_bits::StoreStore);
+    __ membar(Assembler::Membar_mask_bits::StoreStore);
+    __ nop();
+    __ membar(Assembler::Membar_mask_bits::LoadStore);
+    __ membar(Assembler::Membar_mask_bits::LoadStore);
+    __ membar(Assembler::Membar_mask_bits::LoadStore);
+    __ membar(Assembler::Membar_mask_bits::LoadStore);
+    __ nop();
+    // merge with a higher-rank barrier (AnyAny)
+    __ membar(Assembler::Membar_mask_bits::LoadStore);
+    __ membar(Assembler::Membar_mask_bits::LoadStore);
+    __ membar(Assembler::Membar_mask_bits::AnyAny);
+    __ membar(Assembler::Membar_mask_bits::StoreStore);
+    __ membar(Assembler::Membar_mask_bits::StoreStore);
+    __ nop();
+    // merge barriers of different types
+    __ membar(Assembler::Membar_mask_bits::LoadStore);
+    __ membar(Assembler::Membar_mask_bits::StoreStore);
+    __ membar(Assembler::Membar_mask_bits::LoadStore);
+    __ membar(Assembler::Membar_mask_bits::StoreStore);
+  }
+  asm_dump(code.insts()->start(), code.insts()->end());
+  // expected output with AlwaysMergeDMB
+  static const unsigned int insns1[] = {
+    0xd5033abf, // dmb.ishst
+    0xd503201f, // nop
+    0xd50339bf, // dmb.ishld
+    0xd503201f, // nop
+    0xd5033bbf, // dmb.ish
+    0xd503201f, // nop
+    0xd5033bbf, // dmb.ish
+  };
+  // expected output with !AlwaysMergeDMB
+  static const unsigned int insns2[] = {
+    0xd5033abf, // dmb.ishst
+    0xd503201f, // nop
+    0xd50339bf, // dmb.ishld
+    0xd503201f, // nop
+    0xd5033bbf, // dmb.ish
+    0xd503201f, // nop
+    0xd50339bf, // dmb.ishld
+    0xd5033abf, // dmb.ishst
+  };
+  if (AlwaysMergeDMB) {
+    EXPECT_EQ(code.insts()->size(), (CodeSection::csize_t)(sizeof insns1));
+    asm_check((const unsigned int *)code.insts()->start(), insns1, sizeof insns1 / sizeof insns1[0]);
+  } else {
+    EXPECT_EQ(code.insts()->size(), (CodeSection::csize_t)(sizeof insns2));
+    asm_check((const unsigned int *)code.insts()->start(), insns2, sizeof insns2 / sizeof insns2[0]);
+  }
+
+  BufferBlob::free(b);
+}
+
+TEST_VM(AssemblerAArch64, merge_dmb_block_by_label) {
+  BufferBlob* b = BufferBlob::create("aarch64Test", 400);
+  CodeBuffer code(b);
+  MacroAssembler _masm(&code);
+
+  {
+    Label l;
+    // merging cannot cross a bound label
+    __ membar(Assembler::Membar_mask_bits::StoreStore);
+    __ bind(l);
+    __ membar(Assembler::Membar_mask_bits::StoreStore);
+  }
+  asm_dump(code.insts()->start(), code.insts()->end());
+  static const unsigned int insns[] = {
+    0xd5033abf, // dmb.ishst
+    0xd5033abf, // dmb.ishst
+  };
+  EXPECT_EQ(code.insts()->size(), (CodeSection::csize_t)(sizeof insns));
+  asm_check((const unsigned int *)code.insts()->start(), insns, sizeof insns / sizeof insns[0]);
+
+  BufferBlob::free(b);
+}
+
+TEST_VM(AssemblerAArch64, merge_dmb_after_expand) {
+  ResourceMark rm;
+  BufferBlob* b = BufferBlob::create("aarch64Test", 400);
+  CodeBuffer code(b);
+  code.set_blob(b);
+  MacroAssembler _masm(&code);
+
+  {
+    __ membar(Assembler::Membar_mask_bits::StoreStore);
+    code.insts()->maybe_expand_to_ensure_remaining(50000);
+    __ membar(Assembler::Membar_mask_bits::StoreStore);
+  }
+  asm_dump(code.insts()->start(), code.insts()->end());
+  static const unsigned int insns[] = {
+    0xd5033abf, // dmb.ishst
+  };
+  EXPECT_EQ(code.insts()->size(), (CodeSection::csize_t)(sizeof insns));
+  asm_check((const unsigned int *)code.insts()->start(), insns, sizeof insns / sizeof insns[0]);
+}
+
+TEST_VM(AssemblerAArch64, merge_ldst) {
+  BufferBlob* b = BufferBlob::create("aarch64Test", 400);
+  CodeBuffer code(b);
+  MacroAssembler _masm(&code);
+
+  {
+    Label l;
+    // merge ld/st pairs into ldp/stp
+    __ ldr(r0, Address(sp, 8));
+    __ ldr(r1, Address(sp, 0));
+    __ nop();
+    __ str(r0, Address(sp, 0));
+    __ str(r1, Address(sp, 8));
+    __ nop();
+    __ ldrw(r0, Address(sp, 0));
+    __ ldrw(r1, Address(sp, 4));
+    __ nop();
+    __ strw(r0, Address(sp, 4));
+    __ strw(r1, Address(sp, 0));
+    __ nop();
+    // cannot merge
+    __ ldrw(r0, Address(sp, 4));
+    __ ldr(r1, Address(sp, 8));
+    __ nop();
+    __ ldrw(r0, Address(sp, 0));
+    __ ldrw(r1, Address(sp, 8));
+    __ nop();
+    __ str(r0, Address(sp, 0));
+    __ bind(l);  // blocked by the label
+    __ str(r1, Address(sp, 8));
+    __ nop();
+  }
+  asm_dump(code.insts()->start(), code.insts()->end());
+  static const unsigned int insns1[] = {
+    0xa94003e1, // ldp x1, x0, [sp]
+    0xd503201f, // nop
+    0xa90007e0, // stp x0, x1, [sp]
+    0xd503201f, // nop
+    0x294007e0, // ldp w0, w1, [sp]
+    0xd503201f, // nop
+    0x290003e1, // stp w1, w0, [sp]
+    0xd503201f, // nop
+    0xb94007e0, // ldr w0, [sp, 4]
+    0xf94007e1, // ldr x1, [sp, 8]
+    0xd503201f, // nop
+    0xb94003e0, // ldr w0, [sp]
+    0xb9400be1, // ldr w1, [sp, 8]
+    0xd503201f, // nop
+    0xf90003e0, // str x0, [sp]
+    0xf90007e1, // str x1, [sp, 8]
+    0xd503201f, // nop
+  };
+  EXPECT_EQ(code.insts()->size(), (CodeSection::csize_t)(sizeof insns1));
+  asm_check((const unsigned int *)code.insts()->start(), insns1, sizeof insns1 / sizeof insns1[0]);
+
+  BufferBlob::free(b);
+}
+
+TEST_VM(AssemblerAArch64, merge_ldst_after_expand) {
+  ResourceMark rm;
+  BufferBlob* b = BufferBlob::create("aarch64Test", 400);
+  CodeBuffer code(b);
+  code.set_blob(b);
+  MacroAssembler _masm(&code);
+
+  {
+    __ ldr(r0, Address(sp, 8));
+    code.insts()->maybe_expand_to_ensure_remaining(10000);
+    __ ldr(r1, Address(sp, 0));
+    __ nop();
+    __ str(r0, Address(sp, 0));
+    code.insts()->maybe_expand_to_ensure_remaining(100000);
+    __ str(r1, Address(sp, 8));
+    __ nop();
+  }
+  asm_dump(code.insts()->start(), code.insts()->end());
+  static const unsigned int insns[] = {
+    0xa94003e1, // ldp x1, x0, [sp]
+    0xd503201f, // nop
+    0xa90007e0, // stp x0, x1, [sp]
+    0xd503201f, // nop
+  };
+  EXPECT_EQ(code.insts()->size(), (CodeSection::csize_t)(sizeof insns));
+  asm_check((const unsigned int *)code.insts()->start(), insns, sizeof insns / sizeof insns[0]);
+}
+
 #endif // AARCH64

diff --git a/test/micro/org/openjdk/bench/vm/compiler/FinalFieldInitialize.java b/test/micro/org/openjdk/bench/vm/compiler/FinalFieldInitialize.java
new file mode 100644
index 0000000000000..ee0779faecf2a
--- /dev/null
+++ b/test/micro/org/openjdk/bench/vm/compiler/FinalFieldInitialize.java
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2024, Alibaba Group Co., Ltd. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+package org.openjdk.bench.vm.compiler;
+
+import org.openjdk.jmh.annotations.*;
+import org.openjdk.jmh.infra.Blackhole;
+
+import java.util.concurrent.TimeUnit;
+
+/* Test allocation speed of objects with and without a final field. */
+@BenchmarkMode(Mode.Throughput)
+@OutputTimeUnit(TimeUnit.SECONDS)
+@State(Scope.Benchmark)
+@Warmup(iterations = 5, time = 3, timeUnit = TimeUnit.SECONDS)
+@Measurement(iterations = 3, time = 3, timeUnit = TimeUnit.SECONDS)
+@Fork(value = 3)
+public class FinalFieldInitialize {
+  final static int LEN = 100_000;
+  Object[] arr = null;
+
+  @Setup
+  public void setup() {
+    arr = new Object[LEN];
+  }
+
+  @Benchmark
+  public void testAlloc(Blackhole bh) {
+    for (int i = 0; i < LEN; i++) {
+      arr[i] = new TObj();
+    }
+    bh.consume(arr);
+  }
+
+  @Benchmark
+  public void testAllocWithFinal(Blackhole bh) {
+    for (int i = 0; i < LEN; i++) {
+      arr[i] = new TObjWithFinal();
+    }
+    bh.consume(arr);
+  }
+}
+
+class TObj {
+  private int i;
+
+  public TObj() {
+    i = 10;
+  }
+}
+
+class TObjWithFinal {
+  private final int i;
+
+  public TObjWithFinal() {
+    i = 10;
+  }
+}