diff --git a/src/hotspot/cpu/riscv/assembler_riscv.hpp b/src/hotspot/cpu/riscv/assembler_riscv.hpp index 98ab86bf72e..d1021d9e283 100644 --- a/src/hotspot/cpu/riscv/assembler_riscv.hpp +++ b/src/hotspot/cpu/riscv/assembler_riscv.hpp @@ -705,6 +705,16 @@ public: emit(insn); } + void fencei() { + unsigned insn = 0; + patch((address)&insn, 6, 0, 0b0001111); // opcode + patch((address)&insn, 11, 7, 0b00000); // rd + patch((address)&insn, 14, 12, 0b001); // funct3 + patch((address)&insn, 19, 15, 0b00000); // rs1 + patch((address)&insn, 31, 20, 0b000000000000); // bits 31:20, zero + emit(insn); + } + #define INSN(NAME, op, funct3, funct7) \ void NAME() { \ unsigned insn = 0; \ diff --git a/src/hotspot/cpu/riscv/gc/z/zBarrierSetAssembler_riscv.cpp b/src/hotspot/cpu/riscv/gc/z/zBarrierSetAssembler_riscv.cpp index 8fbeaa45371..cbb918ade00 100644 --- a/src/hotspot/cpu/riscv/gc/z/zBarrierSetAssembler_riscv.cpp +++ b/src/hotspot/cpu/riscv/gc/z/zBarrierSetAssembler_riscv.cpp @@ -636,8 +636,20 @@ void ZBarrierSetAssembler::patch_barrier_relocation(address addr, int format) { ShouldNotReachHere(); } - // A full fence is generated before icache_flush by default in invalidate_word - ICache::invalidate_range(addr, bytes); + // If we are using UseCtxFencei no ICache invalidation is needed here. + // Instead every hart will perform a fence.i either by a Java thread + // (since the patching epoch will take it to the slow path), + // or by the kernel when a Java thread is moved to a hart. + // The instruction stream changes must only happen before the disarm of + // the nmethod barrier, where the disarm has a leading full two-way fence. + // If this is performed during a safepoint, all Java threads will emit a fence.i + // before transitioning to 'Java', e.g. leaving native or the safepoint wait barrier. + if (!UseCtxFencei) { + // ICache invalidation is a serialization point. + // The above patching of instructions happens before the invalidation. + // Hence it has a leading full two-way fence (wr, wr). 
+ ICache::invalidate_range(addr, bytes); + } } #ifdef COMPILER2 diff --git a/src/hotspot/cpu/riscv/globals_riscv.hpp b/src/hotspot/cpu/riscv/globals_riscv.hpp index c2585f2d161..dd31de14704 100644 --- a/src/hotspot/cpu/riscv/globals_riscv.hpp +++ b/src/hotspot/cpu/riscv/globals_riscv.hpp @@ -122,6 +122,8 @@ define_pd_global(intx, InlineSmallCode, 1000); product(bool, UseRVVForBigIntegerShiftIntrinsics, true, \ "Use RVV instructions for left/right shift of BigInteger") \ product(bool, UseTrampolines, false, EXPERIMENTAL, \ - "Far calls uses jal to trampoline.") + "Far calls uses jal to trampoline.") \ + product(bool, UseCtxFencei, false, EXPERIMENTAL, \ + "Use PR_RISCV_CTX_SW_FENCEI_ON to avoid explicit icache flush") #endif // CPU_RISCV_GLOBALS_RISCV_HPP diff --git a/src/hotspot/cpu/riscv/macroAssembler_riscv.cpp b/src/hotspot/cpu/riscv/macroAssembler_riscv.cpp index 42a18e9a753..3987812d58a 100644 --- a/src/hotspot/cpu/riscv/macroAssembler_riscv.cpp +++ b/src/hotspot/cpu/riscv/macroAssembler_riscv.cpp @@ -3159,6 +3159,13 @@ void MacroAssembler::membar(uint32_t order_constraint) { } } +void MacroAssembler::cmodx_fence() { + BLOCK_COMMENT("cmodx fence"); + if (VM_Version::supports_fencei_barrier()) { + Assembler::fencei(); + } +} + // Form an address from base + offset in Rd. Rd my or may not // actually be used: you must use the Address that is returned. 
It // is up to you to ensure that the shift provided matches the size diff --git a/src/hotspot/cpu/riscv/macroAssembler_riscv.hpp b/src/hotspot/cpu/riscv/macroAssembler_riscv.hpp index 43d9dc387ca..13df99085f6 100644 --- a/src/hotspot/cpu/riscv/macroAssembler_riscv.hpp +++ b/src/hotspot/cpu/riscv/macroAssembler_riscv.hpp @@ -431,6 +431,8 @@ class MacroAssembler: public Assembler { } } + void cmodx_fence(); + void pause() { Assembler::fence(w, 0); } diff --git a/src/hotspot/cpu/riscv/relocInfo_riscv.cpp b/src/hotspot/cpu/riscv/relocInfo_riscv.cpp index d0903c96e22..18b4302c7e6 100644 --- a/src/hotspot/cpu/riscv/relocInfo_riscv.cpp +++ b/src/hotspot/cpu/riscv/relocInfo_riscv.cpp @@ -55,7 +55,21 @@ void Relocation::pd_set_data_value(address x, bool verify_only) { bytes = MacroAssembler::pd_patch_instruction_size(addr(), x); break; } - ICache::invalidate_range(addr(), bytes); + + // If we are using UseCtxFencei no ICache invalidation is needed here. + // Instead every hart will perform a fence.i either by a Java thread + // (since the patching epoch will take it to the slow path), + // or by the kernel when a Java thread is moved to a hart. + // The instruction stream changes must only happen before the disarm of + // the nmethod barrier, where the disarm has a leading full two-way fence. + // If this is performed during a safepoint, all Java threads will emit a fence.i + // before transitioning to 'Java', e.g. leaving native or the safepoint wait barrier. + if (!UseCtxFencei) { + // ICache invalidation is a serialization point. + // The above patching of instructions happens before the invalidation. + // Hence it has a leading full two-way fence (wr, wr). 
+ ICache::invalidate_range(addr(), bytes); + } } address Relocation::pd_call_destination(address orig_addr) { diff --git a/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp b/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp index ee14d045407..d4ec76da943 100644 --- a/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp +++ b/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp @@ -2428,6 +2428,14 @@ class StubGenerator: public StubCodeGenerator { __ la(t1, ExternalAddress(bs_asm->patching_epoch_addr())); __ lwu(t1, t1); __ sw(t1, thread_epoch_addr); + // There are two ways this can work: + // - The writer did a system-wide icache shootdown after the instruction stream update. + // Hence do nothing. + // - The writer trusts us to make sure our icache is in sync before entering. + // Hence use cmodx fence (fence.i, may change). + if (UseCtxFencei) { + __ cmodx_fence(); + } __ membar(__ LoadLoad); } diff --git a/src/hotspot/cpu/riscv/vm_version_riscv.hpp b/src/hotspot/cpu/riscv/vm_version_riscv.hpp index bd4bfe86d9b..8fdde0094f4 100644 --- a/src/hotspot/cpu/riscv/vm_version_riscv.hpp +++ b/src/hotspot/cpu/riscv/vm_version_riscv.hpp @@ -285,6 +285,7 @@ class VM_Version : public Abstract_VM_Version { // RISCV64 supports fast class initialization checks static bool supports_fast_class_init_checks() { return true; } + static bool supports_fencei_barrier() { return ext_Zifencei.enabled(); } }; #endif // CPU_RISCV_VM_VERSION_RISCV_HPP diff --git a/src/hotspot/os_cpu/linux_riscv/orderAccess_linux_riscv.hpp b/src/hotspot/os_cpu/linux_riscv/orderAccess_linux_riscv.hpp index a7dc84770f8..368d6c971fa 100644 --- a/src/hotspot/os_cpu/linux_riscv/orderAccess_linux_riscv.hpp +++ b/src/hotspot/os_cpu/linux_riscv/orderAccess_linux_riscv.hpp @@ -54,6 +54,24 @@ inline void OrderAccess::fence() { } inline void OrderAccess::cross_modify_fence_impl() { + // From 3 “Zifencei” Instruction-Fetch Fence, Version 2.0 + // "RISC-V does not guarantee that stores to instruction memory will be made + // visible to instruction 
fetches on a RISC-V hart until that hart executes a + // FENCE.I instruction. A FENCE.I instruction ensures that a subsequent + // instruction fetch on a RISC-V hart will see any previous data stores + // already visible to the same RISC-V hart. FENCE.I does not ensure that other + // RISC-V harts’ instruction fetches will observe the local hart’s stores in a + // multiprocessor system." + // + // Hence to be able to use fence.i directly we need a kernel that supports + // PR_RISCV_CTX_SW_FENCEI_ON. Thus if we are context switched to another hart we + // are assured that instruction fetches will see any previous data stores. + // + // The alternative is using a full system IPI (system-wide icache sync), in which + // case this barrier is not strictly needed. As this is emitted in a runtime slow path + // we will just always emit it, typically after a safepoint. + guarantee(VM_Version::supports_fencei_barrier(), "Linux kernel require fence.i"); + __asm__ volatile("fence.i" : : : "memory"); } #endif // OS_CPU_LINUX_RISCV_ORDERACCESS_LINUX_RISCV_HPP diff --git a/src/hotspot/os_cpu/linux_riscv/vm_version_linux_riscv.cpp b/src/hotspot/os_cpu/linux_riscv/vm_version_linux_riscv.cpp index 3f9f26b525b..a3a226502f6 100644 --- a/src/hotspot/os_cpu/linux_riscv/vm_version_linux_riscv.cpp +++ b/src/hotspot/os_cpu/linux_riscv/vm_version_linux_riscv.cpp @@ -35,6 +35,7 @@ #include #include #include +#include #ifndef HWCAP_ISA_I #define HWCAP_ISA_I nth_bit('I' - 'A') @@ -82,6 +83,23 @@ __v; \ }) +// prctl PR_RISCV_SET_ICACHE_FLUSH_CTX is from Linux 6.9 +#ifndef PR_RISCV_SET_ICACHE_FLUSH_CTX +#define PR_RISCV_SET_ICACHE_FLUSH_CTX 71 +#endif +#ifndef PR_RISCV_CTX_SW_FENCEI_ON +#define PR_RISCV_CTX_SW_FENCEI_ON 0 +#endif +#ifndef PR_RISCV_CTX_SW_FENCEI_OFF +#define PR_RISCV_CTX_SW_FENCEI_OFF 1 +#endif +#ifndef PR_RISCV_SCOPE_PER_PROCESS +#define PR_RISCV_SCOPE_PER_PROCESS 0 +#endif +#ifndef PR_RISCV_SCOPE_PER_THREAD +#define PR_RISCV_SCOPE_PER_THREAD 1 +#endif + uint32_t VM_Version::cpu_vector_length() { 
assert(ext_V.enabled(), "should not call this"); return (uint32_t)read_csr(CSR_VLENB); @@ -102,6 +120,7 @@ void VM_Version::setup_cpu_available_features() { if (!RiscvHwprobe::probe_features()) { os_aux_features(); } + char* uarch = os_uarch_additional_features(); vendor_features(); @@ -155,6 +174,24 @@ void VM_Version::setup_cpu_available_features() { i++; } + // The Linux kernel requires Zifencei + if (!ext_Zifencei.enabled()) { + log_info(os, cpu)("Zifencei not found, required by Linux, enabling."); + ext_Zifencei.enable_feature(); + } + + if (UseCtxFencei) { + // Note that we can set this up only for affected threads + // via PR_RISCV_SCOPE_PER_THREAD, i.e. on VM attach/detach. + int ret = prctl(PR_RISCV_SET_ICACHE_FLUSH_CTX, PR_RISCV_CTX_SW_FENCEI_ON, PR_RISCV_SCOPE_PER_PROCESS); + if (ret == 0) { + log_debug(os, cpu)("UseCtxFencei (PR_RISCV_CTX_SW_FENCEI_ON) enabled."); + } else { + FLAG_SET_ERGO(UseCtxFencei, false); + log_info(os, cpu)("UseCtxFencei (PR_RISCV_CTX_SW_FENCEI_ON) disabled, unsupported by kernel."); + } + } + _features_string = os::strdup(buf); }