From a5765a916a03471cd771c870e1c0e5eab7a08bf1 Mon Sep 17 00:00:00 2001 From: Anjian Wen Date: Mon, 9 Feb 2026 11:50:40 +0000 Subject: [PATCH] 8377225: RISC-V: Improve receiver type profiling reliability Reviewed-by: shade, fjiang, fyang --- .../cpu/riscv/c1_LIRAssembler_riscv.cpp | 68 ++------ .../cpu/riscv/c1_LIRAssembler_riscv.hpp | 5 +- src/hotspot/cpu/riscv/interp_masm_riscv.cpp | 159 +----------------- src/hotspot/cpu/riscv/interp_masm_riscv.hpp | 11 +- .../cpu/riscv/macroAssembler_riscv.cpp | 154 +++++++++++++++++ .../cpu/riscv/macroAssembler_riscv.hpp | 2 + src/hotspot/cpu/riscv/templateTable_riscv.cpp | 4 +- 7 files changed, 176 insertions(+), 227 deletions(-) diff --git a/src/hotspot/cpu/riscv/c1_LIRAssembler_riscv.cpp b/src/hotspot/cpu/riscv/c1_LIRAssembler_riscv.cpp index e77a2067e89..63e2fd015d7 100644 --- a/src/hotspot/cpu/riscv/c1_LIRAssembler_riscv.cpp +++ b/src/hotspot/cpu/riscv/c1_LIRAssembler_riscv.cpp @@ -1041,31 +1041,10 @@ void LIR_Assembler::emit_alloc_array(LIR_OpAllocArray* op) { __ bind(*op->stub()->continuation()); } -void LIR_Assembler::type_profile_helper(Register mdo, ciMethodData *md, ciProfileData *data, - Register recv, Label* update_done) { - for (uint i = 0; i < ReceiverTypeData::row_limit(); i++) { - Label next_test; - // See if the receiver is receiver[n]. 
- __ ld(t1, Address(mdo, md->byte_offset_of_slot(data, ReceiverTypeData::receiver_offset(i)))); - __ bne(recv, t1, next_test); - Address data_addr(mdo, md->byte_offset_of_slot(data, ReceiverTypeData::receiver_count_offset(i))); - __ increment(data_addr, DataLayout::counter_increment); - __ j(*update_done); - __ bind(next_test); - } - - // Didn't find receiver; find next empty slot and fill it in - for (uint i = 0; i < ReceiverTypeData::row_limit(); i++) { - Label next_test; - Address recv_addr(mdo, md->byte_offset_of_slot(data, ReceiverTypeData::receiver_offset(i))); - __ ld(t1, recv_addr); - __ bnez(t1, next_test); - __ sd(recv, recv_addr); - __ mv(t1, DataLayout::counter_increment); - __ sd(t1, Address(mdo, md->byte_offset_of_slot(data, ReceiverTypeData::receiver_count_offset(i)))); - __ j(*update_done); - __ bind(next_test); - } +void LIR_Assembler::type_profile_helper(Register mdo, ciMethodData *md, + ciProfileData *data, Register recv) { + int mdp_offset = md->byte_offset_of_slot(data, in_ByteSize(0)); + __ profile_receiver_type(recv, mdo, mdp_offset); } void LIR_Assembler::data_check(LIR_OpTypeCheck *op, ciMethodData **md, ciProfileData **data) { @@ -1139,14 +1118,9 @@ void LIR_Assembler::profile_object(ciMethodData* md, ciProfileData* data, Regist __ j(*obj_is_null); __ bind(not_null); - Label update_done; Register recv = k_RInfo; __ load_klass(recv, obj); - type_profile_helper(mdo, md, data, recv, &update_done); - Address counter_addr(mdo, md->byte_offset_of_slot(data, CounterData::count_offset())); - __ increment(counter_addr, DataLayout::counter_increment); - - __ bind(update_done); + type_profile_helper(mdo, md, data, recv); } void LIR_Assembler::typecheck_loaded(LIR_OpTypeCheck *op, ciKlass* k, Register k_RInfo) { @@ -1554,11 +1528,8 @@ void LIR_Assembler::emit_profile_call(LIR_OpProfileCall* op) { // We know the type that will be seen at this call site; we can // statically update the MethodData* rather than needing to do // dynamic tests on the 
receiver type - // NOTE: we should probably put a lock around this search to - // avoid collisions by concurrent compilations ciVirtualCallData* vc_data = (ciVirtualCallData*) data; - uint i; - for (i = 0; i < VirtualCallData::row_limit(); i++) { + for (uint i = 0; i < VirtualCallData::row_limit(); i++) { ciKlass* receiver = vc_data->receiver(i); if (known_klass->equals(receiver)) { Address data_addr(mdo, md->byte_offset_of_slot(data, VirtualCallData::receiver_count_offset(i))); @@ -1566,32 +1537,13 @@ void LIR_Assembler::emit_profile_call(LIR_OpProfileCall* op) { return; } } - - // Receiver type not found in profile data; select an empty slot - // Note that this is less efficient than it should be because it - // always does a write to the receiver part of the - // VirtualCallData rather than just the first time - for (i = 0; i < VirtualCallData::row_limit(); i++) { - ciKlass* receiver = vc_data->receiver(i); - if (receiver == nullptr) { - Address recv_addr(mdo, md->byte_offset_of_slot(data, VirtualCallData::receiver_offset(i))); - __ mov_metadata(t1, known_klass->constant_encoding()); - __ sd(t1, recv_addr); - Address data_addr(mdo, md->byte_offset_of_slot(data, VirtualCallData::receiver_count_offset(i))); - __ increment(data_addr, DataLayout::counter_increment); - return; - } - } + // Receiver type is not found in profile data. + // Fall back to runtime helper to handle the rest at runtime. + __ mov_metadata(recv, known_klass->constant_encoding()); } else { __ load_klass(recv, recv); - Label update_done; - type_profile_helper(mdo, md, data, recv, &update_done); - // Receiver did not match any saved receiver and there is no empty row for it. - // Increment total counter to indicate polymorphic case. 
- __ increment(counter_addr, DataLayout::counter_increment); - - __ bind(update_done); } + type_profile_helper(mdo, md, data, recv); } else { // Static call __ increment(counter_addr, DataLayout::counter_increment); diff --git a/src/hotspot/cpu/riscv/c1_LIRAssembler_riscv.hpp b/src/hotspot/cpu/riscv/c1_LIRAssembler_riscv.hpp index 1e466e90d37..90b6b3ee4f4 100644 --- a/src/hotspot/cpu/riscv/c1_LIRAssembler_riscv.hpp +++ b/src/hotspot/cpu/riscv/c1_LIRAssembler_riscv.hpp @@ -54,9 +54,8 @@ private: Address stack_slot_address(int index, uint shift, int adjust = 0); // Record the type of the receiver in ReceiverTypeData - void type_profile_helper(Register mdo, - ciMethodData *md, ciProfileData *data, - Register recv, Label* update_done); + void type_profile_helper(Register mdo, ciMethodData *md, + ciProfileData *data, Register recv); void casw(Register addr, Register newval, Register cmpval); void caswu(Register addr, Register newval, Register cmpval); diff --git a/src/hotspot/cpu/riscv/interp_masm_riscv.cpp b/src/hotspot/cpu/riscv/interp_masm_riscv.cpp index 189c7c93d07..744590bec2b 100644 --- a/src/hotspot/cpu/riscv/interp_masm_riscv.cpp +++ b/src/hotspot/cpu/riscv/interp_masm_riscv.cpp @@ -237,15 +237,14 @@ void InterpreterMacroAssembler::load_resolved_klass_at_offset( // Rsub_klass: subklass // // Kills: -// x12, x15 +// x12 void InterpreterMacroAssembler::gen_subtype_check(Register Rsub_klass, Label& ok_is_subtype) { assert(Rsub_klass != x10, "x10 holds superklass"); assert(Rsub_klass != x12, "x12 holds 2ndary super array length"); - assert(Rsub_klass != x15, "x15 holds 2ndary super array scan ptr"); // Profile the not-null value's klass. - profile_typecheck(x12, Rsub_klass, x15); // blows x12, reloads x15 + profile_typecheck(x12, Rsub_klass); // blows x12 // Do the check. 
check_klass_subtype(Rsub_klass, x10, x12, ok_is_subtype); // blows x12 @@ -1042,7 +1041,6 @@ void InterpreterMacroAssembler::profile_final_call(Register mdp) { void InterpreterMacroAssembler::profile_virtual_call(Register receiver, Register mdp, - Register reg2, bool receiver_can_be_null) { if (ProfileInterpreter) { Label profile_continue; @@ -1060,7 +1058,7 @@ void InterpreterMacroAssembler::profile_virtual_call(Register receiver, } // Record the receiver type. - record_klass_in_profile(receiver, mdp, reg2); + profile_receiver_type(receiver, mdp, 0); bind(skip_receiver_profile); // The method data pointer needs to be updated to reflect the new target. @@ -1072,153 +1070,6 @@ void InterpreterMacroAssembler::profile_virtual_call(Register receiver, } } -// This routine creates a state machine for updating the multi-row -// type profile at a virtual call site (or other type-sensitive bytecode). -// The machine visits each row (of receiver/count) until the receiver type -// is found, or until it runs out of rows. At the same time, it remembers -// the location of the first empty row. (An empty row records null for its -// receiver, and can be allocated for a newly-observed receiver type.) -// Because there are two degrees of freedom in the state, a simple linear -// search will not work; it must be a decision tree. Hence this helper -// function is recursive, to generate the required tree structured code. -// It's the interpreter, so we are trading off code space for speed. -// See below for example code. 
-void InterpreterMacroAssembler::record_klass_in_profile_helper( - Register receiver, Register mdp, - Register reg2, Label& done) { - if (TypeProfileWidth == 0) { - increment_mdp_data_at(mdp, in_bytes(CounterData::count_offset())); - } else { - record_item_in_profile_helper(receiver, mdp, reg2, 0, done, TypeProfileWidth, - &VirtualCallData::receiver_offset, &VirtualCallData::receiver_count_offset); - } -} - -void InterpreterMacroAssembler::record_item_in_profile_helper(Register item, Register mdp, - Register reg2, int start_row, Label& done, int total_rows, - OffsetFunction item_offset_fn, OffsetFunction item_count_offset_fn) { - int last_row = total_rows - 1; - assert(start_row <= last_row, "must be work left to do"); - // Test this row for both the item and for null. - // Take any of three different outcomes: - // 1. found item => increment count and goto done - // 2. found null => keep looking for case 1, maybe allocate this cell - // 3. found something else => keep looking for cases 1 and 2 - // Case 3 is handled by a recursive call. - for (int row = start_row; row <= last_row; row++) { - Label next_test; - bool test_for_null_also = (row == start_row); - - // See if the item is item[n]. - int item_offset = in_bytes(item_offset_fn(row)); - test_mdp_data_at(mdp, item_offset, item, - (test_for_null_also ? reg2 : noreg), - next_test); - // (Reg2 now contains the item from the CallData.) - - // The item is item[n]. Increment count[n]. - int count_offset = in_bytes(item_count_offset_fn(row)); - increment_mdp_data_at(mdp, count_offset); - j(done); - bind(next_test); - - if (test_for_null_also) { - Label found_null; - // Failed the equality check on item[n]... Test for null. - if (start_row == last_row) { - // The only thing left to do is handle the null case. - beqz(reg2, found_null); - // Item did not match any saved item and there is no empty row for it. - // Increment total counter to indicate polymorphic case. 
- increment_mdp_data_at(mdp, in_bytes(CounterData::count_offset())); - j(done); - bind(found_null); - break; - } - // Since null is rare, make it be the branch-taken case. - beqz(reg2, found_null); - - // Put all the "Case 3" tests here. - record_item_in_profile_helper(item, mdp, reg2, start_row + 1, done, total_rows, - item_offset_fn, item_count_offset_fn); - - // Found a null. Keep searching for a matching item, - // but remember that this is an empty (unused) slot. - bind(found_null); - } - } - - // In the fall-through case, we found no matching item, but we - // observed the item[start_row] is null. - // Fill in the item field and increment the count. - int item_offset = in_bytes(item_offset_fn(start_row)); - set_mdp_data_at(mdp, item_offset, item); - int count_offset = in_bytes(item_count_offset_fn(start_row)); - mv(reg2, DataLayout::counter_increment); - set_mdp_data_at(mdp, count_offset, reg2); - if (start_row > 0) { - j(done); - } -} - -// Example state machine code for three profile rows: -// # main copy of decision tree, rooted at row[1] -// if (row[0].rec == rec) then [ -// row[0].incr() -// goto done -// ] -// if (row[0].rec != nullptr) then [ -// # inner copy of decision tree, rooted at row[1] -// if (row[1].rec == rec) then [ -// row[1].incr() -// goto done -// ] -// if (row[1].rec != nullptr) then [ -// # degenerate decision tree, rooted at row[2] -// if (row[2].rec == rec) then [ -// row[2].incr() -// goto done -// ] -// if (row[2].rec != nullptr) then [ -// count.incr() -// goto done -// ] # overflow -// row[2].init(rec) -// goto done -// ] else [ -// # remember row[1] is empty -// if (row[2].rec == rec) then [ -// row[2].incr() -// goto done -// ] -// row[1].init(rec) -// goto done -// ] -// else [ -// # remember row[0] is empty -// if (row[1].rec == rec) then [ -// row[1].incr() -// goto done -// ] -// if (row[2].rec == rec) then [ -// row[2].incr() -// goto done -// ] -// row[0].init(rec) -// goto done -// ] -// done: - -void 
InterpreterMacroAssembler::record_klass_in_profile(Register receiver, - Register mdp, Register reg2) { - assert(ProfileInterpreter, "must be profiling"); - Label done; - - record_klass_in_profile_helper(receiver, mdp, reg2, done); - - bind(done); -} - void InterpreterMacroAssembler::profile_ret(Register return_bci, Register mdp) { if (ProfileInterpreter) { Label profile_continue; @@ -1274,7 +1125,7 @@ void InterpreterMacroAssembler::profile_null_seen(Register mdp) { } } -void InterpreterMacroAssembler::profile_typecheck(Register mdp, Register klass, Register reg2) { +void InterpreterMacroAssembler::profile_typecheck(Register mdp, Register klass) { if (ProfileInterpreter) { Label profile_continue; @@ -1287,7 +1138,7 @@ void InterpreterMacroAssembler::profile_typecheck(Register mdp, Register klass, mdp_delta = in_bytes(VirtualCallData::virtual_call_data_size()); // Record the object type. - record_klass_in_profile(klass, mdp, reg2); + profile_receiver_type(klass, mdp, 0); } update_mdp_by_constant(mdp, mdp_delta); diff --git a/src/hotspot/cpu/riscv/interp_masm_riscv.hpp b/src/hotspot/cpu/riscv/interp_masm_riscv.hpp index a9df09d656a..59cc76b022f 100644 --- a/src/hotspot/cpu/riscv/interp_masm_riscv.hpp +++ b/src/hotspot/cpu/riscv/interp_masm_riscv.hpp @@ -262,14 +262,6 @@ class InterpreterMacroAssembler: public MacroAssembler { Register test_value_out, Label& not_equal_continue); - void record_klass_in_profile(Register receiver, Register mdp, - Register reg2); - void record_klass_in_profile_helper(Register receiver, Register mdp, - Register reg2, Label& done); - void record_item_in_profile_helper(Register item, Register mdp, - Register reg2, int start_row, Label& done, int total_rows, - OffsetFunction item_offset_fn, OffsetFunction item_count_offset_fn); - void update_mdp_by_offset(Register mdp_in, int offset_of_offset); void update_mdp_by_offset(Register mdp_in, Register reg, int offset_of_disp); void update_mdp_by_constant(Register mdp_in, int constant); @@ -283,11 
+275,10 @@ class InterpreterMacroAssembler: public MacroAssembler { void profile_call(Register mdp); void profile_final_call(Register mdp); void profile_virtual_call(Register receiver, Register mdp, - Register t1, bool receiver_can_be_null = false); void profile_ret(Register return_bci, Register mdp); void profile_null_seen(Register mdp); - void profile_typecheck(Register mdp, Register klass, Register temp); + void profile_typecheck(Register mdp, Register klass); void profile_typecheck_failed(Register mdp); void profile_switch_default(Register mdp); void profile_switch_case(Register index_in_scratch, Register mdp, diff --git a/src/hotspot/cpu/riscv/macroAssembler_riscv.cpp b/src/hotspot/cpu/riscv/macroAssembler_riscv.cpp index fb30f64e9ed..4f5e7afc166 100644 --- a/src/hotspot/cpu/riscv/macroAssembler_riscv.cpp +++ b/src/hotspot/cpu/riscv/macroAssembler_riscv.cpp @@ -543,6 +543,160 @@ void MacroAssembler::_verify_oop(Register reg, const char* s, const char* file, BLOCK_COMMENT("} verify_oop"); } +// Handle the receiver type profile update given the "recv" klass. +// +// Normally updates the ReceiverData (RD) that starts at "mdp" + "mdp_offset". +// If there are no matching or claimable receiver entries in RD, updates +// the polymorphic counter. +// +// This code is expected to be run by either the interpreter or JIT-ed code, without +// extra synchronization. For safety, receiver cells are claimed atomically, which +// avoids grossly misrepresenting the profiles under concurrent updates. For speed, +// counter updates are not atomic. 
+// +void MacroAssembler::profile_receiver_type(Register recv, Register mdp, int mdp_offset) { + assert_different_registers(recv, mdp, t0, t1); + + int base_receiver_offset = in_bytes(ReceiverTypeData::receiver_offset(0)); + int end_receiver_offset = in_bytes(ReceiverTypeData::receiver_offset(ReceiverTypeData::row_limit())); + int poly_count_offset = in_bytes(CounterData::count_offset()); + int receiver_step = in_bytes(ReceiverTypeData::receiver_offset(1)) - base_receiver_offset; + int receiver_to_count_step = in_bytes(ReceiverTypeData::receiver_count_offset(0)) - base_receiver_offset; + + // Adjust for MDP offsets. Slots are pointer-sized, so is the global offset. + base_receiver_offset += mdp_offset; + end_receiver_offset += mdp_offset; + poly_count_offset += mdp_offset; + +#ifdef ASSERT + // We are about to walk the MDO slots without asking for offsets. + // Check that our math hits all the right spots. + for (uint c = 0; c < ReceiverTypeData::row_limit(); c++) { + int real_recv_offset = mdp_offset + in_bytes(ReceiverTypeData::receiver_offset(c)); + int real_count_offset = mdp_offset + in_bytes(ReceiverTypeData::receiver_count_offset(c)); + int offset = base_receiver_offset + receiver_step*c; + int count_offset = offset + receiver_to_count_step; + assert(offset == real_recv_offset, "receiver slot math"); + assert(count_offset == real_count_offset, "receiver count math"); + } + int real_poly_count_offset = mdp_offset + in_bytes(CounterData::count_offset()); + assert(poly_count_offset == real_poly_count_offset, "poly counter math"); +#endif + + // Corner case: no profile table. Increment poly counter and exit. + if (ReceiverTypeData::row_limit() == 0) { + increment(Address(mdp, poly_count_offset), DataLayout::counter_increment); + return; + } + + Register offset = t1; + + Label L_loop_search_receiver, L_loop_search_empty; + Label L_restart, L_found_recv, L_found_empty, L_polymorphic, L_count_update; + + // The code here recognizes three major cases: + // A. 
Fastest: receiver found in the table + // B. Fast: no receiver in the table, and the table is full + // C. Slow: no receiver in the table, free slots in the table + // + // The case A performance is most important, as perfectly-behaved code would end up + // there, especially with larger TypeProfileWidth. The case B performance is + // important as well, this is where the bulk of code would land for normally megamorphic + // cases. The case C performance is not essential: its job is to deal with installation + // races, so we optimize for code density instead. Case C needs to make sure that receiver + // rows are only claimed once. This makes sure we never overwrite a row for another + // receiver and never duplicate the receivers in the list, making the profile type-accurate. + // + // It is very tempting to handle these cases in a single loop, and claim the first slot + // without checking the rest of the table. But, profiling code should tolerate free slots + // in the table, as class unloading can clear them. After such cleanup, the receiver + // we need might be _after_ the free slot. Therefore, we need to let at least one full scan + // complete before trying to install new slots. Splitting the code into several tight + // loops also helpfully optimizes for cases A and B. 
+ // + // This code is effectively: + // + // restart: + // // Fastest: receiver is already installed + // for (i = 0; i < receiver_count(); i++) { + // if (receiver(i) == recv) goto found_recv(i); + // } + // + // // Fast: no receiver, but profile is full + // for (i = 0; i < receiver_count(); i++) { + // if (receiver(i) == null) goto found_null(i); + // } + // goto polymorphic + // + // // Slow: try to install receiver + // found_null(i): + // CAS(&receiver(i), null, recv); + // goto restart + // + // polymorphic: + // count++; + // return + // + // found_recv(i): + // *receiver_count(i)++ + // + + bind(L_restart); + + // Fastest: receiver is already installed + mv(offset, base_receiver_offset); + bind(L_loop_search_receiver); + add(t0, mdp, offset); + ld(t0, Address(t0)); + beq(recv, t0, L_found_recv); + add(offset, offset, receiver_step); + sub(t0, offset, end_receiver_offset); + bnez(t0, L_loop_search_receiver); + + // Fast: no receiver, but profile is full + mv(offset, base_receiver_offset); + bind(L_loop_search_empty); + add(t0, mdp, offset); + ld(t0, Address(t0)); + beqz(t0, L_found_empty); + add(offset, offset, receiver_step); + sub(t0, offset, end_receiver_offset); + bnez(t0, L_loop_search_empty); + j(L_polymorphic); + + // Slow: try to install receiver + bind(L_found_empty); + + // Atomically swing receiver slot: null -> recv. + // + // The update uses CAS, which clobbers t0. Therefore, t1 + // is used to hold the destination address. This is safe because the + // offset is no longer needed after the address is computed. + add(t1, mdp, offset); + weak_cmpxchg(/*addr*/ t1, /*expected*/ zr, /*new*/ recv, Assembler::int64, + /*acquire*/ Assembler::relaxed, /*release*/ Assembler::relaxed, /*result*/ t0); + + // CAS success means the slot now has the receiver we want. CAS failure means + // something had claimed the slot concurrently: it can be the same receiver we want, + // or something else. 
Since this is a slow path, we can optimize for code density, + // and just restart the search from the beginning. + j(L_restart); + + // Counter updates: + // Increment polymorphic counter instead of receiver slot. + bind(L_polymorphic); + mv(offset, poly_count_offset); + j(L_count_update); + + // Found a receiver, convert its slot offset to corresponding count offset. + bind(L_found_recv); + add(offset, offset, receiver_to_count_step); + + bind(L_count_update); + add(t1, mdp, offset); + increment(Address(t1), DataLayout::counter_increment); +} + void MacroAssembler::_verify_oop_addr(Address addr, const char* s, const char* file, int line) { if (!VerifyOops) { return; diff --git a/src/hotspot/cpu/riscv/macroAssembler_riscv.hpp b/src/hotspot/cpu/riscv/macroAssembler_riscv.hpp index 3b021388fa5..f5e985c28a2 100644 --- a/src/hotspot/cpu/riscv/macroAssembler_riscv.hpp +++ b/src/hotspot/cpu/riscv/macroAssembler_riscv.hpp @@ -390,6 +390,8 @@ class MacroAssembler: public Assembler { Address argument_address(RegisterOrConstant arg_slot, int extra_slot_offset = 0); + void profile_receiver_type(Register recv, Register mdp, int mdp_offset); + // only if +VerifyOops void _verify_oop(Register reg, const char* s, const char* file, int line); void _verify_oop_addr(Address addr, const char* s, const char* file, int line); diff --git a/src/hotspot/cpu/riscv/templateTable_riscv.cpp b/src/hotspot/cpu/riscv/templateTable_riscv.cpp index 0fb529d1683..5cc725e3af4 100644 --- a/src/hotspot/cpu/riscv/templateTable_riscv.cpp +++ b/src/hotspot/cpu/riscv/templateTable_riscv.cpp @@ -3279,7 +3279,7 @@ void TemplateTable::invokevirtual_helper(Register index, __ load_klass(x10, recv); // profile this call - __ profile_virtual_call(x10, xlocals, x13); + __ profile_virtual_call(x10, xlocals); // get target Method & entry point __ lookup_virtual_method(x10, index, method); @@ -3406,7 +3406,7 @@ void TemplateTable::invokeinterface(int byte_no) { /*return_method=*/false); // profile this call - __ 
profile_virtual_call(x13, x30, x9); + __ profile_virtual_call(x13, x30); // Get declaring interface class from method, and itable index __ load_method_holder(x10, xmethod);