8357258: x86: Improve receiver type profiling reliability

Reviewed-by: kvn, vlivanov
This commit is contained in:
Aleksey Shipilev 2026-01-05 09:35:50 +00:00
parent 163038222a
commit e676c9de3d
8 changed files with 221 additions and 216 deletions

View File

@ -1261,29 +1261,9 @@ void LIR_Assembler::emit_alloc_array(LIR_OpAllocArray* op) {
void LIR_Assembler::type_profile_helper(Register mdo,
ciMethodData *md, ciProfileData *data,
Register recv, Label* update_done) {
for (uint i = 0; i < ReceiverTypeData::row_limit(); i++) {
Label next_test;
// See if the receiver is receiver[n].
__ cmpptr(recv, Address(mdo, md->byte_offset_of_slot(data, ReceiverTypeData::receiver_offset(i))));
__ jccb(Assembler::notEqual, next_test);
Address data_addr(mdo, md->byte_offset_of_slot(data, ReceiverTypeData::receiver_count_offset(i)));
__ addptr(data_addr, DataLayout::counter_increment);
__ jmp(*update_done);
__ bind(next_test);
}
// Didn't find receiver; find next empty slot and fill it in
for (uint i = 0; i < ReceiverTypeData::row_limit(); i++) {
Label next_test;
Address recv_addr(mdo, md->byte_offset_of_slot(data, ReceiverTypeData::receiver_offset(i)));
__ cmpptr(recv_addr, NULL_WORD);
__ jccb(Assembler::notEqual, next_test);
__ movptr(recv_addr, recv);
__ movptr(Address(mdo, md->byte_offset_of_slot(data, ReceiverTypeData::receiver_count_offset(i))), DataLayout::counter_increment);
__ jmp(*update_done);
__ bind(next_test);
}
Register recv) {
int mdp_offset = md->byte_offset_of_slot(data, in_ByteSize(0));
__ profile_receiver_type(recv, mdo, mdp_offset);
}
void LIR_Assembler::emit_typecheck_helper(LIR_OpTypeCheck *op, Label* success, Label* failure, Label* obj_is_null) {
@ -1341,15 +1321,9 @@ void LIR_Assembler::emit_typecheck_helper(LIR_OpTypeCheck *op, Label* success, L
__ jmp(*obj_is_null);
__ bind(not_null);
Label update_done;
Register recv = k_RInfo;
__ load_klass(recv, obj, tmp_load_klass);
type_profile_helper(mdo, md, data, recv, &update_done);
Address nonprofiled_receiver_count_addr(mdo, md->byte_offset_of_slot(data, CounterData::count_offset()));
__ addptr(nonprofiled_receiver_count_addr, DataLayout::counter_increment);
__ bind(update_done);
type_profile_helper(mdo, md, data, recv);
} else {
__ jcc(Assembler::equal, *obj_is_null);
}
@ -1461,14 +1435,9 @@ void LIR_Assembler::emit_opTypeCheck(LIR_OpTypeCheck* op) {
__ jmp(done);
__ bind(not_null);
Label update_done;
Register recv = k_RInfo;
__ load_klass(recv, value, tmp_load_klass);
type_profile_helper(mdo, md, data, recv, &update_done);
Address counter_addr(mdo, md->byte_offset_of_slot(data, CounterData::count_offset()));
__ addptr(counter_addr, DataLayout::counter_increment);
__ bind(update_done);
type_profile_helper(mdo, md, data, recv);
} else {
__ jcc(Assembler::equal, done);
}
@ -2791,13 +2760,9 @@ void LIR_Assembler::emit_profile_call(LIR_OpProfileCall* op) {
if (C1OptimizeVirtualCallProfiling && known_klass != nullptr) {
// We know the type that will be seen at this call site; we can
// statically update the MethodData* rather than needing to do
// dynamic tests on the receiver type
// NOTE: we should probably put a lock around this search to
// avoid collisions by concurrent compilations
// dynamic tests on the receiver type.
ciVirtualCallData* vc_data = (ciVirtualCallData*) data;
uint i;
for (i = 0; i < VirtualCallData::row_limit(); i++) {
for (uint i = 0; i < VirtualCallData::row_limit(); i++) {
ciKlass* receiver = vc_data->receiver(i);
if (known_klass->equals(receiver)) {
Address data_addr(mdo, md->byte_offset_of_slot(data, VirtualCallData::receiver_count_offset(i)));
@ -2805,32 +2770,13 @@ void LIR_Assembler::emit_profile_call(LIR_OpProfileCall* op) {
return;
}
}
// Receiver type not found in profile data; select an empty slot
// Note that this is less efficient than it should be because it
// always does a write to the receiver part of the
// VirtualCallData rather than just the first time
for (i = 0; i < VirtualCallData::row_limit(); i++) {
ciKlass* receiver = vc_data->receiver(i);
if (receiver == nullptr) {
Address recv_addr(mdo, md->byte_offset_of_slot(data, VirtualCallData::receiver_offset(i)));
__ mov_metadata(recv_addr, known_klass->constant_encoding(), rscratch1);
Address data_addr(mdo, md->byte_offset_of_slot(data, VirtualCallData::receiver_count_offset(i)));
__ addptr(data_addr, DataLayout::counter_increment);
return;
}
}
// Receiver type is not found in profile data.
// Fall back to runtime helper to handle the rest at runtime.
__ mov_metadata(recv, known_klass->constant_encoding());
} else {
__ load_klass(recv, recv, tmp_load_klass);
Label update_done;
type_profile_helper(mdo, md, data, recv, &update_done);
// Receiver did not match any saved receiver and there is no empty row for it.
// Increment total counter to indicate polymorphic case.
__ addptr(counter_addr, DataLayout::counter_increment);
__ bind(update_done);
}
type_profile_helper(mdo, md, data, recv);
} else {
// Static call
__ addptr(counter_addr, DataLayout::counter_increment);

View File

@ -43,7 +43,7 @@
// Record the type of the receiver in ReceiverTypeData
void type_profile_helper(Register mdo,
ciMethodData *md, ciProfileData *data,
Register recv, Label* update_done);
Register recv);
enum {
_call_stub_size = 28,

View File

@ -580,17 +580,16 @@ void InterpreterMacroAssembler::load_resolved_klass_at_index(Register klass,
// Rsub_klass: subklass
//
// Kills:
// rcx, rdi
// rcx
void InterpreterMacroAssembler::gen_subtype_check(Register Rsub_klass,
Label& ok_is_subtype) {
assert(Rsub_klass != rax, "rax holds superklass");
assert(Rsub_klass != r14, "r14 holds locals");
assert(Rsub_klass != r13, "r13 holds bcp");
assert(Rsub_klass != rcx, "rcx holds 2ndary super array length");
assert(Rsub_klass != rdi, "rdi holds 2ndary super array scan ptr");
// Profile the not-null value's klass.
profile_typecheck(rcx, Rsub_klass, rdi); // blows rcx, reloads rdi
profile_typecheck(rcx, Rsub_klass); // blows rcx
// Do the check.
check_klass_subtype(Rsub_klass, rax, rcx, ok_is_subtype); // blows rcx
@ -1394,7 +1393,6 @@ void InterpreterMacroAssembler::profile_final_call(Register mdp) {
void InterpreterMacroAssembler::profile_virtual_call(Register receiver,
Register mdp,
Register reg2,
bool receiver_can_be_null) {
if (ProfileInterpreter) {
Label profile_continue;
@ -1414,7 +1412,7 @@ void InterpreterMacroAssembler::profile_virtual_call(Register receiver,
}
// Record the receiver type.
record_klass_in_profile(receiver, mdp, reg2, true);
profile_receiver_type(receiver, mdp, 0);
bind(skip_receiver_profile);
// The method data pointer needs to be updated to reflect the new target.
@ -1423,135 +1421,6 @@ void InterpreterMacroAssembler::profile_virtual_call(Register receiver,
}
}
// This routine creates a state machine for updating the multi-row
// type profile at a virtual call site (or other type-sensitive bytecode).
// The machine visits each row (of receiver/count) until the receiver type
// is found, or until it runs out of rows. At the same time, it remembers
// the location of the first empty row. (An empty row records null for its
// receiver, and can be allocated for a newly-observed receiver type.)
// Because there are two degrees of freedom in the state, a simple linear
// search will not work; it must be a decision tree. Hence this helper
// function is recursive, to generate the required tree structured code.
// It's the interpreter, so we are trading off code space for speed.
// See below for example code.
// Emits the profile update for one observed receiver klass.
// When TypeProfileWidth is 0 only the CounterData count is incremented.
// Otherwise the row search over the VirtualCallData receiver/count cells is
// delegated to record_item_in_profile_helper with TypeProfileWidth rows.
// Note: start_row and is_virtual_call are not read in this body; the row
// search is always started at row 0.
void InterpreterMacroAssembler::record_klass_in_profile_helper(
Register receiver, Register mdp,
Register reg2, int start_row,
Label& done, bool is_virtual_call) {
if (TypeProfileWidth == 0) {
// No rows are searched in this configuration: bump the total counter only.
increment_mdp_data_at(mdp, in_bytes(CounterData::count_offset()));
} else {
// Row search always begins at row 0, regardless of the start_row argument.
record_item_in_profile_helper(receiver, mdp, reg2, 0, done, TypeProfileWidth,
&VirtualCallData::receiver_offset, &VirtualCallData::receiver_count_offset);
}
}
// Generates the search/update code for rows start_row..total_rows-1 of a
// profile table addressed through mdp. Recurses (at code-generation time) to
// emit the nested decision tree described in the comments below.
//
//   item                 - register holding the value to look for in the rows
//   mdp                  - register used as the base for all *_mdp_data_at accesses
//   reg2                 - scratch register; receives the stored item of the row
//                          that is also tested for null, and the initial count
//                          value on the fall-through path
//   start_row            - first row to test; also the row that gets filled in
//                          when the fall-through path is reached
//   done                 - label jumped to once a counter has been updated
//   total_rows           - number of rows in the table
//   item_offset_fn       - maps a row index to the offset of its item cell
//   item_count_offset_fn - maps a row index to the offset of its count cell
void InterpreterMacroAssembler::record_item_in_profile_helper(Register item, Register mdp, Register reg2, int start_row,
Label& done, int total_rows,
OffsetFunction item_offset_fn,
OffsetFunction item_count_offset_fn) {
int last_row = total_rows - 1;
assert(start_row <= last_row, "must be work left to do");
// Test this row for both the item and for null.
// Take any of three different outcomes:
// 1. found item => increment count and goto done
// 2. found null => keep looking for case 1, maybe allocate this cell
// 3. found something else => keep looking for cases 1 and 2
// Case 3 is handled by a recursive call.
for (int row = start_row; row <= last_row; row++) {
Label next_test;
// Only the first row of this invocation is also checked for null; later rows
// are checked for a match only.
bool test_for_null_also = (row == start_row);
// See if the item is item[n].
int item_offset = in_bytes(item_offset_fn(row));
test_mdp_data_at(mdp, item_offset, item,
(test_for_null_also ? reg2 : noreg),
next_test);
// (Reg2 now contains the item from the CallData.)
// The item is item[n]. Increment count[n].
int count_offset = in_bytes(item_count_offset_fn(row));
increment_mdp_data_at(mdp, count_offset);
jmp(done);
bind(next_test);
if (test_for_null_also) {
// Failed the equality check on item[n]... Test for null.
testptr(reg2, reg2);
if (start_row == last_row) {
// The only thing left to do is handle the null case.
Label found_null;
jccb(Assembler::zero, found_null);
// Item did not match any saved item and there is no empty row for it.
// Increment total counter to indicate polymorphic case.
increment_mdp_data_at(mdp, in_bytes(CounterData::count_offset()));
jmp(done);
bind(found_null);
// Leave the loop; the code after it fills in this (last) row.
break;
}
Label found_null;
// Since null is rare, make it be the branch-taken case.
jcc(Assembler::zero, found_null);
// Put all the "Case 3" tests here.
record_item_in_profile_helper(item, mdp, reg2, start_row + 1, done, total_rows,
item_offset_fn, item_count_offset_fn);
// Found a null. Keep searching for a matching item,
// but remember that this is an empty (unused) slot.
bind(found_null);
}
}
// In the fall-through case, we found no matching item, but we
// observed the item[start_row] is null.
// Fill in the item field and increment the count.
int item_offset = in_bytes(item_offset_fn(start_row));
set_mdp_data_at(mdp, item_offset, item);
int count_offset = in_bytes(item_count_offset_fn(start_row));
// The count cell is overwritten with a single increment, not added to.
movl(reg2, DataLayout::counter_increment);
set_mdp_data_at(mdp, count_offset, reg2);
// Nested invocations must jump to done; the outermost one (start_row == 0)
// emits no jump and simply falls through to whatever the caller emits next.
if (start_row > 0) {
jmp(done);
}
}
// Example state machine code for three profile rows:
// // main copy of decision tree, rooted at row[0]
// if (row[0].rec == rec) { row[0].incr(); goto done; }
// if (row[0].rec != nullptr) {
// // inner copy of decision tree, rooted at row[1]
// if (row[1].rec == rec) { row[1].incr(); goto done; }
// if (row[1].rec != nullptr) {
// // degenerate decision tree, rooted at row[2]
// if (row[2].rec == rec) { row[2].incr(); goto done; }
// if (row[2].rec != nullptr) { count.incr(); goto done; } // overflow
// row[2].init(rec); goto done;
// } else {
// // remember row[1] is empty
// if (row[2].rec == rec) { row[2].incr(); goto done; }
// row[1].init(rec); goto done;
// }
// } else {
// // remember row[0] is empty
// if (row[1].rec == rec) { row[1].incr(); goto done; }
// if (row[2].rec == rec) { row[2].incr(); goto done; }
// row[0].init(rec); goto done;
// }
// done:
// Entry point for recording a klass in the interpreter profile. Emits the
// row-search code via record_klass_in_profile_helper, starting at row 0, and
// binds the shared exit label immediately after it.
void InterpreterMacroAssembler::record_klass_in_profile(Register receiver,
                                                        Register mdp, Register reg2,
                                                        bool is_virtual_call) {
  assert(ProfileInterpreter, "must be profiling");

  Label profile_updated;
  record_klass_in_profile_helper(receiver, mdp, reg2, /* start_row */ 0,
                                 profile_updated, is_virtual_call);
  bind(profile_updated);
}
void InterpreterMacroAssembler::profile_ret(Register return_bci,
Register mdp) {
if (ProfileInterpreter) {
@ -1611,7 +1480,7 @@ void InterpreterMacroAssembler::profile_null_seen(Register mdp) {
}
void InterpreterMacroAssembler::profile_typecheck(Register mdp, Register klass, Register reg2) {
void InterpreterMacroAssembler::profile_typecheck(Register mdp, Register klass) {
if (ProfileInterpreter) {
Label profile_continue;
@ -1624,7 +1493,7 @@ void InterpreterMacroAssembler::profile_typecheck(Register mdp, Register klass,
mdp_delta = in_bytes(VirtualCallData::virtual_call_data_size());
// Record the object type.
record_klass_in_profile(klass, mdp, reg2, false);
profile_receiver_type(klass, mdp, 0);
}
update_mdp_by_constant(mdp, mdp_delta);

View File

@ -234,16 +234,6 @@ class InterpreterMacroAssembler: public MacroAssembler {
Register test_value_out,
Label& not_equal_continue);
void record_klass_in_profile(Register receiver, Register mdp,
Register reg2, bool is_virtual_call);
void record_klass_in_profile_helper(Register receiver, Register mdp,
Register reg2, int start_row,
Label& done, bool is_virtual_call);
void record_item_in_profile_helper(Register item, Register mdp, Register reg2, int start_row,
Label& done, int total_rows,
OffsetFunction item_offset_fn,
OffsetFunction item_count_offset_fn);
void update_mdp_by_offset(Register mdp_in, int offset_of_offset);
void update_mdp_by_offset(Register mdp_in, Register reg, int offset_of_disp);
void update_mdp_by_constant(Register mdp_in, int constant);
@ -254,11 +244,10 @@ class InterpreterMacroAssembler: public MacroAssembler {
void profile_call(Register mdp);
void profile_final_call(Register mdp);
void profile_virtual_call(Register receiver, Register mdp,
Register scratch2,
bool receiver_can_be_null = false);
void profile_ret(Register return_bci, Register mdp);
void profile_null_seen(Register mdp);
void profile_typecheck(Register mdp, Register klass, Register scratch);
void profile_typecheck(Register mdp, Register klass);
void profile_switch_default(Register mdp);
void profile_switch_case(Register index_in_scratch, Register mdp,

View File

@ -4749,6 +4749,203 @@ Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
return Address(rsp, scale_reg, scale_factor, offset);
}
// Handle the receiver type profile update given the "recv" klass.
//
// Normally updates the ReceiverData (RD) that starts at "mdp" + "mdp_offset".
// If there are no matching or claimable receiver entries in RD, updates
// the polymorphic counter.
//
// This code is expected to be run by either the interpreter or JIT-ed code, without
// extra synchronization. For safety, receiver cells are claimed atomically
// (LOCK-prefixed CMPXCHG), which avoids grossly misrepresenting the profiles under
// concurrent updates. For speed, counter updates are not atomic.
//
// Parameters:
//   recv       - register holding the receiver klass to record
//   mdp        - register holding the base address the profile data is addressed from
//   mdp_offset - byte offset of the profile entry from "mdp"; must be word-aligned
//
// Register effects:
//   rscratch1 is used as the running (word-scaled) slot offset and is clobbered.
//   RAX, or a spare register when recv/mdp live in RAX, is saved on the stack and
//   restored around the CAS, so recv and mdp hold their original values on exit.
//
void MacroAssembler::profile_receiver_type(Register recv, Register mdp, int mdp_offset) {
  int base_receiver_offset   = in_bytes(ReceiverTypeData::receiver_offset(0));
  int end_receiver_offset    = in_bytes(ReceiverTypeData::receiver_offset(ReceiverTypeData::row_limit()));
  int poly_count_offset      = in_bytes(CounterData::count_offset());
  int receiver_step          = in_bytes(ReceiverTypeData::receiver_offset(1)) - base_receiver_offset;
  int receiver_to_count_step = in_bytes(ReceiverTypeData::receiver_count_offset(0)) - base_receiver_offset;

  // Adjust for MDP offsets. Slots are pointer-sized, so is the global offset.
  assert(is_aligned(mdp_offset, BytesPerWord), "sanity");
  base_receiver_offset += mdp_offset;
  end_receiver_offset  += mdp_offset;
  poly_count_offset    += mdp_offset;

  // Scale down to optimize encoding. Slots are pointer-sized.
  assert(is_aligned(base_receiver_offset,   BytesPerWord), "sanity");
  assert(is_aligned(end_receiver_offset,    BytesPerWord), "sanity");
  assert(is_aligned(poly_count_offset,      BytesPerWord), "sanity");
  assert(is_aligned(receiver_step,          BytesPerWord), "sanity");
  assert(is_aligned(receiver_to_count_step, BytesPerWord), "sanity");
  base_receiver_offset   >>= LogBytesPerWord;
  end_receiver_offset    >>= LogBytesPerWord;
  poly_count_offset      >>= LogBytesPerWord;
  receiver_step          >>= LogBytesPerWord;
  receiver_to_count_step >>= LogBytesPerWord;

#ifdef ASSERT
  // We are about to walk the MDO slots without asking for offsets.
  // Check that our math hits all the right spots.
  for (uint c = 0; c < ReceiverTypeData::row_limit(); c++) {
    int real_recv_offset  = mdp_offset + in_bytes(ReceiverTypeData::receiver_offset(c));
    int real_count_offset = mdp_offset + in_bytes(ReceiverTypeData::receiver_count_offset(c));
    int offset = base_receiver_offset + receiver_step*c;
    int count_offset = offset + receiver_to_count_step;
    assert((offset << LogBytesPerWord) == real_recv_offset, "receiver slot math");
    assert((count_offset << LogBytesPerWord) == real_count_offset, "receiver count math");
  }
  int real_poly_count_offset = mdp_offset + in_bytes(CounterData::count_offset());
  assert(poly_count_offset << LogBytesPerWord == real_poly_count_offset, "poly counter math");
#endif

  // Corner case: no profile table. Increment poly counter and exit.
  if (ReceiverTypeData::row_limit() == 0) {
    addptr(Address(mdp, poly_count_offset, Address::times_ptr), DataLayout::counter_increment);
    return;
  }

  Register offset = rscratch1;

  Label L_loop_search_receiver, L_loop_search_empty;
  Label L_restart, L_found_recv, L_found_empty, L_polymorphic, L_count_update;

  // The code here recognizes three major cases:
  //   A. Fastest: receiver found in the table
  //   B. Fast: no receiver in the table, and the table is full
  //   C. Slow: no receiver in the table, free slots in the table
  //
  // The case A performance is most important, as perfectly-behaved code would end up
  // there, especially with larger TypeProfileWidth. The case B performance is
  // important as well, this is where bulk of code would land for normally megamorphic
  // cases. The case C performance is not essential, its job is to deal with installation
  // races, we optimize for code density instead. Case C needs to make sure that receiver
  // rows are only claimed once. This makes sure we never overwrite a row for another
  // receiver and never duplicate the receivers in the list, making profile type-accurate.
  //
  // It is very tempting to handle these cases in a single loop, and claim the first slot
  // without checking the rest of the table. But, profiling code should tolerate free slots
  // in the table, as class unloading can clear them. After such cleanup, the receiver
  // we need might be _after_ the free slot. Therefore, we need to let at least one full
  // scan complete before trying to install new slots. Splitting the code in several tight
  // loops also helpfully optimizes for cases A and B.
  //
  // This code is effectively:
  //
  // restart:
  //   // Fastest: receiver is already installed
  //   for (i = 0; i < receiver_count(); i++) {
  //     if (receiver(i) == recv) goto found_recv(i);
  //   }
  //
  //   // Fast: no receiver, but profile is full
  //   for (i = 0; i < receiver_count(); i++) {
  //     if (receiver(i) == null) goto found_null(i);
  //   }
  //   goto polymorphic
  //
  //   // Slow: try to install receiver
  // found_null(i):
  //   CAS(&receiver(i), null, recv);
  //   goto restart
  //
  // polymorphic:
  //   count++;
  //   return
  //
  // found_recv(i):
  //   *receiver_count(i)++
  //

  bind(L_restart);

  // Fastest: receiver is already installed
  movptr(offset, base_receiver_offset);
  bind(L_loop_search_receiver);
    cmpptr(recv, Address(mdp, offset, Address::times_ptr));
    jccb(Assembler::equal, L_found_recv);
  addptr(offset, receiver_step);
  cmpptr(offset, end_receiver_offset);
  jccb(Assembler::notEqual, L_loop_search_receiver);

  // Fast: no receiver, but profile is full
  movptr(offset, base_receiver_offset);
  bind(L_loop_search_empty);
    cmpptr(Address(mdp, offset, Address::times_ptr), NULL_WORD);
    jccb(Assembler::equal, L_found_empty);
  addptr(offset, receiver_step);
  cmpptr(offset, end_receiver_offset);
  jccb(Assembler::notEqual, L_loop_search_empty);
  jmpb(L_polymorphic);

  // Slow: try to install receiver
  bind(L_found_empty);

  // Atomically swing receiver slot: null -> recv.
  //
  // The update code uses CAS, which wants RAX register specifically, *and* it needs
  // other important registers untouched, as they form the address. Therefore, we need
  // to shift any important registers from RAX into some other spare register. If we
  // need a spare register, we are forced to save it on stack here.

  Register spare_reg = noreg;
  Register shifted_mdp = mdp;
  Register shifted_recv = recv;
  if (recv == rax || mdp == rax) {
    // Pick the first of rbx/rcx/rdx that is neither recv nor mdp.
    spare_reg = (recv != rbx && mdp != rbx) ? rbx :
                (recv != rcx && mdp != rcx) ? rcx :
                rdx;
    assert_different_registers(mdp, recv, offset, spare_reg);

    push(spare_reg);
    if (recv == rax) {
      movptr(spare_reg, recv);
      shifted_recv = spare_reg;
    } else {
      assert(mdp == rax, "Remaining case");
      movptr(spare_reg, mdp);
      shifted_mdp = spare_reg;
    }
  } else {
    push(rax);
  }

  // None of the important registers are in RAX after this shuffle.
  assert_different_registers(rax, shifted_mdp, shifted_recv, offset);

  // Expected value for the CAS is null (RAX == 0).
  xorptr(rax, rax);
  // CMPXCHG with a memory operand is only atomic with respect to other processors
  // when it carries the LOCK prefix. Without it, two threads can both "claim" the
  // same empty slot, and one receiver silently overwrites the other.
  lock();
  cmpxchgptr(shifted_recv, Address(shifted_mdp, offset, Address::times_ptr));

  // Unshift registers. RAX gets back the value that was moved to the spare register;
  // the spare register (or RAX itself) gets its saved value back from the stack.
  if (recv == rax || mdp == rax) {
    movptr(rax, spare_reg);
    pop(spare_reg);
  } else {
    pop(rax);
  }

  // CAS success means the slot now has the receiver we want. CAS failure means
  // something had claimed the slot concurrently: it can be the same receiver we want,
  // or something else. Since this is a slow path, we can optimize for code density,
  // and just restart the search from the beginning.
  jmpb(L_restart);

  // Counter updates:

  // Increment polymorphic counter instead of receiver slot.
  bind(L_polymorphic);
  movptr(offset, poly_count_offset);
  jmpb(L_count_update);

  // Found a receiver, convert its slot offset to corresponding count offset.
  bind(L_found_recv);
  addptr(offset, receiver_to_count_step);

  bind(L_count_update);
  addptr(Address(mdp, offset, Address::times_ptr), DataLayout::counter_increment);
}
void MacroAssembler::_verify_oop_addr(Address addr, const char* s, const char* file, int line) {
if (!VerifyOops) return;

View File

@ -668,6 +668,8 @@ public:
// method handles (JSR 292)
Address argument_address(RegisterOrConstant arg_slot, int extra_slot_offset = 0);
void profile_receiver_type(Register recv, Register mdp, int mdp_offset);
// Debugging
// only if +VerifyOops

View File

@ -3266,7 +3266,7 @@ void TemplateTable::invokevirtual_helper(Register index,
__ load_klass(rax, recv, rscratch1);
// profile this call
__ profile_virtual_call(rax, rlocals, rdx);
__ profile_virtual_call(rax, rlocals);
// get target Method* & entry point
__ lookup_virtual_method(rax, index, method);
@ -3407,7 +3407,7 @@ void TemplateTable::invokeinterface(int byte_no) {
// profile this call
__ restore_bcp(); // rbcp was destroyed by receiver type check
__ profile_virtual_call(rdx, rbcp, rlocals);
__ profile_virtual_call(rdx, rbcp);
// Get declaring interface class from method, and itable index
__ load_method_holder(rax, rbx);

View File

@ -1148,6 +1148,8 @@ public:
// is seen. A per ReceiverTypeData counter is incremented on type
// overflow (when there's no more room for a not yet profiled Klass*).
//
// Updated by platform-specific code, for example MacroAssembler::profile_receiver_type.
//
class ReceiverTypeData : public CounterData {
friend class VMStructs;
friend class JVMCIVMStructs;