8385746: S390: Improve receiver type profiling reliability

Reviewed-by: shade, mdoerr
This commit is contained in:
Amit Kumar 2026-06-16 12:58:40 +00:00
parent 5b1b98c109
commit 8de834cd8a
7 changed files with 170 additions and 192 deletions

View File

@ -4450,7 +4450,7 @@ void MacroAssembler::profile_receiver_type(Register recv, Register mdp, int mdp_
addi(offset, offset, receiver_step);
bdnz(L_loop_search_receiver);
// Fast: no receiver, but profile is full
// Fast: no receiver, but profile is not full
if (count != noreg) {
mtctr(count);
} else {

View File

@ -2413,32 +2413,9 @@ void LIR_Assembler::emit_alloc_array(LIR_OpAllocArray* op) {
}
void LIR_Assembler::type_profile_helper(Register mdo, ciMethodData *md, ciProfileData *data,
Register recv, Register tmp1, Label* update_done) {
uint i;
for (i = 0; i < VirtualCallData::row_limit(); i++) {
Label next_test;
// See if the receiver is receiver[n].
Address receiver_addr(mdo, md->byte_offset_of_slot(data, ReceiverTypeData::receiver_offset(i)));
__ z_cg(recv, receiver_addr);
__ z_brne(next_test);
Address data_addr(mdo, md->byte_offset_of_slot(data, ReceiverTypeData::receiver_count_offset(i)));
__ add2mem_64(data_addr, DataLayout::counter_increment, tmp1);
__ branch_optimized(Assembler::bcondAlways, *update_done);
__ bind(next_test);
}
// Didn't find receiver; find next empty slot and fill it in.
for (i = 0; i < VirtualCallData::row_limit(); i++) {
Label next_test;
Address recv_addr(mdo, md->byte_offset_of_slot(data, ReceiverTypeData::receiver_offset(i)));
__ z_ltg(Z_R0_scratch, recv_addr);
__ z_brne(next_test);
__ z_stg(recv, recv_addr);
__ load_const_optimized(tmp1, DataLayout::counter_increment);
__ z_stg(tmp1, md->byte_offset_of_slot(data, ReceiverTypeData::receiver_count_offset(i)), mdo);
__ branch_optimized(Assembler::bcondAlways, *update_done);
__ bind(next_test);
}
Register recv, Register tmp1) {
int mdp_offset = md->byte_offset_of_slot(data, in_ByteSize(0));
__ profile_receiver_type(recv, mdo, mdp_offset, tmp1);
}
void LIR_Assembler::setup_md_access(ciMethod* method, int bci,
@ -2510,13 +2487,9 @@ void LIR_Assembler::emit_typecheck_helper(LIR_OpTypeCheck *op, Label* success, L
__ branch_optimized(Assembler::bcondAlways, *obj_is_null);
__ bind(not_null);
NearLabel update_done;
Register recv = k_RInfo;
__ load_klass(recv, obj);
type_profile_helper(mdo, md, data, recv, Rtmp1, &update_done);
Address counter_addr(mdo, md->byte_offset_of_slot(data, CounterData::count_offset()));
__ add2mem_64(counter_addr, DataLayout::counter_increment, Rtmp1);
__ bind(update_done);
type_profile_helper(mdo, md, data, recv, Rtmp1);
} else {
__ compareU64_and_branch(obj, (intptr_t) 0, Assembler::bcondEqual, *obj_is_null);
}
@ -2606,13 +2579,9 @@ void LIR_Assembler::emit_opTypeCheck(LIR_OpTypeCheck* op) {
__ branch_optimized(Assembler::bcondAlways, done);
__ bind(not_null);
NearLabel update_done;
Register recv = k_RInfo;
__ load_klass(recv, value);
type_profile_helper(mdo, md, data, recv, Rtmp1, &update_done);
Address counter_addr(mdo, md->byte_offset_of_slot(data, CounterData::count_offset()));
__ add2mem_64(counter_addr, DataLayout::counter_increment, Rtmp1);
__ bind(update_done);
type_profile_helper(mdo, md, data, recv, Rtmp1);
} else {
__ compareU64_and_branch(value, (intptr_t) 0, Assembler::bcondEqual, done);
}
@ -2772,11 +2741,8 @@ void LIR_Assembler::emit_profile_call(LIR_OpProfileCall* op) {
// statically update the MethodData* rather than needing to do
// dynamic tests on the receiver type.
// NOTE: we should probably put a lock around this search to
// avoid collisions by concurrent compilations.
ciVirtualCallData* vc_data = (ciVirtualCallData*) data;
uint i;
for (i = 0; i < VirtualCallData::row_limit(); i++) {
for (uint i = 0; i < VirtualCallData::row_limit(); i++) {
ciKlass* receiver = vc_data->receiver(i);
if (known_klass->equals(receiver)) {
Address data_addr(mdo, md->byte_offset_of_slot(data, VirtualCallData::receiver_count_offset(i)));
@ -2784,32 +2750,13 @@ void LIR_Assembler::emit_profile_call(LIR_OpProfileCall* op) {
return;
}
}
// Receiver type not found in profile data. Select an empty slot.
// Note that this is less efficient than it should be because it
// always does a write to the receiver part of the
// VirtualCallData rather than just the first time.
for (i = 0; i < VirtualCallData::row_limit(); i++) {
ciKlass* receiver = vc_data->receiver(i);
if (receiver == nullptr) {
Address recv_addr(mdo, md->byte_offset_of_slot(data, VirtualCallData::receiver_offset(i)));
metadata2reg(known_klass->constant_encoding(), tmp1);
__ z_stg(tmp1, recv_addr);
Address data_addr(mdo, md->byte_offset_of_slot(data, VirtualCallData::receiver_count_offset(i)));
__ add2mem_64(data_addr, DataLayout::counter_increment, tmp1);
return;
}
}
// Receiver type is not found in profile data.
// Fall back to runtime helper to handle the rest at runtime.
metadata2reg(known_klass->constant_encoding(), recv);
} else {
__ load_klass(recv, recv);
NearLabel update_done;
type_profile_helper(mdo, md, data, recv, tmp1, &update_done);
// Receiver did not match any saved receiver and there is no empty row for it.
// Increment total counter to indicate polymorphic case.
__ add2mem_64(counter_addr, DataLayout::counter_increment, tmp1);
__ bind(update_done);
}
type_profile_helper(mdo, md, data, recv, tmp1);
} else {
// static call
__ add2mem_64(counter_addr, DataLayout::counter_increment, tmp1);

View File

@ -30,7 +30,7 @@
// Record the type of the receiver in ReceiverTypeData.
void type_profile_helper(Register mdo, ciMethodData *md, ciProfileData *data,
Register recv, Register tmp1, Label* update_done);
Register recv, Register tmp1);
// Setup pointers to MDO, MDO slot, also compute offset bias to access the slot.
void setup_md_access(ciMethod* method, int bci,
ciMethodData*& md, ciProfileData*& data, int& mdo_offset_bias);

View File

@ -1267,7 +1267,7 @@ void InterpreterMacroAssembler::profile_virtual_call(Register receiver,
test_method_data_pointer(mdp, profile_continue);
// Record the receiver type.
record_klass_in_profile(receiver, mdp, reg2);
profile_receiver_type(receiver, mdp, 0, reg2);
// The method data pointer needs to be updated to reflect the new target.
update_mdp_by_constant(mdp, in_bytes(VirtualCallData::virtual_call_data_size()));
@ -1275,125 +1275,6 @@ void InterpreterMacroAssembler::profile_virtual_call(Register receiver,
}
}
// This routine creates a state machine for updating the multi-row
// type profile at a virtual call site (or other type-sensitive bytecode).
// The machine visits each row (of receiver/count) until the receiver type
// is found, or until it runs out of rows. At the same time, it remembers
// the location of the first empty row. (An empty row records null for its
// receiver, and can be allocated for a newly-observed receiver type.)
// Because there are two degrees of freedom in the state, a simple linear
// search will not work; it must be a decision tree. Hence this helper
// function is recursive, to generate the required tree structured code.
// It's the interpreter, so we are trading off code space for speed.
// See below for example code.
// Emits the part of the receiver-profile decision tree that starts at start_row.
//   receiver  - register holding the klass to record
//   mdp       - register holding the method data pointer; row offsets are relative to it
//   reg2      - temp: receives the stored receiver of row start_row (for the null test)
//               and later the initial count when a row is claimed
//   start_row - first row examined by this invocation; the recursive call passes
//               start_row + 1 after row start_row was seen to hold another receiver
//   done      - label that the emitted update paths branch to
void InterpreterMacroAssembler::record_klass_in_profile_helper(
Register receiver, Register mdp,
Register reg2, int start_row,
Label& done) {
// No receiver rows configured: only the CounterData count is incremented and
// no row tests are emitted at all.
if (TypeProfileWidth == 0) {
increment_mdp_data_at(mdp, in_bytes(CounterData::count_offset()));
return;
}
int last_row = VirtualCallData::row_limit() - 1;
assert(start_row <= last_row, "must be work left to do");
// Test this row for both the receiver and for null.
// Take any of three different outcomes:
// 1. found receiver => increment count and goto done
// 2. found null => keep looking for case 1, maybe allocate this cell
// 3. found something else => keep looking for cases 1 and 2
// Case 3 is handled by a recursive call.
for (int row = start_row; row <= last_row; row++) {
NearLabel next_test;
// Only the first row of this invocation is additionally checked for null;
// the remaining rows are compared against the receiver only.
bool test_for_null_also = (row == start_row);
// See if the receiver is receiver[n].
int recvr_offset = in_bytes(VirtualCallData::receiver_offset(row));
test_mdp_data_at(mdp, recvr_offset, receiver,
(test_for_null_also ? reg2 : noreg),
next_test);
// (Reg2 now contains the receiver from the CallData.)
// The receiver is receiver[n]. Increment count[n].
int count_offset = in_bytes(VirtualCallData::receiver_count_offset(row));
increment_mdp_data_at(mdp, count_offset);
z_bru(done);
bind(next_test);
if (test_for_null_also) {
Label found_null;
// Failed the equality check on receiver[n]... Test for null.
z_ltgr(reg2, reg2);
if (start_row == last_row) {
// The only thing left to do is handle the null case.
z_brz(found_null);
// Receiver did not match any saved receiver and there is no empty row for it.
// Increment total counter to indicate polymorphic case.
increment_mdp_data_at(mdp, in_bytes(CounterData::count_offset()));
z_bru(done);
bind(found_null);
// The last row is empty: leave the loop so the fill-in code below claims it.
break;
}
// Since null is rare, make it be the branch-taken case.
z_brz(found_null);
// Put all the "Case 3" tests here.
record_klass_in_profile_helper(receiver, mdp, reg2, start_row + 1, done);
// Found a null. Keep searching for a matching receiver,
// but remember that this is an empty (unused) slot.
bind(found_null);
}
}
// In the fall-through case, we found no matching receiver, but we
// observed the receiver[start_row] is null.
// Fill in the receiver field and increment the count.
int recvr_offset = in_bytes(VirtualCallData::receiver_offset(start_row));
set_mdp_data_at(mdp, recvr_offset, receiver);
int count_offset = in_bytes(VirtualCallData::receiver_count_offset(start_row));
// A freshly claimed row starts with a count of exactly one increment.
load_const_optimized(reg2, DataLayout::counter_increment);
set_mdp_data_at(mdp, count_offset, reg2);
// The outermost invocation (start_row == 0) emits no branch here and simply falls
// through. NOTE(review): this relies on the caller binding `done` directly after
// this code, as record_klass_in_profile does.
if (start_row > 0) {
z_bru(done);
}
}
// Example state machine code for three profile rows:
// // main copy of decision tree, rooted at row[0]
// if (row[0].rec == rec) { row[0].incr(); goto done; }
// if (row[0].rec != nullptr) {
// // inner copy of decision tree, rooted at row[1]
// if (row[1].rec == rec) { row[1].incr(); goto done; }
// if (row[1].rec != nullptr) {
// // degenerate decision tree, rooted at row[2]
// if (row[2].rec == rec) { row[2].incr(); goto done; }
// if (row[2].rec != nullptr) { count.incr(); goto done; } // overflow
// row[2].init(rec); goto done;
// } else {
// // remember row[1] is empty
// if (row[2].rec == rec) { row[2].incr(); goto done; }
// row[1].init(rec); goto done;
// }
// } else {
// // remember row[0] is empty
// if (row[1].rec == rec) { row[1].incr(); goto done; }
// if (row[2].rec == rec) { row[2].incr(); goto done; }
// row[0].init(rec); goto done;
// }
// done:
// Emit the interpreter's receiver-type profile update for the klass in `receiver`.
// Generates the row decision tree beginning at the first profile row and binds
// the exit label that the generated update paths branch to (or fall through to).
//   receiver - register holding the klass to record
//   mdp      - register holding the method data pointer
//   reg2     - temp register handed to the row helper
void InterpreterMacroAssembler::record_klass_in_profile(Register receiver,
                                                        Register mdp, Register reg2) {
  assert(ProfileInterpreter, "must be profiling");

  // The decision tree is always rooted at the first row.
  const int first_row = 0;
  Label profile_updated;

  record_klass_in_profile_helper(receiver, mdp, reg2, first_row, profile_updated);
  bind(profile_updated);
}
void InterpreterMacroAssembler::profile_ret(Register return_bci, Register mdp) {
if (ProfileInterpreter) {
NearLabel profile_continue;
@ -1462,7 +1343,7 @@ void InterpreterMacroAssembler::profile_typecheck(Register mdp, Register klass,
mdp_delta = in_bytes(VirtualCallData::virtual_call_data_size());
// Record the object type.
record_klass_in_profile(klass, mdp, reg2);
profile_receiver_type(klass, mdp, 0, reg2);
}
update_mdp_by_constant(mdp, mdp_delta);

View File

@ -280,12 +280,6 @@ class InterpreterMacroAssembler: public MacroAssembler {
Register test_value_out,
Label& not_equal_continue);
void record_klass_in_profile(Register receiver, Register mdp,
Register reg2);
void record_klass_in_profile_helper(Register receiver, Register mdp,
Register reg2, int start_row,
Label& done);
void update_mdp_by_offset(Register mdp_in, int offset_of_offset);
void update_mdp_by_offset(Register mdp_in, Register dataidx, int offset_of_disp);
void update_mdp_by_constant(Register mdp_in, int constant);

View File

@ -39,6 +39,7 @@
#include "oops/compressedKlass.inline.hpp"
#include "oops/compressedOops.inline.hpp"
#include "oops/klass.inline.hpp"
#include "oops/methodData.hpp"
#include "prims/methodHandles.hpp"
#include "registerSaver_s390.hpp"
#include "runtime/icache.hpp"
@ -6766,3 +6767,156 @@ void MacroAssembler::load_on_condition_imm_64(Register dst, int64_t i2, branch_c
bind(done);
}
}
// Handle the receiver type profile update given the "recv" klass.
//
// Normally updates the ReceiverData (RD) that starts at "mdp" + "mdp_offset".
// If there are no matching or claimable receiver entries in RD, updates
// the polymorphic counter.
//
// This code is expected to be run by either the interpreter or JIT-ed code, without
// extra synchronization. For safety, receiver cells are claimed atomically, which
// avoids grossly misrepresenting the profiles under concurrent updates. For speed,
// counter updates are not atomic.
//
// Emits the receiver type profile update.
//   recv       - register holding the receiver klass to record; not modified
//   mdp        - register holding the method data pointer; not modified
//   mdp_offset - byte offset added to every profile offset used relative to mdp
//   scratch    - temp; used as the running slot offset and ends up holding an
//                absolute address, so its incoming value is lost
// Z_R0_scratch is used as an additional temp and is overwritten as well.
void MacroAssembler::profile_receiver_type(Register recv, Register mdp, int mdp_offset, Register scratch) {
Register r0_tmp = Z_R0_scratch; // cannot be used in address calculation
assert_different_registers(recv, mdp, scratch, r0_tmp);
int base_receiver_offset = in_bytes(ReceiverTypeData::receiver_offset(0));
// Offset of the (non-existent) row one past the last one: the loop end marker.
int end_receiver_offset = in_bytes(ReceiverTypeData::receiver_offset(ReceiverTypeData::row_limit()));
int poly_count_offset = in_bytes(CounterData::count_offset());
// Distance between two consecutive receiver slots.
int receiver_step = in_bytes(ReceiverTypeData::receiver_offset(1)) - base_receiver_offset;
// Distance from a receiver slot to the count slot of the same row.
int receiver_to_count_step = in_bytes(ReceiverTypeData::receiver_count_offset(0)) - base_receiver_offset;
// Adjust for MDP offsets.
base_receiver_offset += mdp_offset;
end_receiver_offset += mdp_offset;
poly_count_offset += mdp_offset;
#ifdef ASSERT
// We are about to walk the MDO slots without asking for offsets.
// Check that our math hits all the right spots.
for (uint c = 0; c < ReceiverTypeData::row_limit(); c++) {
int real_recv_offset = mdp_offset + in_bytes(ReceiverTypeData::receiver_offset(c));
int real_count_offset = mdp_offset + in_bytes(ReceiverTypeData::receiver_count_offset(c));
int offset = base_receiver_offset + receiver_step*c;
int count_offset = offset + receiver_to_count_step;
assert(offset == real_recv_offset, "receiver slot math");
assert(count_offset == real_count_offset, "receiver count math");
}
int real_poly_count_offset = mdp_offset + in_bytes(CounterData::count_offset());
assert(poly_count_offset == real_poly_count_offset, "poly counter math");
#endif
// Corner case: no profile table. Increment poly counter and exit.
if (ReceiverTypeData::row_limit() == 0) {
add2mem_64(Address(mdp, poly_count_offset), DataLayout::counter_increment, scratch);
return;
}
NearLabel L_loop_search_receiver, L_loop_search_empty;
NearLabel L_restart, L_found_recv, L_found_empty, L_count_update;
// scratch doubles as the running byte offset relative to mdp; on the install and
// count-update paths mdp is added to it, turning it into an absolute address.
Register offset = scratch;
// The code here recognizes three major cases:
// A. Fastest: receiver found in the table
// B. Fast: no receiver in the table, and the table is full
// C. Slow: no receiver in the table, free slots in the table
//
// The case A performance is most important, as perfectly-behaved code would end up
// there, especially with larger TypeProfileWidth. The case B performance is
// important as well, this is where bulk of code would land for normally megamorphic
// cases. The case C performance is not essential, its job is to deal with installation
// races, we optimize for code density instead. Case C needs to make sure that receiver
// rows are only claimed once. This makes sure we never overwrite a row for another
// receiver and never duplicate the receivers in the list, making profile type-accurate.
//
// It is very tempting to handle these cases in a single loop, and claim the first slot
// without checking the rest of the table. But, profiling code should tolerate free slots
// in the table, as class unloading can clear them. After such cleanup, the receiver
// we need might be _after_ the free slot. Therefore, we need to let at least full scan
// to complete, before trying to install new slots. Splitting the code in several tight
// loops also helpfully optimizes for cases A and B.
//
// This code is effectively:
//
// restart:
// // Case A (fastest): receiver is already installed
// for (i = 0; i < receiver_count(); i++) {
// if (receiver(i) == recv) goto found_recv(i);
// }
//
// // Receiver not found: look for a free slot.
// // Falling out of this loop means the table is full (case B, fast).
// for (i = 0; i < receiver_count(); i++) {
// if (receiver(i) == null) goto found_null(i);
// }
// goto polymorphic
//
// // Case C (slow): try to install receiver
// found_null(i):
// CAS(&receiver(i), null, recv);
// goto restart
//
// polymorphic:
// count++;
// return
//
// found_recv(i):
// *receiver_count(i)++
//
bind(L_restart);
// Case A (fastest): receiver is already installed
load_const_optimized(offset, base_receiver_offset);
bind(L_loop_search_receiver);
z_cg(recv, Address(mdp, offset));
z_bre(L_found_recv);
add2reg(offset, receiver_step);
compare64_and_branch(offset, end_receiver_offset, bcondNotEqual, L_loop_search_receiver);
// Receiver is not in the table: look for a free slot.
// Falling out of this loop means the table is full.
load_const_optimized(offset, base_receiver_offset);
bind(L_loop_search_empty);
z_ltg(r0_tmp, Address(mdp, offset));
z_brz(L_found_empty);
add2reg(offset, receiver_step);
compare64_and_branch(offset, end_receiver_offset, bcondNotEqual, L_loop_search_empty);
// Case B (fast): receiver is not found and the table is full.
// Increment polymorphic counter instead of receiver slot.
load_const_optimized(offset, poly_count_offset);
z_bru(L_count_update);
// Case C (slow): try to install receiver
bind(L_found_empty);
{
// Atomically swing receiver slot: null -> recv.
// Use compare-and-swap to claim the slot.
Register receiver_addr = offset;
z_agr(receiver_addr, mdp); // receiver_addr = mdp + offset
// r0_tmp is used as expected value (0), recv is the new value
z_lghi(r0_tmp, 0);
z_csg(r0_tmp, recv, 0, receiver_addr);
}
// CAS success means the slot now has the receiver we want. CAS failure means
// something had claimed the slot concurrently: it can be the same receiver we want,
// or something else. Since this is a slow path, we can optimize for code density,
// and just restart the search from the beginning.
// The outcome of the compare-and-swap is therefore not inspected here; the
// restarted search reloads the offset register from scratch.
z_bru(L_restart);
// Found a receiver, convert its slot offset to corresponding count offset.
bind(L_found_recv);
add2reg(offset, receiver_to_count_step);
// Finally, update the counter
// At this point offset is either a row's count offset (from L_found_recv) or
// poly_count_offset (table full); adding mdp makes it the counter's address.
bind(L_count_update);
z_agr(offset, mdp);
add2mem_64(Address(offset), DataLayout::counter_increment, r0_tmp);
}

View File

@ -1111,6 +1111,8 @@ class MacroAssembler: public Assembler {
void load_on_condition_imm_32(Register dst, int64_t i2, branch_condition cc);
void load_on_condition_imm_64(Register dst, int64_t i2, branch_condition cc);
void profile_receiver_type(Register recv, Register mdp, int mdp_offset, Register tmp1);
};
#ifdef ASSERT