8385648: PPC64: Improve receiver type profiling reliability

Reviewed-by: shade, dbriemann
This commit is contained in:
Martin Doerr 2026-06-02 13:29:16 +00:00
parent 3f8a7213ef
commit 3efa011b96
6 changed files with 192 additions and 170 deletions

View File

@ -2226,39 +2226,12 @@ void LIR_Assembler::emit_alloc_array(LIR_OpAllocArray* op) {
}
// kills recv
void LIR_Assembler::type_profile_helper(Register mdo, int mdo_offset_bias,
ciMethodData *md, ciProfileData *data,
Register recv, Register tmp1, Label* update_done) {
uint i;
for (i = 0; i < VirtualCallData::row_limit(); i++) {
Label next_test;
// See if the receiver is receiver[n].
__ ld(tmp1, md->byte_offset_of_slot(data, ReceiverTypeData::receiver_offset(i)) - mdo_offset_bias, mdo);
__ verify_klass_ptr(tmp1);
__ cmpd(CR0, recv, tmp1);
__ bne(CR0, next_test);
__ ld(tmp1, md->byte_offset_of_slot(data, ReceiverTypeData::receiver_count_offset(i)) - mdo_offset_bias, mdo);
__ addi(tmp1, tmp1, DataLayout::counter_increment);
__ std(tmp1, md->byte_offset_of_slot(data, ReceiverTypeData::receiver_count_offset(i)) - mdo_offset_bias, mdo);
__ b(*update_done);
__ bind(next_test);
}
// Didn't find receiver; find next empty slot and fill it in.
for (i = 0; i < VirtualCallData::row_limit(); i++) {
Label next_test;
__ ld(tmp1, md->byte_offset_of_slot(data, ReceiverTypeData::receiver_offset(i)) - mdo_offset_bias, mdo);
__ cmpdi(CR0, tmp1, 0);
__ bne(CR0, next_test);
__ li(tmp1, DataLayout::counter_increment);
__ std(recv, md->byte_offset_of_slot(data, ReceiverTypeData::receiver_offset(i)) - mdo_offset_bias, mdo);
__ std(tmp1, md->byte_offset_of_slot(data, ReceiverTypeData::receiver_count_offset(i)) - mdo_offset_bias, mdo);
__ b(*update_done);
__ bind(next_test);
}
Register recv, Register tmp) {
int mdp_offset = md->byte_offset_of_slot(data, in_ByteSize(0)) - mdo_offset_bias;
__ profile_receiver_type(recv, mdo, mdp_offset, tmp, noreg);
}
@ -2320,15 +2293,9 @@ void LIR_Assembler::emit_typecheck_helper(LIR_OpTypeCheck *op, Label* success, L
__ b(*obj_is_null);
__ bind(not_null);
Label update_done;
Register recv = klass_RInfo;
__ load_klass(recv, obj);
type_profile_helper(mdo, mdo_offset_bias, md, data, recv, Rtmp1, &update_done);
const int slot_offset = md->byte_offset_of_slot(data, CounterData::count_offset()) - mdo_offset_bias;
__ ld(Rtmp1, slot_offset, mdo);
__ addi(Rtmp1, Rtmp1, DataLayout::counter_increment);
__ std(Rtmp1, slot_offset, mdo);
__ bind(update_done);
type_profile_helper(mdo, mdo_offset_bias, md, data, recv, Rtmp1); // kills recv
} else {
__ cmpdi(CR0, obj, 0);
__ beq(CR0, *obj_is_null);
@ -2427,15 +2394,9 @@ void LIR_Assembler::emit_opTypeCheck(LIR_OpTypeCheck* op) {
__ b(done);
__ bind(not_null);
Label update_done;
Register recv = klass_RInfo;
__ load_klass(recv, value);
type_profile_helper(mdo, mdo_offset_bias, md, data, recv, Rtmp1, &update_done);
const int slot_offset = md->byte_offset_of_slot(data, CounterData::count_offset()) - mdo_offset_bias;
__ ld(Rtmp1, slot_offset, mdo);
__ addi(Rtmp1, Rtmp1, DataLayout::counter_increment);
__ std(Rtmp1, slot_offset, mdo);
__ bind(update_done);
type_profile_helper(mdo, mdo_offset_bias, md, data, recv, Rtmp1); // kills recv
} else {
__ cmpdi(CR0, value, 0);
__ beq(CR0, done);
@ -2648,55 +2609,27 @@ void LIR_Assembler::emit_profile_call(LIR_OpProfileCall* op) {
// We know the type that will be seen at this call site; we can
// statically update the MethodData* rather than needing to do
// dynamic tests on the receiver type.
// NOTE: we should probably put a lock around this search to
// avoid collisions by concurrent compilations.
ciVirtualCallData* vc_data = (ciVirtualCallData*) data;
uint i;
for (i = 0; i < VirtualCallData::row_limit(); i++) {
for (uint i = 0; i < VirtualCallData::row_limit(); i++) {
ciKlass* receiver = vc_data->receiver(i);
if (known_klass->equals(receiver)) {
__ ld(tmp1, md->byte_offset_of_slot(data, VirtualCallData::receiver_count_offset(i)) - mdo_offset_bias, mdo);
__ addi(tmp1, tmp1, DataLayout::counter_increment);
__ std(tmp1, md->byte_offset_of_slot(data, VirtualCallData::receiver_count_offset(i)) - mdo_offset_bias, mdo);
__ increment_mem64(mdo, md->byte_offset_of_slot(data, VirtualCallData::receiver_count_offset(i)) - mdo_offset_bias,
DataLayout::counter_increment, tmp1);
return;
}
}
// Receiver type not found in profile data; select an empty slot.
// Note that this is less efficient than it should be because it
// always does a write to the receiver part of the
// VirtualCallData rather than just the first time.
for (i = 0; i < VirtualCallData::row_limit(); i++) {
ciKlass* receiver = vc_data->receiver(i);
if (receiver == nullptr) {
metadata2reg(known_klass->constant_encoding(), tmp1);
__ std(tmp1, md->byte_offset_of_slot(data, VirtualCallData::receiver_offset(i)) - mdo_offset_bias, mdo);
__ ld(tmp1, md->byte_offset_of_slot(data, VirtualCallData::receiver_count_offset(i)) - mdo_offset_bias, mdo);
__ addi(tmp1, tmp1, DataLayout::counter_increment);
__ std(tmp1, md->byte_offset_of_slot(data, VirtualCallData::receiver_count_offset(i)) - mdo_offset_bias, mdo);
return;
}
}
// Receiver type is not found in profile data.
// Fall back to runtime helper to handle the rest at runtime.
metadata2reg(known_klass->constant_encoding(), recv);
} else {
__ load_klass(recv, recv);
Label update_done;
type_profile_helper(mdo, mdo_offset_bias, md, data, recv, tmp1, &update_done);
// Receiver did not match any saved receiver and there is no empty row for it.
// Increment total counter to indicate polymorphic case.
__ ld(tmp1, md->byte_offset_of_slot(data, CounterData::count_offset()) - mdo_offset_bias, mdo);
__ addi(tmp1, tmp1, DataLayout::counter_increment);
__ std(tmp1, md->byte_offset_of_slot(data, CounterData::count_offset()) - mdo_offset_bias, mdo);
__ bind(update_done);
}
type_profile_helper(mdo, mdo_offset_bias, md, data, recv, tmp1); // kills recv
} else {
// Static call
__ ld(tmp1, md->byte_offset_of_slot(data, CounterData::count_offset()) - mdo_offset_bias, mdo);
__ addi(tmp1, tmp1, DataLayout::counter_increment);
__ std(tmp1, md->byte_offset_of_slot(data, CounterData::count_offset()) - mdo_offset_bias, mdo);
__ increment_mem64(mdo, md->byte_offset_of_slot(data, CounterData::count_offset()) - mdo_offset_bias,
DataLayout::counter_increment, tmp1);
}
}

View File

@ -52,7 +52,7 @@ friend class ArrayCopyStub;
// Record the type of the receiver in ReceiverTypeData.
void type_profile_helper(Register mdo, int mdo_offset_bias,
ciMethodData *md, ciProfileData *data,
Register recv, Register tmp1, Label* update_done);
Register recv, Register tmp);
// Setup pointers to MDO, MDO slot, also compute offset bias to access the slot.
void setup_md_access(ciMethod* method, int bci,
ciMethodData*& md, ciProfileData*& data, int& mdo_offset_bias);

View File

@ -1,6 +1,6 @@
/*
* Copyright (c) 2002, 2026, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2025 SAP SE. All rights reserved.
* Copyright (c) 2012, 2026 SAP SE. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -264,8 +264,6 @@ class InterpreterMacroAssembler: public MacroAssembler {
void profile_switch_default(Register scratch1, Register scratch2);
void profile_switch_case(Register index, Register scratch1,Register scratch2, Register scratch3);
void profile_null_seen(Register Rscratch1, Register Rscratch2);
void record_klass_in_profile(Register receiver, Register scratch1, Register scratch2);
void record_klass_in_profile_helper(Register receiver, Register scratch1, Register scratch2, int start_row, Label& done);
// Argument and return type profiling.
void profile_obj_type(Register obj, Register mdo_addr_base, RegisterOrConstant mdo_addr_offs, Register tmp, Register tmp2);

View File

@ -1348,7 +1348,7 @@ void InterpreterMacroAssembler::profile_virtual_call(Register Rreceiver,
test_method_data_pointer(profile_continue);
// Record the receiver type.
record_klass_in_profile(Rreceiver, Rscratch1, Rscratch2);
profile_receiver_type(Rreceiver, R28_mdx, 0, Rscratch1, Rscratch2);
// The method data pointer needs to be updated to reflect the new target.
update_mdp_by_constant(in_bytes(VirtualCallData::virtual_call_data_size()));
@ -1367,7 +1367,7 @@ void InterpreterMacroAssembler::profile_typecheck(Register Rklass, Register Rscr
mdp_delta = in_bytes(VirtualCallData::virtual_call_data_size());
// Record the object type.
record_klass_in_profile(Rklass, Rscratch1, Rscratch2);
profile_receiver_type(Rklass, R28_mdx, 0, Rscratch1, Rscratch2);
}
// The method data pointer needs to be updated.
@ -1481,88 +1481,6 @@ void InterpreterMacroAssembler::profile_null_seen(Register Rscratch1, Register R
}
}
// Entry point for recording a klass in the receiver rows of the current
// profile entry. Delegates the whole row search to
// record_klass_in_profile_helper, starting at the first row, and binds the
// common exit label that the helper branches to.
// Rreceiver and both scratch registers must be pairwise different.
void InterpreterMacroAssembler::record_klass_in_profile(Register Rreceiver,
                                                        Register Rscratch1, Register Rscratch2) {
  assert(ProfileInterpreter, "must be profiling");
  assert_different_registers(Rreceiver, Rscratch1, Rscratch2);

  // The search always begins with the first receiver row.
  const int first_row = 0;

  // Every path emitted by the helper that does not fall through ends here.
  Label L_recorded;
  record_klass_in_profile_helper(Rreceiver, Rscratch1, Rscratch2, first_row, L_recorded);
  bind(L_recorded);
}
// Emits the receiver-row search for rows [start_row, last row] of the current
// profile entry.
//
//   receiver  - register holding the klass to record
//   scratch1  - temp; after a failed equality test it holds the value that was
//               loaded from the tested row (see the null checks below)
//   scratch2  - temp for the counter increments
//   start_row - first row handled by this invocation; it is the only row that
//               this invocation tests for null and may fill in
//   done      - label bound by the caller; emitted paths that finish branch
//               there, except the final fall-through of the start_row == 0
//               invocation
//
// The function calls itself once per row, and every level also emits the
// equality tests for all rows after its own start_row, so the emitted code
// grows with the square of the row count.
void InterpreterMacroAssembler::record_klass_in_profile_helper(
Register receiver, Register scratch1, Register scratch2,
int start_row, Label& done) {
// No receiver rows configured: only the total counter can be updated.
if (TypeProfileWidth == 0) {
increment_mdp_data_at(in_bytes(CounterData::count_offset()), scratch1, scratch2);
return;
}
int last_row = VirtualCallData::row_limit() - 1;
assert(start_row <= last_row, "must be work left to do");
// Test this row for both the receiver and for null.
// Take any of three different outcomes:
// 1. found receiver => increment count and goto done
// 2. found null => keep looking for case 1, maybe allocate this cell
// 3. found something else => keep looking for cases 1 and 2
// Case 3 is handled by a recursive call.
for (int row = start_row; row <= last_row; row++) {
Label next_test;
// Only the first row of this invocation is checked for null; later rows
// are only compared against the receiver.
bool test_for_null_also = (row == start_row);
// See if the receiver is receiver[n].
int recvr_offset = in_bytes(VirtualCallData::receiver_offset(row));
test_mdp_data_at(recvr_offset, receiver, next_test, scratch1);
// delayed()->tst(scratch);
// NOTE(review): the line above is commented out, so nothing is emitted for
// it; it looks like a leftover from a delay-slot platform - confirm and drop.
// The receiver is receiver[n]. Increment count[n].
int count_offset = in_bytes(VirtualCallData::receiver_count_offset(row));
increment_mdp_data_at(count_offset, scratch1, scratch2);
b(done);
bind(next_test);
if (test_for_null_also) {
Label found_null;
// Failed the equality check on receiver[n]... Test for null.
if (start_row == last_row) {
// The only thing left to do is handle the null case.
// Scratch1 contains test_out from test_mdp_data_at.
cmpdi(CR0, scratch1, 0);
beq(CR0, found_null);
// Receiver did not match any saved receiver and there is no empty row for it.
// Increment total counter to indicate polymorphic case.
increment_mdp_data_at(in_bytes(CounterData::count_offset()), scratch1, scratch2);
b(done);
bind(found_null);
// Leave the loop: the code after it fills in this (last) row.
break;
}
// Since null is rare, make it be the branch-taken case.
cmpdi(CR0, scratch1, 0);
beq(CR0, found_null);
// Put all the "Case 3" tests here.
record_klass_in_profile_helper(receiver, scratch1, scratch2, start_row + 1, done);
// Found a null. Keep searching for a matching receiver,
// but remember that this is an empty (unused) slot.
bind(found_null);
}
}
// In the fall-through case, we found no matching receiver, but we
// observed the receiver[start_row] is null.
// Fill in the receiver field and increment the count.
int recvr_offset = in_bytes(VirtualCallData::receiver_offset(start_row));
set_mdp_data_at(recvr_offset, receiver);
int count_offset = in_bytes(VirtualCallData::receiver_count_offset(start_row));
// A freshly claimed row starts with a count of exactly one increment.
li(scratch1, DataLayout::counter_increment);
set_mdp_data_at(count_offset, scratch1);
// The outermost invocation (start_row == 0) simply falls through; nested
// invocations need an explicit branch to the exit label.
if (start_row > 0) {
b(done);
}
}
// Argument and return type profiling.
// kills: tmp, tmp2, R0, CR0, CR1

View File

@ -4329,6 +4329,173 @@ void MacroAssembler::multiply_to_len(Register x, Register xlen,
bind(L_done);
} // multiply_to_len
// Emits a non-atomic 64-bit read-modify-write of the memory cell addressed by
// base + ind_or_offs: load into tmp, add val, store back.
//
//   base        - register holding the base address
//   ind_or_offs - index register or constant displacement added to base
//   val         - increment, passed to addi as its immediate operand
//   tmp         - scratch register; holds the updated value afterwards
//
// The update is not atomic: concurrent increments of the same cell can be lost.
void MacroAssembler::increment_mem64(Register base, RegisterOrConstant ind_or_offs, int val, Register tmp) {
  // tmp is overwritten by the load before the store is emitted. If it aliased
  // the base or the index register, the store would use a corrupted address,
  // so reject such register assignments at code-generation time.
  assert_different_registers(base, tmp, ind_or_offs.register_or_noreg());

  ld(tmp, ind_or_offs, base);
  addi(tmp, tmp, val);
  std(tmp, ind_or_offs, base);
}
// Handle the receiver type profile update given the "recv" klass.
//
// Normally updates the ReceiverData (RD) that starts at "mdp" + "mdp_offset".
// If there are no matching or claimable receiver entries in RD, updates
// the polymorphic counter.
//
// This code is expected to be run by either the interpreter or JIT-ed code, without
// extra synchronization. For safety, receiver cells are claimed atomically, which
// avoids grossly misrepresenting the profiles under concurrent updates. For speed,
// counter updates are not atomic.
//
// Parameters:
//   recv       - klass to record; used as the temp for the final counter update
//                (and therefore killed) when tmp2 == noreg
//   mdp        - base register for all profile accesses
//   mdp_offset - constant offset from mdp to the start of the profile entry
//   tmp1       - temp, holds the running slot offset / slot address
//   tmp2       - temp caching the row count, or noreg (then R0 is reloaded
//                with the row count before each loop)
//
// Besides the temps, the emitted code writes R0, CR0 and CTR.
//
void MacroAssembler::profile_receiver_type(Register recv, Register mdp, int mdp_offset, Register tmp1, Register tmp2) {
assert_different_registers(recv, mdp, tmp1, tmp2);
// Layout constants, all relative to the start of the profile entry:
// first receiver slot, polymorphic counter, distance between consecutive
// receiver slots, and distance from a receiver slot to its counter.
int base_receiver_offset = in_bytes(ReceiverTypeData::receiver_offset(0));
int poly_count_offset = in_bytes(CounterData::count_offset());
int receiver_step = in_bytes(ReceiverTypeData::receiver_offset(1)) - base_receiver_offset;
int receiver_to_count_step = in_bytes(ReceiverTypeData::receiver_count_offset(0)) - base_receiver_offset;
// Adjust for MDP offsets.
base_receiver_offset += mdp_offset;
poly_count_offset += mdp_offset;
#ifdef ASSERT
// We are about to walk the MDO slots without asking for offsets.
// Check that our math hits all the right spots.
for (uint c = 0; c < ReceiverTypeData::row_limit(); c++) {
int real_recv_offset = mdp_offset + in_bytes(ReceiverTypeData::receiver_offset(c));
int real_count_offset = mdp_offset + in_bytes(ReceiverTypeData::receiver_count_offset(c));
int offset = base_receiver_offset + receiver_step*c;
int count_offset = offset + receiver_to_count_step;
assert(offset == real_recv_offset, "receiver slot math");
assert(count_offset == real_count_offset, "receiver count math");
}
int real_poly_count_offset = mdp_offset + in_bytes(CounterData::count_offset());
assert(poly_count_offset == real_poly_count_offset, "poly counter math");
#endif
// Corner case: no profile table. Increment poly counter and exit.
if (ReceiverTypeData::row_limit() == 0) {
increment_mem64(mdp, poly_count_offset, DataLayout::counter_increment, tmp1);
return;
}
Label L_loop_search_receiver, L_loop_search_empty;
Label L_restart, L_found_recv, L_found_empty, L_count_update;
// Register roles from here on: tmp1 is the slot offset, tmp2 (if given)
// caches the row count that is moved into CTR before each loop.
Register offset = tmp1, count = tmp2;
// The code here recognizes three major cases:
// A. Fastest: receiver found in the table
// B. Fast: no receiver in the table, and the table is full
// C. Slow: no receiver in the table, free slots in the table
//
// The case A performance is most important, as perfectly-behaved code would end up
// there, especially with larger TypeProfileWidth. The case B performance is
// important as well, this is where bulk of code would land for normally megamorphic
// cases. The case C performance is not essential, its job is to deal with installation
// races, we optimize for code density instead. Case C needs to make sure that receiver
// rows are only claimed once. This makes sure we never overwrite a row for another
// receiver and never duplicate the receivers in the list, making profile type-accurate.
//
// It is very tempting to handle these cases in a single loop, and claim the first slot
// without checking the rest of the table. But, profiling code should tolerate free slots
// in the table, as class unloading can clear them. After such cleanup, the receiver
// we need might be _after_ the free slot. Therefore, we need to let at least one full
// scan complete before trying to install new slots. Splitting the code in several tight
// loops also helpfully optimizes for cases A and B.
//
// This code is effectively:
//
// restart:
// // Fastest: receiver is already installed
// for (i = 0; i < receiver_count(); i++) {
// if (receiver(i) == recv) goto found_recv(i);
// }
//
// // Fast: no receiver, but profile is full
// for (i = 0; i < receiver_count(); i++) {
// if (receiver(i) == null) goto found_null(i);
// }
// goto polymorphic
//
// // Slow: try to install receiver
// found_null(i):
// CAS(&receiver(i), null, recv);
// goto restart
//
// polymorphic:
// count++;
// return
//
// found_recv(i):
// *receiver_count(i)++
//
// Load the row count once, outside the restart loop; nothing between
// L_restart and the branch back to it writes "count".
if (count != noreg) {
li(count, ReceiverTypeData::row_limit());
}
bind(L_restart);
// Fastest: receiver is already installed
// CTR drives the loop; without a second temp the row count goes through R0.
if (count != noreg) {
mtctr(count);
} else {
li(R0, ReceiverTypeData::row_limit());
mtctr(R0);
}
li(offset, base_receiver_offset);
bind(L_loop_search_receiver);
ldx(R0, offset, mdp);
cmpd(CR0, R0, recv);
// Branch is taken before "offset" is advanced, so at L_found_recv it still
// names the matching receiver slot.
beq(CR0, L_found_recv);
addi(offset, offset, receiver_step);
bdnz(L_loop_search_receiver);
// Fast: no receiver, but profile is full
if (count != noreg) {
mtctr(count);
} else {
li(R0, ReceiverTypeData::row_limit());
mtctr(R0);
}
li(offset, base_receiver_offset);
bind(L_loop_search_empty);
ldx(R0, offset, mdp);
cmpdi(CR0, R0, 0);
// As above: on exit to L_found_empty, "offset" names the empty slot.
beq(CR0, L_found_empty);
addi(offset, offset, receiver_step);
bdnz(L_loop_search_empty);
// Polymorphic: Increment polymorphic counter instead of receiver slot.
li(offset, poly_count_offset);
b(L_count_update);
// Slow: try to install receiver
bind(L_found_empty);
// Atomically swing receiver slot: null -> recv.
{
// The slot offset is turned into an address in place; this is fine because
// the restart path reloads "offset" from scratch.
Register receiver_addr = offset;
add(receiver_addr, mdp, offset); // kills offset
// The CAS result is deliberately ignored (see below); it is requested as a
// weak CAS with MemBarNone.
cmpxchgd(CR0, R0, RegisterOrConstant(0), recv, receiver_addr, MemBarNone, cmpxchgx_hint_atomic_update(),
noreg, nullptr, /* check without ldarx first */ false, /* weak */ true);
}
// CAS success means the slot now has the receiver we want. CAS failure means
// something had claimed the slot concurrently: it can be the same receiver we want,
// or something else. Since this is a slow path, we can optimize for code density,
// and just restart the search from the beginning.
b(L_restart);
// Found a receiver, convert its slot offset to corresponding count offset.
bind(L_found_recv);
addi(offset, offset, receiver_to_count_step);
// Counter update
// Shared by the "found receiver" and "polymorphic" paths: "offset" is the
// counter's offset from mdp. With no second temp, recv is sacrificed as the
// scratch register for the increment.
bind(L_count_update);
increment_mem64(mdp, offset, DataLayout::counter_increment, /* temp */ (count != noreg) ? count : recv);
}
#ifdef ASSERT
void MacroAssembler::asm_assert(AsmAssertCond cond, const char *msg) {
Label ok;

View File

@ -870,6 +870,12 @@ class MacroAssembler: public Assembler {
Register tmp6, Register tmp7, Register tmp8, Register tmp9, Register tmp10,
Register tmp11, Register tmp12, Register tmp13);
// non-atomic 64-bit memory increment by simm16
void increment_mem64(Register base, RegisterOrConstant ind_or_offs, int val, Register tmp);
// Bytecode profiling (tmp2 = noreg is allowed, but then recv is killed)
void profile_receiver_type(Register recv, Register mdp, int mdp_offset, Register tmp1, Register tmp2);
// Emitters for CRC32 calculation.
// A note on invertCRC:
// Unfortunately, internal representation of crc differs between CRC32 and CRC32C.