diff --git a/src/hotspot/cpu/ppc/c1_LIRAssembler_ppc.cpp b/src/hotspot/cpu/ppc/c1_LIRAssembler_ppc.cpp index a9f34b148c6..1270471d150 100644 --- a/src/hotspot/cpu/ppc/c1_LIRAssembler_ppc.cpp +++ b/src/hotspot/cpu/ppc/c1_LIRAssembler_ppc.cpp @@ -2226,39 +2226,12 @@ void LIR_Assembler::emit_alloc_array(LIR_OpAllocArray* op) { } +// kills recv void LIR_Assembler::type_profile_helper(Register mdo, int mdo_offset_bias, ciMethodData *md, ciProfileData *data, - Register recv, Register tmp1, Label* update_done) { - uint i; - for (i = 0; i < VirtualCallData::row_limit(); i++) { - Label next_test; - // See if the receiver is receiver[n]. - __ ld(tmp1, md->byte_offset_of_slot(data, ReceiverTypeData::receiver_offset(i)) - mdo_offset_bias, mdo); - __ verify_klass_ptr(tmp1); - __ cmpd(CR0, recv, tmp1); - __ bne(CR0, next_test); - - __ ld(tmp1, md->byte_offset_of_slot(data, ReceiverTypeData::receiver_count_offset(i)) - mdo_offset_bias, mdo); - __ addi(tmp1, tmp1, DataLayout::counter_increment); - __ std(tmp1, md->byte_offset_of_slot(data, ReceiverTypeData::receiver_count_offset(i)) - mdo_offset_bias, mdo); - __ b(*update_done); - - __ bind(next_test); - } - - // Didn't find receiver; find next empty slot and fill it in. 
- for (i = 0; i < VirtualCallData::row_limit(); i++) { - Label next_test; - __ ld(tmp1, md->byte_offset_of_slot(data, ReceiverTypeData::receiver_offset(i)) - mdo_offset_bias, mdo); - __ cmpdi(CR0, tmp1, 0); - __ bne(CR0, next_test); - __ li(tmp1, DataLayout::counter_increment); - __ std(recv, md->byte_offset_of_slot(data, ReceiverTypeData::receiver_offset(i)) - mdo_offset_bias, mdo); - __ std(tmp1, md->byte_offset_of_slot(data, ReceiverTypeData::receiver_count_offset(i)) - mdo_offset_bias, mdo); - __ b(*update_done); - - __ bind(next_test); - } + Register recv, Register tmp) { + int mdp_offset = md->byte_offset_of_slot(data, in_ByteSize(0)) - mdo_offset_bias; + __ profile_receiver_type(recv, mdo, mdp_offset, tmp, noreg); } @@ -2320,15 +2293,9 @@ void LIR_Assembler::emit_typecheck_helper(LIR_OpTypeCheck *op, Label* success, L __ b(*obj_is_null); __ bind(not_null); - Label update_done; Register recv = klass_RInfo; __ load_klass(recv, obj); - type_profile_helper(mdo, mdo_offset_bias, md, data, recv, Rtmp1, &update_done); - const int slot_offset = md->byte_offset_of_slot(data, CounterData::count_offset()) - mdo_offset_bias; - __ ld(Rtmp1, slot_offset, mdo); - __ addi(Rtmp1, Rtmp1, DataLayout::counter_increment); - __ std(Rtmp1, slot_offset, mdo); - __ bind(update_done); + type_profile_helper(mdo, mdo_offset_bias, md, data, recv, Rtmp1); // kills recv } else { __ cmpdi(CR0, obj, 0); __ beq(CR0, *obj_is_null); @@ -2427,15 +2394,9 @@ void LIR_Assembler::emit_opTypeCheck(LIR_OpTypeCheck* op) { __ b(done); __ bind(not_null); - Label update_done; Register recv = klass_RInfo; __ load_klass(recv, value); - type_profile_helper(mdo, mdo_offset_bias, md, data, recv, Rtmp1, &update_done); - const int slot_offset = md->byte_offset_of_slot(data, CounterData::count_offset()) - mdo_offset_bias; - __ ld(Rtmp1, slot_offset, mdo); - __ addi(Rtmp1, Rtmp1, DataLayout::counter_increment); - __ std(Rtmp1, slot_offset, mdo); - __ bind(update_done); + type_profile_helper(mdo, mdo_offset_bias, 
md, data, recv, Rtmp1); // kills recv } else { __ cmpdi(CR0, value, 0); __ beq(CR0, done); @@ -2648,55 +2609,27 @@ void LIR_Assembler::emit_profile_call(LIR_OpProfileCall* op) { // We know the type that will be seen at this call site; we can // statically update the MethodData* rather than needing to do // dynamic tests on the receiver type. - - // NOTE: we should probably put a lock around this search to - // avoid collisions by concurrent compilations. ciVirtualCallData* vc_data = (ciVirtualCallData*) data; - uint i; - for (i = 0; i < VirtualCallData::row_limit(); i++) { + for (uint i = 0; i < VirtualCallData::row_limit(); i++) { ciKlass* receiver = vc_data->receiver(i); if (known_klass->equals(receiver)) { - __ ld(tmp1, md->byte_offset_of_slot(data, VirtualCallData::receiver_count_offset(i)) - mdo_offset_bias, mdo); - __ addi(tmp1, tmp1, DataLayout::counter_increment); - __ std(tmp1, md->byte_offset_of_slot(data, VirtualCallData::receiver_count_offset(i)) - mdo_offset_bias, mdo); + __ increment_mem64(mdo, md->byte_offset_of_slot(data, VirtualCallData::receiver_count_offset(i)) - mdo_offset_bias, + DataLayout::counter_increment, tmp1); return; } } - // Receiver type not found in profile data; select an empty slot. - - // Note that this is less efficient than it should be because it - // always does a write to the receiver part of the - // VirtualCallData rather than just the first time. 
- for (i = 0; i < VirtualCallData::row_limit(); i++) { - ciKlass* receiver = vc_data->receiver(i); - if (receiver == nullptr) { - metadata2reg(known_klass->constant_encoding(), tmp1); - __ std(tmp1, md->byte_offset_of_slot(data, VirtualCallData::receiver_offset(i)) - mdo_offset_bias, mdo); - - __ ld(tmp1, md->byte_offset_of_slot(data, VirtualCallData::receiver_count_offset(i)) - mdo_offset_bias, mdo); - __ addi(tmp1, tmp1, DataLayout::counter_increment); - __ std(tmp1, md->byte_offset_of_slot(data, VirtualCallData::receiver_count_offset(i)) - mdo_offset_bias, mdo); - return; - } - } + // Receiver type is not found in profile data. + // Fall back to runtime helper to handle the rest at runtime. + metadata2reg(known_klass->constant_encoding(), recv); } else { __ load_klass(recv, recv); - Label update_done; - type_profile_helper(mdo, mdo_offset_bias, md, data, recv, tmp1, &update_done); - // Receiver did not match any saved receiver and there is no empty row for it. - // Increment total counter to indicate polymorphic case. 
- __ ld(tmp1, md->byte_offset_of_slot(data, CounterData::count_offset()) - mdo_offset_bias, mdo); - __ addi(tmp1, tmp1, DataLayout::counter_increment); - __ std(tmp1, md->byte_offset_of_slot(data, CounterData::count_offset()) - mdo_offset_bias, mdo); - - __ bind(update_done); } + type_profile_helper(mdo, mdo_offset_bias, md, data, recv, tmp1); // kills recv } else { // Static call - __ ld(tmp1, md->byte_offset_of_slot(data, CounterData::count_offset()) - mdo_offset_bias, mdo); - __ addi(tmp1, tmp1, DataLayout::counter_increment); - __ std(tmp1, md->byte_offset_of_slot(data, CounterData::count_offset()) - mdo_offset_bias, mdo); + __ increment_mem64(mdo, md->byte_offset_of_slot(data, CounterData::count_offset()) - mdo_offset_bias, + DataLayout::counter_increment, tmp1); } } diff --git a/src/hotspot/cpu/ppc/c1_LIRAssembler_ppc.hpp b/src/hotspot/cpu/ppc/c1_LIRAssembler_ppc.hpp index 7399a4544e6..5a065d364b2 100644 --- a/src/hotspot/cpu/ppc/c1_LIRAssembler_ppc.hpp +++ b/src/hotspot/cpu/ppc/c1_LIRAssembler_ppc.hpp @@ -52,7 +52,7 @@ friend class ArrayCopyStub; // Record the type of the receiver in ReceiverTypeData. void type_profile_helper(Register mdo, int mdo_offset_bias, ciMethodData *md, ciProfileData *data, - Register recv, Register tmp1, Label* update_done); + Register recv, Register tmp); // Setup pointers to MDO, MDO slot, also compute offset bias to access the slot. void setup_md_access(ciMethod* method, int bci, ciMethodData*& md, ciProfileData*& data, int& mdo_offset_bias); diff --git a/src/hotspot/cpu/ppc/interp_masm_ppc.hpp b/src/hotspot/cpu/ppc/interp_masm_ppc.hpp index 275ff92c699..45af9bfc252 100644 --- a/src/hotspot/cpu/ppc/interp_masm_ppc.hpp +++ b/src/hotspot/cpu/ppc/interp_masm_ppc.hpp @@ -1,6 +1,6 @@ /* * Copyright (c) 2002, 2026, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2025 SAP SE. All rights reserved. + * Copyright (c) 2012, 2026 SAP SE. All rights reserved. 
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -264,8 +264,6 @@ class InterpreterMacroAssembler: public MacroAssembler { void profile_switch_default(Register scratch1, Register scratch2); void profile_switch_case(Register index, Register scratch1,Register scratch2, Register scratch3); void profile_null_seen(Register Rscratch1, Register Rscratch2); - void record_klass_in_profile(Register receiver, Register scratch1, Register scratch2); - void record_klass_in_profile_helper(Register receiver, Register scratch1, Register scratch2, int start_row, Label& done); // Argument and return type profiling. void profile_obj_type(Register obj, Register mdo_addr_base, RegisterOrConstant mdo_addr_offs, Register tmp, Register tmp2); diff --git a/src/hotspot/cpu/ppc/interp_masm_ppc_64.cpp b/src/hotspot/cpu/ppc/interp_masm_ppc_64.cpp index a1798289b62..789f8da9574 100644 --- a/src/hotspot/cpu/ppc/interp_masm_ppc_64.cpp +++ b/src/hotspot/cpu/ppc/interp_masm_ppc_64.cpp @@ -1348,7 +1348,7 @@ void InterpreterMacroAssembler::profile_virtual_call(Register Rreceiver, test_method_data_pointer(profile_continue); // Record the receiver type. - record_klass_in_profile(Rreceiver, Rscratch1, Rscratch2); + profile_receiver_type(Rreceiver, R28_mdx, 0, Rscratch1, Rscratch2); // The method data pointer needs to be updated to reflect the new target. update_mdp_by_constant(in_bytes(VirtualCallData::virtual_call_data_size())); @@ -1367,7 +1367,7 @@ void InterpreterMacroAssembler::profile_typecheck(Register Rklass, Register Rscr mdp_delta = in_bytes(VirtualCallData::virtual_call_data_size()); // Record the object type. - record_klass_in_profile(Rklass, Rscratch1, Rscratch2); + profile_receiver_type(Rklass, R28_mdx, 0, Rscratch1, Rscratch2); } // The method data pointer needs to be updated. 
@@ -1481,88 +1481,6 @@ void InterpreterMacroAssembler::profile_null_seen(Register Rscratch1, Register R } } -void InterpreterMacroAssembler::record_klass_in_profile(Register Rreceiver, - Register Rscratch1, Register Rscratch2) { - assert(ProfileInterpreter, "must be profiling"); - assert_different_registers(Rreceiver, Rscratch1, Rscratch2); - - Label done; - record_klass_in_profile_helper(Rreceiver, Rscratch1, Rscratch2, 0, done); - bind (done); -} - -void InterpreterMacroAssembler::record_klass_in_profile_helper( - Register receiver, Register scratch1, Register scratch2, - int start_row, Label& done) { - if (TypeProfileWidth == 0) { - increment_mdp_data_at(in_bytes(CounterData::count_offset()), scratch1, scratch2); - return; - } - - int last_row = VirtualCallData::row_limit() - 1; - assert(start_row <= last_row, "must be work left to do"); - // Test this row for both the receiver and for null. - // Take any of three different outcomes: - // 1. found receiver => increment count and goto done - // 2. found null => keep looking for case 1, maybe allocate this cell - // 3. found something else => keep looking for cases 1 and 2 - // Case 3 is handled by a recursive call. - for (int row = start_row; row <= last_row; row++) { - Label next_test; - bool test_for_null_also = (row == start_row); - - // See if the receiver is receiver[n]. - int recvr_offset = in_bytes(VirtualCallData::receiver_offset(row)); - test_mdp_data_at(recvr_offset, receiver, next_test, scratch1); - // delayed()->tst(scratch); - - // The receiver is receiver[n]. Increment count[n]. - int count_offset = in_bytes(VirtualCallData::receiver_count_offset(row)); - increment_mdp_data_at(count_offset, scratch1, scratch2); - b(done); - bind(next_test); - - if (test_for_null_also) { - Label found_null; - // Failed the equality check on receiver[n]... Test for null. - if (start_row == last_row) { - // The only thing left to do is handle the null case. - // Scratch1 contains test_out from test_mdp_data_at. 
- cmpdi(CR0, scratch1, 0); - beq(CR0, found_null); - // Receiver did not match any saved receiver and there is no empty row for it. - // Increment total counter to indicate polymorphic case. - increment_mdp_data_at(in_bytes(CounterData::count_offset()), scratch1, scratch2); - b(done); - bind(found_null); - break; - } - // Since null is rare, make it be the branch-taken case. - cmpdi(CR0, scratch1, 0); - beq(CR0, found_null); - - // Put all the "Case 3" tests here. - record_klass_in_profile_helper(receiver, scratch1, scratch2, start_row + 1, done); - - // Found a null. Keep searching for a matching receiver, - // but remember that this is an empty (unused) slot. - bind(found_null); - } - } - - // In the fall-through case, we found no matching receiver, but we - // observed the receiver[start_row] is null. - - // Fill in the receiver field and increment the count. - int recvr_offset = in_bytes(VirtualCallData::receiver_offset(start_row)); - set_mdp_data_at(recvr_offset, receiver); - int count_offset = in_bytes(VirtualCallData::receiver_count_offset(start_row)); - li(scratch1, DataLayout::counter_increment); - set_mdp_data_at(count_offset, scratch1); - if (start_row > 0) { - b(done); - } -} // Argument and return type profilig. // kills: tmp, tmp2, R0, CR0, CR1 diff --git a/src/hotspot/cpu/ppc/macroAssembler_ppc.cpp b/src/hotspot/cpu/ppc/macroAssembler_ppc.cpp index 95d58d470c8..0d6c272decb 100644 --- a/src/hotspot/cpu/ppc/macroAssembler_ppc.cpp +++ b/src/hotspot/cpu/ppc/macroAssembler_ppc.cpp @@ -4329,6 +4329,173 @@ void MacroAssembler::multiply_to_len(Register x, Register xlen, bind(L_done); } // multiply_to_len +void MacroAssembler::increment_mem64(Register base, RegisterOrConstant ind_or_offs, int val, Register tmp) { + ld(tmp, ind_or_offs, base); + addi(tmp, tmp, val); + std(tmp, ind_or_offs, base); +} + +// Handle the receiver type profile update given the "recv" klass. +// +// Normally updates the ReceiverData (RD) that starts at "mdp" + "mdp_offset". 
+// If there are no matching or claimable receiver entries in RD, updates +// the polymorphic counter. +// +// This code is expected to be run by either the interpreter or JIT-ed code, without +// extra synchronization. For safety, receiver cells are claimed atomically, which +// avoids grossly misrepresenting the profiles under concurrent updates. For speed, +// counter updates are not atomic. +// +void MacroAssembler::profile_receiver_type(Register recv, Register mdp, int mdp_offset, Register tmp1, Register tmp2) { + assert_different_registers(recv, mdp, tmp1, tmp2); + + int base_receiver_offset = in_bytes(ReceiverTypeData::receiver_offset(0)); + int poly_count_offset = in_bytes(CounterData::count_offset()); + int receiver_step = in_bytes(ReceiverTypeData::receiver_offset(1)) - base_receiver_offset; + int receiver_to_count_step = in_bytes(ReceiverTypeData::receiver_count_offset(0)) - base_receiver_offset; + + // Adjust for MDP offsets. + base_receiver_offset += mdp_offset; + poly_count_offset += mdp_offset; + +#ifdef ASSERT + // We are about to walk the MDO slots without asking for offsets. + // Check that our math hits all the right spots. + for (uint c = 0; c < ReceiverTypeData::row_limit(); c++) { + int real_recv_offset = mdp_offset + in_bytes(ReceiverTypeData::receiver_offset(c)); + int real_count_offset = mdp_offset + in_bytes(ReceiverTypeData::receiver_count_offset(c)); + int offset = base_receiver_offset + receiver_step*c; + int count_offset = offset + receiver_to_count_step; + assert(offset == real_recv_offset, "receiver slot math"); + assert(count_offset == real_count_offset, "receiver count math"); + } + int real_poly_count_offset = mdp_offset + in_bytes(CounterData::count_offset()); + assert(poly_count_offset == real_poly_count_offset, "poly counter math"); +#endif + + // Corner case: no profile table. Increment poly counter and exit. 
+ if (ReceiverTypeData::row_limit() == 0) { + increment_mem64(mdp, poly_count_offset, DataLayout::counter_increment, tmp1); + return; + } + + Label L_loop_search_receiver, L_loop_search_empty; + Label L_restart, L_found_recv, L_found_empty, L_count_update; + Register offset = tmp1, count = tmp2; + + // The code here recognizes three major cases: + // A. Fastest: receiver found in the table + // B. Fast: no receiver in the table, and the table is full + // C. Slow: no receiver in the table, free slots in the table + // + // The case A performance is most important, as perfectly-behaved code would end up + // there, especially with larger TypeProfileWidth. The case B performance is + // important as well, this is where the bulk of code would land for normally megamorphic + // cases. The case C performance is not essential, its job is to deal with installation + // races, we optimize for code density instead. Case C needs to make sure that receiver + // rows are only claimed once. This makes sure we never overwrite a row for another + // receiver and never duplicate the receivers in the list, making the profile type-accurate. + // + // It is very tempting to handle these cases in a single loop, and claim the first slot + // without checking the rest of the table. But, profiling code should tolerate free slots + // in the table, as class unloading can clear them. After such cleanup, the receiver + // we need might be _after_ the free slot. Therefore, we need to let at least one full scan + // complete before trying to install new slots. Splitting the code in several tight + // loops also helpfully optimizes for cases A and B. 
+ // + // This code is effectively: + // + // restart: + // // Fastest: receiver is already installed + // for (i = 0; i < receiver_count(); i++) { + // if (receiver(i) == recv) goto found_recv(i); + // } + // + // // Fast: no receiver, but profile is full + // for (i = 0; i < receiver_count(); i++) { + // if (receiver(i) == null) goto found_null(i); + // } + // goto polymorphic + // + // // Slow: try to install receiver + // found_null(i): + // CAS(&receiver(i), null, recv); + // goto restart + // + // polymorphic: + // count++; + // return + // + // found_recv(i): + // *receiver_count(i)++ + // + + if (count != noreg) { + li(count, ReceiverTypeData::row_limit()); + } + + bind(L_restart); + + // Fastest: receiver is already installed + if (count != noreg) { + mtctr(count); + } else { + li(R0, ReceiverTypeData::row_limit()); + mtctr(R0); + } + li(offset, base_receiver_offset); + bind(L_loop_search_receiver); + ldx(R0, offset, mdp); + cmpd(CR0, R0, recv); + beq(CR0, L_found_recv); + addi(offset, offset, receiver_step); + bdnz(L_loop_search_receiver); + + // Fast: no receiver, but profile is full + if (count != noreg) { + mtctr(count); + } else { + li(R0, ReceiverTypeData::row_limit()); + mtctr(R0); + } + li(offset, base_receiver_offset); + bind(L_loop_search_empty); + ldx(R0, offset, mdp); + cmpdi(CR0, R0, 0); + beq(CR0, L_found_empty); + addi(offset, offset, receiver_step); + bdnz(L_loop_search_empty); + + // Polymorphic: Increment polymorphic counter instead of receiver slot. + li(offset, poly_count_offset); + b(L_count_update); + + // Slow: try to install receiver + bind(L_found_empty); + + // Atomically swing receiver slot: null -> recv. 
+ { + Register receiver_addr = offset; + add(receiver_addr, mdp, offset); // kills offset + cmpxchgd(CR0, R0, RegisterOrConstant(0), recv, receiver_addr, MemBarNone, cmpxchgx_hint_atomic_update(), + noreg, nullptr, /* check without ldarx first */ false, /* weak */ true); + } + + // CAS success means the slot now has the receiver we want. CAS failure means + // something had claimed the slot concurrently: it can be the same receiver we want, + // or something else. Since this is a slow path, we can optimize for code density, + // and just restart the search from the beginning. + b(L_restart); + + // Found a receiver, convert its slot offset to corresponding count offset. + bind(L_found_recv); + addi(offset, offset, receiver_to_count_step); + + // Counter update + bind(L_count_update); + increment_mem64(mdp, offset, DataLayout::counter_increment, /* temp */ (count != noreg) ? count : recv); +} + #ifdef ASSERT void MacroAssembler::asm_assert(AsmAssertCond cond, const char *msg) { Label ok; diff --git a/src/hotspot/cpu/ppc/macroAssembler_ppc.hpp b/src/hotspot/cpu/ppc/macroAssembler_ppc.hpp index 21ab192373f..bbfa75f5151 100644 --- a/src/hotspot/cpu/ppc/macroAssembler_ppc.hpp +++ b/src/hotspot/cpu/ppc/macroAssembler_ppc.hpp @@ -870,6 +870,12 @@ class MacroAssembler: public Assembler { Register tmp6, Register tmp7, Register tmp8, Register tmp9, Register tmp10, Register tmp11, Register tmp12, Register tmp13); + // non-atomic 64-bit memory increment by simm16 + void increment_mem64(Register base, RegisterOrConstant ind_or_offs, int val, Register tmp); + + // Bytecode profiling (tmp2 = noreg is allowed, but then recv is killed) + void profile_receiver_type(Register recv, Register mdp, int mdp_offset, Register tmp1, Register tmp2); + // Emitters for CRC32 calculation. // A note on invertCRC: // Unfortunately, internal representation of crc differs between CRC32 and CRC32C.