diff --git a/src/hotspot/cpu/aarch64/c1_LIRAssembler_aarch64.cpp b/src/hotspot/cpu/aarch64/c1_LIRAssembler_aarch64.cpp index 37a6a130e0d..c0621cbd5c2 100644 --- a/src/hotspot/cpu/aarch64/c1_LIRAssembler_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/c1_LIRAssembler_aarch64.cpp @@ -1218,43 +1218,11 @@ void LIR_Assembler::emit_alloc_array(LIR_OpAllocArray* op) { __ bind(*op->stub()->continuation()); } -void LIR_Assembler::type_profile_helper(Register mdo, - ciMethodData *md, ciProfileData *data, - Register recv, Label* update_done) { +void LIR_Assembler::type_profile_helper(Register mdo, ciMethodData *md, + ciProfileData *data, Register recv) { - // Given a profile data offset, generate an Address which points to - // the corresponding slot in mdo->data(). - // Clobbers rscratch2. - auto slot_at = [=](ByteSize offset) -> Address { - return __ form_address(rscratch2, mdo, - md->byte_offset_of_slot(data, offset), - LogBytesPerWord); - }; - - for (uint i = 0; i < ReceiverTypeData::row_limit(); i++) { - Label next_test; - // See if the receiver is receiver[n]. - __ ldr(rscratch1, slot_at(ReceiverTypeData::receiver_offset(i))); - __ cmp(recv, rscratch1); - __ br(Assembler::NE, next_test); - __ addptr(slot_at(ReceiverTypeData::receiver_count_offset(i)), - DataLayout::counter_increment); - __ b(*update_done); - __ bind(next_test); - } - - // Didn't find receiver; find next empty slot and fill it in - for (uint i = 0; i < ReceiverTypeData::row_limit(); i++) { - Label next_test; - Address recv_addr(slot_at(ReceiverTypeData::receiver_offset(i))); - __ ldr(rscratch1, recv_addr); - __ cbnz(rscratch1, next_test); - __ str(recv, recv_addr); - __ mov(rscratch1, DataLayout::counter_increment); - __ str(rscratch1, slot_at(ReceiverTypeData::receiver_count_offset(i))); - __ b(*update_done); - __ bind(next_test); - } + int mdp_offset = md->byte_offset_of_slot(data, in_ByteSize(0)); + __ profile_receiver_type(recv, mdo, mdp_offset); } void LIR_Assembler::emit_typecheck_helper(LIR_OpTypeCheck *op, Label* success, Label* failure, Label* obj_is_null) { @@ -1316,14 +1284,9 @@ void LIR_Assembler::emit_typecheck_helper(LIR_OpTypeCheck *op, Label* success, L __ b(*obj_is_null); __ bind(not_null); - Label update_done; Register recv = k_RInfo; __ load_klass(recv, obj); - type_profile_helper(mdo, md, data, recv, &update_done); - Address counter_addr(mdo, md->byte_offset_of_slot(data, CounterData::count_offset())); - __ addptr(counter_addr, DataLayout::counter_increment); - - __ bind(update_done); + type_profile_helper(mdo, md, data, recv); } else { __ cbz(obj, *obj_is_null); } @@ -1430,13 +1393,9 @@ void LIR_Assembler::emit_opTypeCheck(LIR_OpTypeCheck* op) { __ b(done); __ bind(not_null); - Label update_done; Register recv = k_RInfo; __ load_klass(recv, value); - type_profile_helper(mdo, md, data, recv, &update_done); - Address counter_addr(mdo, md->byte_offset_of_slot(data, CounterData::count_offset())); - __ addptr(counter_addr, DataLayout::counter_increment); - __ bind(update_done); + type_profile_helper(mdo, md, data, recv); } else { __ cbz(value, done); } @@ -2540,13 +2499,9 @@ void LIR_Assembler::emit_profile_call(LIR_OpProfileCall* op) { if (C1OptimizeVirtualCallProfiling && known_klass != nullptr) { // We know the type that will be seen at this call site; we can // statically update the MethodData* rather than needing to do - // dynamic tests on the receiver type - - // NOTE: we should probably put a lock around this search to - // avoid collisions by concurrent compilations + // dynamic tests on the receiver type. ciVirtualCallData* vc_data = (ciVirtualCallData*) data; - uint i; - for (i = 0; i < VirtualCallData::row_limit(); i++) { + for (uint i = 0; i < VirtualCallData::row_limit(); i++) { ciKlass* receiver = vc_data->receiver(i); if (known_klass->equals(receiver)) { Address data_addr(mdo, md->byte_offset_of_slot(data, VirtualCallData::receiver_count_offset(i))); @@ -2554,36 +2509,13 @@ void LIR_Assembler::emit_profile_call(LIR_OpProfileCall* op) { return; } } - - // Receiver type not found in profile data; select an empty slot - - // Note that this is less efficient than it should be because it - // always does a write to the receiver part of the - // VirtualCallData rather than just the first time - for (i = 0; i < VirtualCallData::row_limit(); i++) { - ciKlass* receiver = vc_data->receiver(i); - if (receiver == nullptr) { - __ mov_metadata(rscratch1, known_klass->constant_encoding()); - Address recv_addr = - __ form_address(rscratch2, mdo, - md->byte_offset_of_slot(data, VirtualCallData::receiver_offset(i)), - LogBytesPerWord); - __ str(rscratch1, recv_addr); - Address data_addr(mdo, md->byte_offset_of_slot(data, VirtualCallData::receiver_count_offset(i))); - __ addptr(data_addr, DataLayout::counter_increment); - return; - } - } + // Receiver type is not found in profile data. + // Fall back to runtime helper to handle the rest at runtime. + __ mov_metadata(recv, known_klass->constant_encoding()); } else { __ load_klass(recv, recv); - Label update_done; - type_profile_helper(mdo, md, data, recv, &update_done); - // Receiver did not match any saved receiver and there is no empty row for it. - // Increment total counter to indicate polymorphic case. - __ addptr(counter_addr, DataLayout::counter_increment); - - __ bind(update_done); } + type_profile_helper(mdo, md, data, recv); } else { // Static call __ addptr(counter_addr, DataLayout::counter_increment); diff --git a/src/hotspot/cpu/aarch64/c1_LIRAssembler_aarch64.hpp b/src/hotspot/cpu/aarch64/c1_LIRAssembler_aarch64.hpp index 21916a5f7dd..5af06fc6a1c 100644 --- a/src/hotspot/cpu/aarch64/c1_LIRAssembler_aarch64.hpp +++ b/src/hotspot/cpu/aarch64/c1_LIRAssembler_aarch64.hpp @@ -50,9 +50,8 @@ friend class ArrayCopyStub; Address stack_slot_address(int index, uint shift, Register tmp, int adjust = 0); // Record the type of the receiver in ReceiverTypeData - void type_profile_helper(Register mdo, - ciMethodData *md, ciProfileData *data, - Register recv, Label* update_done); + void type_profile_helper(Register mdo, ciMethodData *md, + ciProfileData *data, Register recv); void add_debug_info_for_branch(address adr, CodeEmitInfo* info); void casw(Register addr, Register newval, Register cmpval); diff --git a/src/hotspot/cpu/aarch64/interp_masm_aarch64.cpp b/src/hotspot/cpu/aarch64/interp_masm_aarch64.cpp index 957c2aee1c1..2b506b241e0 100644 --- a/src/hotspot/cpu/aarch64/interp_masm_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/interp_masm_aarch64.cpp @@ -240,15 +240,14 @@ void InterpreterMacroAssembler::load_resolved_klass_at_offset( // Rsub_klass: subklass // // Kills: -// r2, r5 +// r2 void InterpreterMacroAssembler::gen_subtype_check(Register Rsub_klass, Label& ok_is_subtype) { assert(Rsub_klass != r0, "r0 holds superklass"); assert(Rsub_klass != r2, "r2 holds 2ndary super array length"); - assert(Rsub_klass != r5, "r5 holds 2ndary super array scan ptr"); // Profile the not-null value's klass. - profile_typecheck(r2, Rsub_klass, r5); // blows r2, reloads r5 + profile_typecheck(r2, Rsub_klass); // blows r2 // Do the check. check_klass_subtype(Rsub_klass, r0, r2, ok_is_subtype); // blows r2 @@ -991,7 +990,6 @@ void InterpreterMacroAssembler::profile_final_call(Register mdp) { void InterpreterMacroAssembler::profile_virtual_call(Register receiver, Register mdp, - Register reg2, bool receiver_can_be_null) { if (ProfileInterpreter) { Label profile_continue; @@ -1009,7 +1007,7 @@ void InterpreterMacroAssembler::profile_virtual_call(Register receiver, } // Record the receiver type. - record_klass_in_profile(receiver, mdp, reg2); + profile_receiver_type(receiver, mdp, 0); bind(skip_receiver_profile); // The method data pointer needs to be updated to reflect the new target. @@ -1018,131 +1016,6 @@ void InterpreterMacroAssembler::profile_virtual_call(Register receiver, } } -// This routine creates a state machine for updating the multi-row -// type profile at a virtual call site (or other type-sensitive bytecode). -// The machine visits each row (of receiver/count) until the receiver type -// is found, or until it runs out of rows. At the same time, it remembers -// the location of the first empty row. (An empty row records null for its -// receiver, and can be allocated for a newly-observed receiver type.) -// Because there are two degrees of freedom in the state, a simple linear -// search will not work; it must be a decision tree. Hence this helper -// function is recursive, to generate the required tree structured code. -// It's the interpreter, so we are trading off code space for speed. -// See below for example code. -void InterpreterMacroAssembler::record_klass_in_profile_helper( - Register receiver, Register mdp, - Register reg2, int start_row, - Label& done) { - if (TypeProfileWidth == 0) { - increment_mdp_data_at(mdp, in_bytes(CounterData::count_offset())); - } else { - record_item_in_profile_helper(receiver, mdp, reg2, 0, done, TypeProfileWidth, - &VirtualCallData::receiver_offset, &VirtualCallData::receiver_count_offset); - } -} - -void InterpreterMacroAssembler::record_item_in_profile_helper(Register item, Register mdp, - Register reg2, int start_row, Label& done, int total_rows, - OffsetFunction item_offset_fn, OffsetFunction item_count_offset_fn) { - int last_row = total_rows - 1; - assert(start_row <= last_row, "must be work left to do"); - // Test this row for both the item and for null. - // Take any of three different outcomes: - // 1. found item => increment count and goto done - // 2. found null => keep looking for case 1, maybe allocate this cell - // 3. found something else => keep looking for cases 1 and 2 - // Case 3 is handled by a recursive call. - for (int row = start_row; row <= last_row; row++) { - Label next_test; - bool test_for_null_also = (row == start_row); - - // See if the item is item[n]. - int item_offset = in_bytes(item_offset_fn(row)); - test_mdp_data_at(mdp, item_offset, item, - (test_for_null_also ? reg2 : noreg), - next_test); - // (Reg2 now contains the item from the CallData.) - - // The item is item[n]. Increment count[n]. - int count_offset = in_bytes(item_count_offset_fn(row)); - increment_mdp_data_at(mdp, count_offset); - b(done); - bind(next_test); - - if (test_for_null_also) { - Label found_null; - // Failed the equality check on item[n]... Test for null. - if (start_row == last_row) { - // The only thing left to do is handle the null case. - cbz(reg2, found_null); - // Item did not match any saved item and there is no empty row for it. - // Increment total counter to indicate polymorphic case. - increment_mdp_data_at(mdp, in_bytes(CounterData::count_offset())); - b(done); - bind(found_null); - break; - } - // Since null is rare, make it be the branch-taken case. - cbz(reg2, found_null); - - // Put all the "Case 3" tests here. - record_item_in_profile_helper(item, mdp, reg2, start_row + 1, done, total_rows, - item_offset_fn, item_count_offset_fn); - - // Found a null. Keep searching for a matching item, - // but remember that this is an empty (unused) slot. - bind(found_null); - } - } - - // In the fall-through case, we found no matching item, but we - // observed the item[start_row] is null. - - // Fill in the item field and increment the count. - int item_offset = in_bytes(item_offset_fn(start_row)); - set_mdp_data_at(mdp, item_offset, item); - int count_offset = in_bytes(item_count_offset_fn(start_row)); - mov(reg2, DataLayout::counter_increment); - set_mdp_data_at(mdp, count_offset, reg2); - if (start_row > 0) { - b(done); - } -} - -// Example state machine code for three profile rows: -// // main copy of decision tree, rooted at row[1] -// if (row[0].rec == rec) { row[0].incr(); goto done; } -// if (row[0].rec != nullptr) { -// // inner copy of decision tree, rooted at row[1] -// if (row[1].rec == rec) { row[1].incr(); goto done; } -// if (row[1].rec != nullptr) { -// // degenerate decision tree, rooted at row[2] -// if (row[2].rec == rec) { row[2].incr(); goto done; } -// if (row[2].rec != nullptr) { count.incr(); goto done; } // overflow -// row[2].init(rec); goto done; -// } else { -// // remember row[1] is empty -// if (row[2].rec == rec) { row[2].incr(); goto done; } -// row[1].init(rec); goto done; -// } -// } else { -// // remember row[0] is empty -// if (row[1].rec == rec) { row[1].incr(); goto done; } -// if (row[2].rec == rec) { row[2].incr(); goto done; } -// row[0].init(rec); goto done; -// } -// done: - -void InterpreterMacroAssembler::record_klass_in_profile(Register receiver, - Register mdp, Register reg2) { - assert(ProfileInterpreter, "must be profiling"); - Label done; - - record_klass_in_profile_helper(receiver, mdp, reg2, 0, done); - - bind (done); -} - void InterpreterMacroAssembler::profile_ret(Register return_bci, Register mdp) { if (ProfileInterpreter) { @@ -1200,7 +1073,7 @@ void InterpreterMacroAssembler::profile_null_seen(Register mdp) { } } -void InterpreterMacroAssembler::profile_typecheck(Register mdp, Register klass, Register reg2) { +void InterpreterMacroAssembler::profile_typecheck(Register mdp, Register klass) { if (ProfileInterpreter) { Label profile_continue; @@ -1213,7 +1086,7 @@ void InterpreterMacroAssembler::profile_typecheck(Register mdp, Register klass, mdp_delta = in_bytes(VirtualCallData::virtual_call_data_size()); // Record the object type. - record_klass_in_profile(klass, mdp, reg2); + profile_receiver_type(klass, mdp, 0); } update_mdp_by_constant(mdp, mdp_delta); diff --git a/src/hotspot/cpu/aarch64/interp_masm_aarch64.hpp b/src/hotspot/cpu/aarch64/interp_masm_aarch64.hpp index 2b230a3b73e..74d4430000d 100644 --- a/src/hotspot/cpu/aarch64/interp_masm_aarch64.hpp +++ b/src/hotspot/cpu/aarch64/interp_masm_aarch64.hpp @@ -273,15 +273,6 @@ class InterpreterMacroAssembler: public MacroAssembler { Register test_value_out, Label& not_equal_continue); - void record_klass_in_profile(Register receiver, Register mdp, - Register reg2); - void record_klass_in_profile_helper(Register receiver, Register mdp, - Register reg2, int start_row, - Label& done); - void record_item_in_profile_helper(Register item, Register mdp, - Register reg2, int start_row, Label& done, int total_rows, - OffsetFunction item_offset_fn, OffsetFunction item_count_offset_fn); - void update_mdp_by_offset(Register mdp_in, int offset_of_offset); void update_mdp_by_offset(Register mdp_in, Register reg, int offset_of_disp); void update_mdp_by_constant(Register mdp_in, int constant); @@ -295,11 +286,10 @@ class InterpreterMacroAssembler: public MacroAssembler { void profile_call(Register mdp); void profile_final_call(Register mdp); void profile_virtual_call(Register receiver, Register mdp, - Register scratch2, bool receiver_can_be_null = false); void profile_ret(Register return_bci, Register mdp); void profile_null_seen(Register mdp); - void profile_typecheck(Register mdp, Register klass, Register scratch); + void profile_typecheck(Register mdp, Register klass); void profile_typecheck_failed(Register mdp); void profile_switch_default(Register mdp); void profile_switch_case(Register index_in_scratch, Register mdp, diff --git a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp index e813a70372b..f8b5a6f825c 100644 --- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp @@ -2118,6 +2118,161 @@ Address MacroAssembler::argument_address(RegisterOrConstant arg_slot, } } +// Handle the receiver type profile update given the "recv" klass. +// +// Normally updates the ReceiverData (RD) that starts at "mdp" + "mdp_offset". +// If there are no matching or claimable receiver entries in RD, updates +// the polymorphic counter. +// +// This code expected to run by either the interpreter or JIT-ed code, without +// extra synchronization. For safety, receiver cells are claimed atomically, which +// avoids grossly misrepresenting the profiles under concurrent updates. For speed, +// counter updates are not atomic. +// +void MacroAssembler::profile_receiver_type(Register recv, Register mdp, int mdp_offset) { + assert_different_registers(recv, mdp, rscratch1, rscratch2); + + int base_receiver_offset = in_bytes(ReceiverTypeData::receiver_offset(0)); + int end_receiver_offset = in_bytes(ReceiverTypeData::receiver_offset(ReceiverTypeData::row_limit())); + int poly_count_offset = in_bytes(CounterData::count_offset()); + int receiver_step = in_bytes(ReceiverTypeData::receiver_offset(1)) - base_receiver_offset; + int receiver_to_count_step = in_bytes(ReceiverTypeData::receiver_count_offset(0)) - base_receiver_offset; + + // Adjust for MDP offsets. + base_receiver_offset += mdp_offset; + end_receiver_offset += mdp_offset; + poly_count_offset += mdp_offset; + +#ifdef ASSERT + // We are about to walk the MDO slots without asking for offsets. + // Check that our math hits all the right spots. + for (uint c = 0; c < ReceiverTypeData::row_limit(); c++) { + int real_recv_offset = mdp_offset + in_bytes(ReceiverTypeData::receiver_offset(c)); + int real_count_offset = mdp_offset + in_bytes(ReceiverTypeData::receiver_count_offset(c)); + int offset = base_receiver_offset + receiver_step*c; + int count_offset = offset + receiver_to_count_step; + assert(offset == real_recv_offset, "receiver slot math"); + assert(count_offset == real_count_offset, "receiver count math"); + } + int real_poly_count_offset = mdp_offset + in_bytes(CounterData::count_offset()); + assert(poly_count_offset == real_poly_count_offset, "poly counter math"); +#endif + + // Corner case: no profile table. Increment poly counter and exit. + if (ReceiverTypeData::row_limit() == 0) { + increment(Address(mdp, poly_count_offset), DataLayout::counter_increment); + return; + } + + Register offset = rscratch2; + + Label L_loop_search_receiver, L_loop_search_empty; + Label L_restart, L_found_recv, L_found_empty, L_polymorphic, L_count_update; + + // The code here recognizes three major cases: + // A. Fastest: receiver found in the table + // B. Fast: no receiver in the table, and the table is full + // C. Slow: no receiver in the table, free slots in the table + // + // The case A performance is most important, as perfectly-behaved code would end up + // there, especially with larger TypeProfileWidth. The case B performance is + // important as well, this is where bulk of code would land for normally megamorphic + // cases. The case C performance is not essential, its job is to deal with installation + // races, we optimize for code density instead. Case C needs to make sure that receiver + // rows are only claimed once. This makes sure we never overwrite a row for another + // receiver and never duplicate the receivers in the list, making profile type-accurate. + // + // It is very tempting to handle these cases in a single loop, and claim the first slot + // without checking the rest of the table. But, profiling code should tolerate free slots + // in the table, as class unloading can clear them. After such cleanup, the receiver + // we need might be _after_ the free slot. Therefore, we need to let at least full scan + // to complete, before trying to install new slots. Splitting the code in several tight + // loops also helpfully optimizes for cases A and B. + // + // This code is effectively: + // + // restart: + // // Fastest: receiver is already installed + // for (i = 0; i < receiver_count(); i++) { + // if (receiver(i) == recv) goto found_recv(i); + // } + // + // // Fast: no receiver, but profile is full + // for (i = 0; i < receiver_count(); i++) { + // if (receiver(i) == null) goto found_null(i); + // } + // goto polymorphic + // + // // Slow: try to install receiver + // found_null(i): + // CAS(&receiver(i), null, recv); + // goto restart + // + // polymorphic: + // count++; + // return + // + // found_recv(i): + // *receiver_count(i)++ + // + + bind(L_restart); + + // Fastest: receiver is already installed + mov(offset, base_receiver_offset); + bind(L_loop_search_receiver); + ldr(rscratch1, Address(mdp, offset)); + cmp(rscratch1, recv); + br(Assembler::EQ, L_found_recv); + add(offset, offset, receiver_step); + sub(rscratch1, offset, end_receiver_offset); + cbnz(rscratch1, L_loop_search_receiver); + + // Fast: no receiver, but profile is full + mov(offset, base_receiver_offset); + bind(L_loop_search_empty); + ldr(rscratch1, Address(mdp, offset)); + cbz(rscratch1, L_found_empty); + add(offset, offset, receiver_step); + sub(rscratch1, offset, end_receiver_offset); + cbnz(rscratch1, L_loop_search_empty); + b(L_polymorphic); + + // Slow: try to install receiver + bind(L_found_empty); + + // Atomically swing receiver slot: null -> recv. + // + // The update uses CAS, which clobbers rscratch1. Therefore, rscratch2 + // is used to hold the destination address. This is safe because the + // offset is no longer needed after the address is computed. + + lea(rscratch2, Address(mdp, offset)); + cmpxchg(/*addr*/ rscratch2, /*expected*/ zr, /*new*/ recv, Assembler::xword, + /*acquire*/ false, /*release*/ false, /*weak*/ true, noreg); + + // CAS success means the slot now has the receiver we want. CAS failure means + // something had claimed the slot concurrently: it can be the same receiver we want, + // or something else. Since this is a slow path, we can optimize for code density, + // and just restart the search from the beginning. + b(L_restart); + + // Counter updates: + + // Increment polymorphic counter instead of receiver slot. + bind(L_polymorphic); + mov(offset, poly_count_offset); + b(L_count_update); + + // Found a receiver, convert its slot offset to corresponding count offset. + bind(L_found_recv); + add(offset, offset, receiver_to_count_step); + + bind(L_count_update); + increment(Address(mdp, offset), DataLayout::counter_increment); +} + + void MacroAssembler::call_VM_leaf_base(address entry_point, int number_of_arguments, Label *retaddr) { diff --git a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp index 4baa07d7d49..7eb6cea0614 100644 --- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp +++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp @@ -1122,6 +1122,8 @@ public: Address argument_address(RegisterOrConstant arg_slot, int extra_slot_offset = 0); + void profile_receiver_type(Register recv, Register mdp, int mdp_offset); + void verify_sve_vector_length(Register tmp = rscratch1); void reinitialize_ptrue() { if (UseSVE > 0) { diff --git a/src/hotspot/cpu/aarch64/templateTable_aarch64.cpp b/src/hotspot/cpu/aarch64/templateTable_aarch64.cpp index 07b469650f0..5d4f7103a84 100644 --- a/src/hotspot/cpu/aarch64/templateTable_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/templateTable_aarch64.cpp @@ -3370,7 +3370,7 @@ void TemplateTable::invokevirtual_helper(Register index, __ load_klass(r0, recv); // profile this call - __ profile_virtual_call(r0, rlocals, r3); + __ profile_virtual_call(r0, rlocals); // get target Method & entry point __ lookup_virtual_method(r0, index, method); @@ -3500,7 +3500,7 @@ void TemplateTable::invokeinterface(int byte_no) { /*return_method=*/false); // profile this call - __ profile_virtual_call(r3, r13, r19); + __ profile_virtual_call(r3, r13); // Get declaring interface class from method, and itable index