/*
 * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2020, 2022, Huawei Technologies Co., Ltd. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/compile.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/output.hpp"
#include "opto/subnode.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/globalDefinitions.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

void C2_MacroAssembler::fast_lock(Register obj, Register box,
                                  Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
  // Flag register, zero for success; non-zero for failure.
  Register flag = t1;

  assert_different_registers(obj, box, tmp1, tmp2, tmp3, tmp4, flag, t0);

  mv(flag, 1);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. MUST branch to with flag == 0
  Label locked;
  // Finish fast lock unsuccessfully. slow_path MUST branch to with flag != 0
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds or we need to take the slow-path.
    sd(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmp1, obj);
    lbu(tmp1, Address(tmp1, Klass::misc_flags_offset()));
    test_bit(tmp1, tmp1, exact_log2(KlassFlags::_misc_is_value_based_class));
    bnez(tmp1, slow_path);
  }

  const Register tmp1_mark = tmp1;
  const Register tmp3_t = tmp3;

  { // Fast locking

    // Push lock to the lock stack and finish successfully. MUST branch to with flag == 0
    Label push;

    const Register tmp2_top = tmp2;

    // Check if lock-stack is full.
    lwu(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
    mv(tmp3_t, (unsigned)LockStack::end_offset());
    bge(tmp2_top, tmp3_t, slow_path);

    // Check if recursive.
    add(tmp3_t, xthread, tmp2_top);
    ld(tmp3_t, Address(tmp3_t, -oopSize));
    beq(obj, tmp3_t, push);

    // Relaxed normal load to check for monitor. Optimization for monitor case.
    ld(tmp1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    test_bit(tmp3_t, tmp1_mark, exact_log2(markWord::monitor_value));
    bnez(tmp3_t, inflated);

    // Not inflated
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a la");

    // Try to lock. Transition lock-bits 0b01 => 0b00
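    // (Mark word tag bits in HotSpot: 0b01 = unlocked, 0b00 = fast-locked,
    // 0b10 = inflated monitor. The CAS below flips the unlocked bit off.)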
    ori(tmp1_mark, tmp1_mark, markWord::unlocked_value);
    xori(tmp3_t, tmp1_mark, markWord::unlocked_value);
    cmpxchg(/*addr*/ obj, /*expected*/ tmp1_mark, /*new*/ tmp3_t, Assembler::int64,
            /*acquire*/ Assembler::aq, /*release*/ Assembler::relaxed, /*result*/ tmp3_t);
    bne(tmp1_mark, tmp3_t, slow_path);

    bind(push);
    // After successful lock, push object on lock-stack.
    add(tmp3_t, xthread, tmp2_top);
    sd(obj, Address(tmp3_t));
    addw(tmp2_top, tmp2_top, oopSize);
    sw(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
    j(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    const Register tmp1_monitor = tmp1;

    if (!UseObjectMonitorTable) {
      assert(tmp1_monitor == tmp1_mark, "should be the same here");
    } else {
      Label monitor_found;

      // Load cache address
      la(tmp3_t, Address(xthread, JavaThread::om_cache_oops_offset()));

      const int num_unrolled = 2;
      for (int i = 0; i < num_unrolled; i++) {
        ld(tmp1, Address(tmp3_t));
        beq(obj, tmp1, monitor_found);
        add(tmp3_t, tmp3_t, in_bytes(OMCache::oop_to_oop_difference()));
      }

      Label loop;

      // Search for obj in cache.
      bind(loop);

      // Check for match.
      ld(tmp1, Address(tmp3_t));
      beq(obj, tmp1, monitor_found);

      // Search until null encountered, guaranteed _null_sentinel at end.
      add(tmp3_t, tmp3_t, in_bytes(OMCache::oop_to_oop_difference()));
      bnez(tmp1, loop);
      // Cache Miss. Take the slowpath.
      j(slow_path);

      bind(monitor_found);
      ld(tmp1_monitor, Address(tmp3_t, OMCache::oop_to_monitor_difference()));
    }

    const Register tmp2_owner_addr = tmp2;
    const Register tmp3_owner = tmp3;

    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address owner_address(tmp1_monitor, ObjectMonitor::owner_offset() - monitor_tag);
    const Address recursions_address(tmp1_monitor, ObjectMonitor::recursions_offset() - monitor_tag);

    Label monitor_locked;

    // Compute owner address.
    la(tmp2_owner_addr, owner_address);

    // Try to CAS owner (no owner => current thread's _monitor_owner_id).
    Register tid = tmp4;
    ld(tid, Address(xthread, JavaThread::monitor_owner_id_offset()));
    cmpxchg(/*addr*/ tmp2_owner_addr, /*expected*/ zr, /*new*/ tid, Assembler::int64,
            /*acquire*/ Assembler::aq, /*release*/ Assembler::relaxed, /*result*/ tmp3_owner);
    beqz(tmp3_owner, monitor_locked);

    // Check if recursive.
    bne(tmp3_owner, tid, slow_path);

    // Recursive.
    increment(recursions_address, 1, tmp2, tmp3);

    bind(monitor_locked);
    if (UseObjectMonitorTable) {
      sd(tmp1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
    }
  }

  bind(locked);
  mv(flag, zr);

#ifdef ASSERT
  // Check that locked label is reached with flag == 0.
  Label flag_correct;
  beqz(flag, flag_correct);
  stop("Fast Lock Flag != 0");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with flag != 0.
  bnez(flag, flag_correct);
  stop("Fast Lock Flag == 0");
  bind(flag_correct);
#endif
  // C2 uses the value of flag (0 vs !0) to determine the continuation.
}

void C2_MacroAssembler::fast_unlock(Register obj, Register box,
                                    Register tmp1, Register tmp2, Register tmp3) {
  // Flag register, zero for success; non-zero for failure.
  Register flag = t1;

  assert_different_registers(obj, box, tmp1, tmp2, tmp3, flag, t0);

  mv(flag, 1);

  // Handle inflated monitor.
  Label inflated, inflated_load_mark;
  // Finish fast unlock successfully. unlocked MUST branch to with flag == 0
  Label unlocked;
  // Finish fast unlock unsuccessfully. slow_path MUST branch to with flag != 0
  Label slow_path;

  const Register tmp1_mark = tmp1;
  const Register tmp2_top = tmp2;
  const Register tmp3_t = tmp3;

  { // Fast unlock

    Label push_and_slow_path;

    // Check if obj is top of lock-stack.
    lwu(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
    subw(tmp2_top, tmp2_top, oopSize);
    add(tmp3_t, xthread, tmp2_top);
    ld(tmp3_t, Address(tmp3_t));
    // Top of lock stack was not obj. Must be monitor.
    bne(obj, tmp3_t, inflated_load_mark);

    // Pop lock-stack.
    DEBUG_ONLY(add(tmp3_t, xthread, tmp2_top);)
    DEBUG_ONLY(sd(zr, Address(tmp3_t));)
    sw(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));

    // Check if recursive.
    add(tmp3_t, xthread, tmp2_top);
    ld(tmp3_t, Address(tmp3_t, -oopSize));
    beq(obj, tmp3_t, unlocked);

    // Not recursive.

    // Load Mark.
    ld(tmp1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Check header for monitor (0b10).
    // Because we got here by popping (meaning we pushed in locked)
    // there will be no monitor in the box. So we need to push back the obj
    // so that the runtime can fix any potential anonymous owner.
    test_bit(tmp3_t, tmp1_mark, exact_log2(markWord::monitor_value));
    bnez(tmp3_t, UseObjectMonitorTable ? push_and_slow_path : inflated);

    // Try to unlock. Transition lock bits 0b00 => 0b01
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
    ori(tmp3_t, tmp1_mark, markWord::unlocked_value);
    cmpxchg(/*addr*/ obj, /*expected*/ tmp1_mark, /*new*/ tmp3_t, Assembler::int64,
            /*acquire*/ Assembler::relaxed, /*release*/ Assembler::rl, /*result*/ tmp3_t);
    beq(tmp1_mark, tmp3_t, unlocked);

    bind(push_and_slow_path);
    // Compare and exchange failed.
    // Restore lock-stack and handle the unlock in runtime.
    DEBUG_ONLY(add(tmp3_t, xthread, tmp2_top);)
    DEBUG_ONLY(sd(obj, Address(tmp3_t));)
    addw(tmp2_top, tmp2_top, oopSize);
    sd(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
    j(slow_path);
  }

  { // Handle inflated monitor.
    bind(inflated_load_mark);
    ld(tmp1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
#ifdef ASSERT
    test_bit(tmp3_t, tmp1_mark, exact_log2(markWord::monitor_value));
    bnez(tmp3_t, inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

#ifdef ASSERT
    Label check_done;
    subw(tmp2_top, tmp2_top, oopSize);
    mv(tmp3_t, in_bytes(JavaThread::lock_stack_base_offset()));
    blt(tmp2_top, tmp3_t, check_done);
    add(tmp3_t, xthread, tmp2_top);
    ld(tmp3_t, Address(tmp3_t));
    bne(obj, tmp3_t, inflated);
    stop("Fast Unlock lock on stack");
    bind(check_done);
#endif

    const Register tmp1_monitor = tmp1;

    if (!UseObjectMonitorTable) {
      assert(tmp1_monitor == tmp1_mark, "should be the same here");
      // Untag the monitor.
      subi(tmp1_monitor, tmp1_mark, (int)markWord::monitor_value);
    } else {
      ld(tmp1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
      // No valid pointer below alignof(ObjectMonitor*). Take the slow path.
      mv(tmp3_t, alignof(ObjectMonitor*));
      bltu(tmp1_monitor, tmp3_t, slow_path);
    }

    const Register tmp2_recursions = tmp2;
    Label not_recursive;

    // Check if recursive.
    ld(tmp2_recursions, Address(tmp1_monitor, ObjectMonitor::recursions_offset()));
    beqz(tmp2_recursions, not_recursive);

    // Recursive unlock.
    subi(tmp2_recursions, tmp2_recursions, 1);
    sd(tmp2_recursions, Address(tmp1_monitor, ObjectMonitor::recursions_offset()));
    j(unlocked);

    bind(not_recursive);

    const Register tmp2_owner_addr = tmp2;

    // Compute owner address.
    la(tmp2_owner_addr, Address(tmp1_monitor, ObjectMonitor::owner_offset()));

    // Set owner to null.
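    // (The LoadStore|StoreStore barrier plus the plain store below form a
    // release store: everything done while holding the lock becomes visible
    // before other threads can observe the owner field as null.)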
    // Release to satisfy the JMM
    membar(MacroAssembler::LoadStore | MacroAssembler::StoreStore);
    sd(zr, Address(tmp2_owner_addr));
    // We need a full fence after clearing owner to avoid stranding.
    // StoreLoad achieves this.
    membar(StoreLoad);

    // Check if the entry_list is empty.
    ld(t0, Address(tmp1_monitor, ObjectMonitor::entry_list_offset()));
    beqz(t0, unlocked); // If so we are done.

    // Check if there is a successor.
    ld(tmp3_t, Address(tmp1_monitor, ObjectMonitor::succ_offset()));
    bnez(tmp3_t, unlocked); // If so we are done.

    // Save the monitor pointer in the current thread, so we can try
    // to reacquire the lock in SharedRuntime::monitor_exit_helper().
    sd(tmp1_monitor, Address(xthread, JavaThread::unlocked_inflated_monitor_offset()));

    mv(flag, 1);
    j(slow_path);
  }

  bind(unlocked);
  mv(flag, zr);

#ifdef ASSERT
  // Check that unlocked label is reached with flag == 0.
  Label flag_correct;
  beqz(flag, flag_correct);
  stop("Fast Lock Flag != 0");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with flag != 0.
  bnez(flag, flag_correct);
  stop("Fast Lock Flag == 0");
  bind(flag_correct);
#endif
  // C2 uses the value of flag (0 vs !0) to determine the continuation.
}

// short string
// StringUTF16.indexOfChar
// StringLatin1.indexOfChar
void C2_MacroAssembler::string_indexof_char_short(Register str1, Register cnt1,
                                                  Register ch, Register result,
                                                  bool isL) {
  Register ch1 = t0;
  Register index = t1;

  BLOCK_COMMENT("string_indexof_char_short {");

  Label LOOP, LOOP1, LOOP4, LOOP8;
  Label MATCH, MATCH1, MATCH2, MATCH3, MATCH4,
        MATCH5, MATCH6, MATCH7, NOMATCH;

  mv(result, -1);
  mv(index, zr);

  bind(LOOP);
  addi(t0, index, 8);
  ble(t0, cnt1, LOOP8);
  addi(t0, index, 4);
  ble(t0, cnt1, LOOP4);
  j(LOOP1);

  bind(LOOP8);
  isL ? lbu(ch1, Address(str1, 0)) : lhu(ch1, Address(str1, 0));
  beq(ch, ch1, MATCH);
  isL ? lbu(ch1, Address(str1, 1)) : lhu(ch1, Address(str1, 2));
  beq(ch, ch1, MATCH1);
  isL ? lbu(ch1, Address(str1, 2)) : lhu(ch1, Address(str1, 4));
  beq(ch, ch1, MATCH2);
  isL ? lbu(ch1, Address(str1, 3)) : lhu(ch1, Address(str1, 6));
  beq(ch, ch1, MATCH3);
  isL ? lbu(ch1, Address(str1, 4)) : lhu(ch1, Address(str1, 8));
  beq(ch, ch1, MATCH4);
  isL ? lbu(ch1, Address(str1, 5)) : lhu(ch1, Address(str1, 10));
  beq(ch, ch1, MATCH5);
  isL ? lbu(ch1, Address(str1, 6)) : lhu(ch1, Address(str1, 12));
  beq(ch, ch1, MATCH6);
  isL ? lbu(ch1, Address(str1, 7)) : lhu(ch1, Address(str1, 14));
  beq(ch, ch1, MATCH7);
  addi(index, index, 8);
  addi(str1, str1, isL ? 8 : 16);
  blt(index, cnt1, LOOP);
  j(NOMATCH);

  bind(LOOP4);
  isL ? lbu(ch1, Address(str1, 0)) : lhu(ch1, Address(str1, 0));
  beq(ch, ch1, MATCH);
  isL ? lbu(ch1, Address(str1, 1)) : lhu(ch1, Address(str1, 2));
  beq(ch, ch1, MATCH1);
  isL ? lbu(ch1, Address(str1, 2)) : lhu(ch1, Address(str1, 4));
  beq(ch, ch1, MATCH2);
  isL ? lbu(ch1, Address(str1, 3)) : lhu(ch1, Address(str1, 6));
  beq(ch, ch1, MATCH3);
  addi(index, index, 4);
  addi(str1, str1, isL ? 4 : 8);
  bge(index, cnt1, NOMATCH);

  bind(LOOP1);
  isL ? lbu(ch1, Address(str1)) : lhu(ch1, Address(str1));
  beq(ch, ch1, MATCH);
  addi(index, index, 1);
  addi(str1, str1, isL ? 1 : 2);
  blt(index, cnt1, LOOP1);
  j(NOMATCH);

  bind(MATCH1);
  addi(index, index, 1);
  j(MATCH);

  bind(MATCH2);
  addi(index, index, 2);
  j(MATCH);

  bind(MATCH3);
  addi(index, index, 3);
  j(MATCH);

  bind(MATCH4);
  addi(index, index, 4);
  j(MATCH);

  bind(MATCH5);
  addi(index, index, 5);
  j(MATCH);

  bind(MATCH6);
  addi(index, index, 6);
  j(MATCH);

  bind(MATCH7);
  addi(index, index, 7);

  bind(MATCH);
  mv(result, index);

  bind(NOMATCH);
  BLOCK_COMMENT("} string_indexof_char_short");
}

// StringUTF16.indexOfChar
// StringLatin1.indexOfChar
void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2,
                                            Register tmp3, Register tmp4,
                                            bool isL) {
  Label CH1_LOOP, HIT, NOMATCH, DONE, DO_LONG;
  Register ch1 = t0;
  Register orig_cnt = t1;
  Register mask1 = tmp3;
  Register mask2 = tmp2;
  Register match_mask = tmp1;
  Register trailing_char = tmp4;
  Register unaligned_elems = tmp4;

  BLOCK_COMMENT("string_indexof_char {");
  beqz(cnt1, NOMATCH);

  subi(t0, cnt1, isL ? 32 : 16);
  bgtz(t0, DO_LONG);
  string_indexof_char_short(str1, cnt1, ch, result, isL);
  j(DONE);

  bind(DO_LONG);
  mv(orig_cnt, cnt1);
  if (AvoidUnalignedAccesses) {
    Label ALIGNED;
    andi(unaligned_elems, str1, 0x7);
    beqz(unaligned_elems, ALIGNED);
    sub(unaligned_elems, unaligned_elems, 8);
    neg(unaligned_elems, unaligned_elems);
    if (!isL) {
      srli(unaligned_elems, unaligned_elems, 1);
    }
    // do unaligned part per element
    string_indexof_char_short(str1, unaligned_elems, ch, result, isL);
    bgez(result, DONE);
    mv(orig_cnt, cnt1);
    sub(cnt1, cnt1, unaligned_elems);
    bind(ALIGNED);
  }

  // duplicate ch
  if (isL) {
    slli(ch1, ch, 8);
    orr(ch, ch1, ch);
  }
  slli(ch1, ch, 16);
  orr(ch, ch1, ch);
  slli(ch1, ch, 32);
  orr(ch, ch1, ch);

  if (!isL) {
    slli(cnt1, cnt1, 1);
  }

  uint64_t mask0101 = UCONST64(0x0101010101010101);
  uint64_t mask0001 = UCONST64(0x0001000100010001);
  mv(mask1, isL ? mask0101 : mask0001);
  uint64_t mask7f7f = UCONST64(0x7f7f7f7f7f7f7f7f);
  uint64_t mask7fff = UCONST64(0x7fff7fff7fff7fff);
  mv(mask2, isL ? mask7f7f : mask7fff);
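  // (compute_match_mask below applies the classic SWAR zero-lane test to
  // x = chunk ^ duplicated_ch: roughly (x - mask1) & ~x & ~mask2, which sets
  // the top bit of every byte/halfword lane that equals the searched char.)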
  bind(CH1_LOOP);
  ld(ch1, Address(str1));
  addi(str1, str1, 8);
  subi(cnt1, cnt1, 8);
  compute_match_mask(ch1, ch, match_mask, mask1, mask2);
  bnez(match_mask, HIT);
  bgtz(cnt1, CH1_LOOP);
  j(NOMATCH);

  bind(HIT);
  // count bits of trailing zero chars
  ctzc_bits(trailing_char, match_mask, isL, ch1, result);
  srli(trailing_char, trailing_char, 3);
  addi(cnt1, cnt1, 8);
  ble(cnt1, trailing_char, NOMATCH);
  // match case
  if (!isL) {
    srli(cnt1, cnt1, 1);
    srli(trailing_char, trailing_char, 1);
  }

  sub(result, orig_cnt, cnt1);
  add(result, result, trailing_char);
  j(DONE);

  bind(NOMATCH);
  mv(result, -1);

  bind(DONE);
  BLOCK_COMMENT("} string_indexof_char");
}

typedef void (MacroAssembler::* load_chr_insn)(Register rd, const Address &adr, Register temp);

// Search for needle in haystack and return index or -1
// x10: result
// x11: haystack
// x12: haystack_len
// x13: needle
// x14: needle_len
void C2_MacroAssembler::string_indexof(Register haystack, Register needle,
                                       Register haystack_len, Register needle_len,
                                       Register tmp1, Register tmp2,
                                       Register tmp3, Register tmp4,
                                       Register tmp5, Register tmp6,
                                       Register result, int ae) {
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  Label LINEARSEARCH, LINEARSTUB, DONE, NOMATCH;

  Register ch1 = t0;
  Register ch2 = t1;
  Register nlen_tmp = tmp1; // needle len tmp
  Register hlen_tmp = tmp2; // haystack len tmp
  Register result_tmp = tmp4;

  bool isLL = ae == StrIntrinsicNode::LL;

  bool needle_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool haystack_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int needle_chr_shift = needle_isL ? 0 : 1;
  int haystack_chr_shift = haystack_isL ? 0 : 1;
  int needle_chr_size = needle_isL ? 1 : 2;
  int haystack_chr_size = haystack_isL ? 1 : 2;
  load_chr_insn needle_load_1chr = needle_isL ? (load_chr_insn)&MacroAssembler::lbu :
                                   (load_chr_insn)&MacroAssembler::lhu;
  load_chr_insn haystack_load_1chr = haystack_isL ? (load_chr_insn)&MacroAssembler::lbu :
                                     (load_chr_insn)&MacroAssembler::lhu;

  BLOCK_COMMENT("string_indexof {");

  // Note, inline_string_indexOf() generates checks:
  // if (pattern.count > src.count) return -1;
  // if (pattern.count == 0) return 0;

  // We have two strings, a source string in haystack, haystack_len and a pattern string
  // in needle, needle_len. Find the first occurrence of pattern in source or return -1.

  // For larger pattern and source we use a simplified Boyer Moore algorithm.
  // With a small pattern and source we use linear scan.

  // needle_len >= 8 && needle_len < 256 && needle_len < haystack_len/4, use bmh algorithm.
  sub(result_tmp, haystack_len, needle_len);
  // needle_len < 8, use linear scan
  sub(t0, needle_len, 8);
  bltz(t0, LINEARSEARCH);
  // needle_len >= 256, use linear scan
  sub(t0, needle_len, 256);
  bgez(t0, LINEARSTUB);
  // needle_len >= haystack_len/4, use linear scan
  srli(t0, haystack_len, 2);
  bge(needle_len, t0, LINEARSTUB);

  // Boyer-Moore-Horspool introduction:
  // The Boyer Moore algorithm is based on the description here:-
  //
  // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
  //
  // This describes an algorithm with 2 shift rules. The 'Bad Character' rule
  // and the 'Good Suffix' rule.
  //
  // These rules are essentially heuristics for how far we can shift the
  // pattern along the search string.
  //
  // The implementation here uses the 'Bad Character' rule only because of the
  // complexity of initialisation for the 'Good Suffix' rule.
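  //
  // Example (bad-character rule): for the pattern "abcab" (m = 5), the table
  // built below gives bc['a'] = 1, bc['b'] = 3, bc['c'] = 2, and m = 5 for
  // every character absent from the pattern, so a mismatch on an absent
  // character shifts the pattern past it in one step.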
  //
  // This is also known as the Boyer-Moore-Horspool algorithm:
  //
  // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
  //
  // #define ASIZE 256
  //
  // int bm(unsigned char *pattern, int m, unsigned char *src, int n) {
  //   int i, j;
  //   unsigned c;
  //   unsigned char bc[ASIZE];
  //
  //   /* Preprocessing */
  //   for (i = 0; i < ASIZE; ++i)
  //     bc[i] = m;
  //   for (i = 0; i < m - 1; ) {
  //     c = pattern[i];
  //     ++i;
  //     // c < 256 for Latin1 string, so, no need for branch
  // #ifdef PATTERN_STRING_IS_LATIN1
  //     bc[c] = m - i;
  // #else
  //     if (c < ASIZE) bc[c] = m - i;
  // #endif
  //   }
  //
  //   /* Searching */
  //   j = 0;
  //   while (j <= n - m) {
  //     c = src[i+j];
  //     if (pattern[m-1] == c)
  //       int k;
  //       for (k = m - 2; k >= 0 && pattern[k] == src[k + j]; --k);
  //       if (k < 0) return j;
  //       // c < 256 for Latin1 string, so, no need for branch
  // #ifdef SOURCE_STRING_IS_LATIN1_AND_PATTERN_STRING_IS_LATIN1
  //       // LL case: (c < 256) always true. Remove branch
  //       j += bc[pattern[j+m-1]];
  // #endif
  // #ifdef SOURCE_STRING_IS_UTF_AND_PATTERN_STRING_IS_UTF
  //       // UU case: need the if (c < ASIZE) check; skip if not
  //       if (c < ASIZE)
  //         j += bc[pattern[j+m-1]];
  //       else
  //         j += m
  // #endif
  //   }
  //   return -1;
  // }

  // temp register: t0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, result
  Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
        BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;

  Register haystack_end = haystack_len;
  Register skipch = tmp2;

  // pattern length is >= 8, so, we can read at least 1 register for cases when
  // UTF->Latin1 conversion is not needed (8 LL or 4 UU) and half register for
  // UL case. We'll re-read last character in inner pre-loop code to have
  // single outer pre-loop load
  const int firstStep = isLL ? 7 : 3;

  const int ASIZE = 256;
  const int STORE_BYTES = 8; // 8 bytes stored per instruction (sd)

  subi(sp, sp, ASIZE);

  // init BC offset table with default value: needle_len
  slli(t0, needle_len, 8);
  orr(t0, t0, needle_len); // [63...16][needle_len][needle_len]
  slli(tmp1, t0, 16);
  orr(t0, tmp1, t0); // [63...32][needle_len][needle_len][needle_len][needle_len]
  slli(tmp1, t0, 32);
  orr(tmp5, tmp1, t0); // tmp5: 8 elements [needle_len]

  mv(ch1, sp);  // ch1 is t0
  mv(tmp6, ASIZE / STORE_BYTES); // loop iterations

  bind(BM_INIT_LOOP);
  // for (i = 0; i < ASIZE; ++i)
  //   bc[i] = m;
  for (int i = 0; i < 4; i++) {
    sd(tmp5, Address(ch1, i * wordSize));
  }
  addi(ch1, ch1, 32);
  subi(tmp6, tmp6, 4);
  bgtz(tmp6, BM_INIT_LOOP);

  subi(nlen_tmp, needle_len, 1); // m - 1, index of the last element in pattern
  Register orig_haystack = tmp5;
  mv(orig_haystack, haystack);
  // result_tmp = tmp4
  shadd(haystack_end, result_tmp, haystack, haystack_end, haystack_chr_shift);
  subi(ch2, needle_len, 1); // bc offset init value, ch2 is t1
  mv(tmp3, needle);

  // for (i = 0; i < m - 1; ) {
  //   c = pattern[i];
  //   ++i;
  //   // c < 256 for Latin1 string, so, no need for branch
  //   #ifdef PATTERN_STRING_IS_LATIN1
  //   bc[c] = m - i;
  //   #else
  //   if (c < ASIZE) bc[c] = m - i;
  //   #endif
  // }
  bind(BCLOOP);
  (this->*needle_load_1chr)(ch1, Address(tmp3), noreg);
  addi(tmp3, tmp3, needle_chr_size);
  if (!needle_isL) {
    // ae == StrIntrinsicNode::UU
    mv(tmp6, ASIZE);
    bgeu(ch1, tmp6, BCSKIP);
  }
  add(tmp4, sp, ch1);
  sb(ch2, Address(tmp4)); // store skip offset to BC offset table

  bind(BCSKIP);
  subi(ch2, ch2, 1); // for next pattern element, skip distance -1
  bgtz(ch2, BCLOOP);

  // tmp6: pattern end, address after needle
  shadd(tmp6, needle_len, needle, tmp6, needle_chr_shift);
  if (needle_isL == haystack_isL) {
    // load last 8 bytes (8LL/4UU symbols)
    ld(tmp6, Address(tmp6, -wordSize));
  } else {
    // UL: from UTF-16 (source) search Latin1 (pattern)
    lwu(tmp6, Address(tmp6, -wordSize / 2)); // load last 4 bytes (4 symbols)
    // convert Latin1 to UTF. eg: 0x0000abcd -> 0x0a0b0c0d
    // We'll have to wait until load completed, but it's still faster than per-character loads + checks
    srli(tmp3, tmp6, BitsPerByte * (wordSize / 2 - needle_chr_size)); // pattern[m-1], eg: 0x0000000a
    slli(ch2, tmp6, XLEN - 24);
    srli(ch2, ch2, XLEN - 8); // pattern[m-2], 0x0000000b
    slli(ch1, tmp6, XLEN - 16);
    srli(ch1, ch1, XLEN - 8); // pattern[m-3], 0x0000000c
    zext(tmp6, tmp6, 8); // pattern[m-4], 0x0000000d
    slli(ch2, ch2, 16);
    orr(ch2, ch2, ch1); // 0x00000b0c
    slli(result, tmp3, 48); // use result as temp register
    orr(tmp6, tmp6, result); // 0x0a00000d
    slli(result, ch2, 16);
    orr(tmp6, tmp6, result); // UTF-16: 0x0a0b0c0d
  }

  // i = m - 1;
  // skipch = j + i;
  // if (skipch == pattern[m - 1]
  //   for (k = m - 2; k >= 0 && pattern[k] == src[k + j]; --k);
  // else
  //   move j with bad char offset table
  bind(BMLOOPSTR2);
  // compare pattern to source string backward
  shadd(result, nlen_tmp, haystack, result, haystack_chr_shift);
  (this->*haystack_load_1chr)(skipch, Address(result), noreg);
  subi(nlen_tmp, nlen_tmp, firstStep); // nlen_tmp is positive here, because needle_len >= 8
  if (needle_isL == haystack_isL) {
    // re-init tmp3. It's for free because it's executed in parallel with
    // load above. Alternative is to initialize it before loop, but it'll
    // affect performance on in-order systems with 2 or more ld/st pipelines
    srli(tmp3, tmp6, BitsPerByte * (wordSize - needle_chr_size)); // UU/LL: pattern[m-1]
  }
  if (!isLL) { // UU/UL case
    slli(ch2, nlen_tmp, 1); // offsets in bytes
  }
  bne(tmp3, skipch, BMSKIP); // if not equal, skipch is bad char
  add(result, haystack, isLL ? nlen_tmp : ch2);
  // load 8 bytes from source string
  // if isLL is false then read granularity can be 2
  load_long_misaligned(ch2, Address(result), ch1, isLL ? 1 : 2); // can use ch1 as temp register here as it will be trashed by next mv anyway
  mv(ch1, tmp6);
  if (isLL) {
    j(BMLOOPSTR1_AFTER_LOAD);
  } else {
    subi(nlen_tmp, nlen_tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
    j(BMLOOPSTR1_CMP);
  }

  bind(BMLOOPSTR1);
  shadd(ch1, nlen_tmp, needle, ch1, needle_chr_shift);
  (this->*needle_load_1chr)(ch1, Address(ch1), noreg);
  shadd(ch2, nlen_tmp, haystack, ch2, haystack_chr_shift);
  (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);

  bind(BMLOOPSTR1_AFTER_LOAD);
  subi(nlen_tmp, nlen_tmp, 1);
  bltz(nlen_tmp, BMLOOPSTR1_LASTCMP);

  bind(BMLOOPSTR1_CMP);
  beq(ch1, ch2, BMLOOPSTR1);

  bind(BMSKIP);
  if (!isLL) {
    // if we've met UTF symbol while searching Latin1 pattern, then we can
    // skip needle_len symbols
    if (needle_isL != haystack_isL) {
      mv(result_tmp, needle_len);
    } else {
      mv(result_tmp, 1);
    }
    mv(t0, ASIZE);
    bgeu(skipch, t0, BMADV);
  }
  add(result_tmp, sp, skipch);
  lbu(result_tmp, Address(result_tmp)); // load skip offset

  bind(BMADV);
  subi(nlen_tmp, needle_len, 1);
  // move haystack after bad char skip offset
  shadd(haystack, result_tmp, haystack, result, haystack_chr_shift);
  ble(haystack, haystack_end, BMLOOPSTR2);
  addi(sp, sp, ASIZE);
  j(NOMATCH);

  bind(BMLOOPSTR1_LASTCMP);
  bne(ch1, ch2, BMSKIP);

  bind(BMMATCH);
  sub(result, haystack, orig_haystack);
  if (!haystack_isL) {
    srli(result, result, 1);
  }
  addi(sp, sp, ASIZE);
  j(DONE);

  bind(LINEARSTUB);
  subi(t0, needle_len, 16); // small patterns still should be handled by simple algorithm
  bltz(t0, LINEARSEARCH);

  mv(result, zr);
  RuntimeAddress stub = nullptr;
  if (isLL) {
    stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_ll());
    assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
  } else if (needle_isL) {
    stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_ul());
    assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
  } else {
    stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_uu());
    assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
  }
  address call = reloc_call(stub);
  if (call == nullptr) {
    DEBUG_ONLY(reset_labels(LINEARSEARCH, DONE, NOMATCH));
    ciEnv::current()->record_failure("CodeCache is full");
    return;
  }
  j(DONE);

  bind(NOMATCH);
  mv(result, -1);
  j(DONE);

  bind(LINEARSEARCH);
  string_indexof_linearscan(haystack, needle, haystack_len, needle_len,
                            tmp1, tmp2, tmp3, tmp4, -1, result, ae);

  bind(DONE);
  BLOCK_COMMENT("} string_indexof");
}

// string_indexof
// result: x10
// src: x11
// src_count: x12
// pattern: x13
// pattern_count: x14 or 1/2/3/4
void C2_MacroAssembler::string_indexof_linearscan(Register haystack, Register needle,
                                                  Register haystack_len, Register needle_len,
                                                  Register tmp1, Register tmp2,
                                                  Register tmp3, Register tmp4,
                                                  int needle_con_cnt, Register result, int ae) {
  // Note:
  // needle_con_cnt > 0 means needle_len register is invalid, needle length is constant
  // for UU/LL: needle_con_cnt[1, 4], UL: needle_con_cnt = 1
  assert(needle_con_cnt <= 4, "Invalid needle constant count");
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  Register ch1 = t0;
  Register ch2 = t1;
  Register hlen_neg = haystack_len, nlen_neg = needle_len;
  Register nlen_tmp = tmp1, hlen_tmp = tmp2, result_tmp = tmp4;

  bool isLL = ae == StrIntrinsicNode::LL;

  bool needle_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool haystack_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int needle_chr_shift = needle_isL ? 0 : 1;
  int haystack_chr_shift = haystack_isL ? 0 : 1;
  int needle_chr_size = needle_isL ? 1 : 2;
  int haystack_chr_size = haystack_isL ? 1 : 2;
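  // Per-element loads: Latin1 characters are read with lbu (1 byte), UTF-16
  // characters with lhu (2 bytes); load_2chr/load_4chr below pick the
  // matching 2- and 4-character widths.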
  load_chr_insn needle_load_1chr = needle_isL ? (load_chr_insn)&MacroAssembler::lbu :
                                   (load_chr_insn)&MacroAssembler::lhu;
  load_chr_insn haystack_load_1chr = haystack_isL ? (load_chr_insn)&MacroAssembler::lbu :
                                     (load_chr_insn)&MacroAssembler::lhu;
  load_chr_insn load_2chr = isLL ? (load_chr_insn)&MacroAssembler::lhu : (load_chr_insn)&MacroAssembler::lwu;
  load_chr_insn load_4chr = isLL ? (load_chr_insn)&MacroAssembler::lwu : (load_chr_insn)&MacroAssembler::ld;

  Label DO1, DO2, DO3, MATCH, NOMATCH, DONE;

  Register first = tmp3;

  if (needle_con_cnt == -1) {
    Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;

    subi(t0, needle_len, needle_isL == haystack_isL ? 4 : 2);
    bltz(t0, DOSHORT);

    (this->*needle_load_1chr)(first, Address(needle), noreg);
    slli(t0, needle_len, needle_chr_shift);
    add(needle, needle, t0);
    neg(nlen_neg, t0);
    slli(t0, result_tmp, haystack_chr_shift);
    add(haystack, haystack, t0);
    neg(hlen_neg, t0);

    bind(FIRST_LOOP);
    add(t0, haystack, hlen_neg);
    (this->*haystack_load_1chr)(ch2, Address(t0), noreg);
    beq(first, ch2, STR1_LOOP);

    bind(STR2_NEXT);
    addi(hlen_neg, hlen_neg, haystack_chr_size);
    blez(hlen_neg, FIRST_LOOP);
    j(NOMATCH);

    bind(STR1_LOOP);
    addi(nlen_tmp, nlen_neg, needle_chr_size);
    addi(hlen_tmp, hlen_neg, haystack_chr_size);
    bgez(nlen_tmp, MATCH);

    bind(STR1_NEXT);
    add(ch1, needle, nlen_tmp);
    (this->*needle_load_1chr)(ch1, Address(ch1), noreg);
    add(ch2, haystack, hlen_tmp);
    (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
    bne(ch1, ch2, STR2_NEXT);
    addi(nlen_tmp, nlen_tmp, needle_chr_size);
    addi(hlen_tmp, hlen_tmp, haystack_chr_size);
    bltz(nlen_tmp, STR1_NEXT);
    j(MATCH);

    bind(DOSHORT);
    if (needle_isL == haystack_isL) {
      subi(t0, needle_len, 2);
      bltz(t0, DO1);
      bgtz(t0, DO3);
    }
  }

  if (needle_con_cnt == 4) {
    Label CH1_LOOP;
    (this->*load_4chr)(ch1, Address(needle), noreg);
    subi(result_tmp, haystack_len, 4);
    slli(tmp3, result_tmp, haystack_chr_shift); // result as tmp
    add(haystack, haystack, tmp3);
    neg(hlen_neg, tmp3);
    if (AvoidUnalignedAccesses) {
      // preload first value, then we will read by 1 character per loop, instead of four
      // just shifting previous ch2 right by size of character in bits
      add(tmp3, haystack, hlen_neg);
      (this->*load_4chr)(ch2, Address(tmp3), noreg);
      if (isLL) {
        // need to erase 1 most significant byte in 32-bit value of ch2
        slli(ch2, ch2, 40);
        srli(ch2, ch2, 32);
      } else {
        slli(ch2, ch2, 16); // 2 most significant bytes will be erased by this operation
      }
    }

    bind(CH1_LOOP);
    add(tmp3, haystack, hlen_neg);
    if (AvoidUnalignedAccesses) {
      srli(ch2, ch2, isLL ? 8 : 16);
      (this->*haystack_load_1chr)(tmp3, Address(tmp3, isLL ? 3 : 6), noreg);
      slli(tmp3, tmp3, isLL ? 24 : 48);
      add(ch2, ch2, tmp3);
    } else {
      (this->*load_4chr)(ch2, Address(tmp3), noreg);
    }
    beq(ch1, ch2, MATCH);
    addi(hlen_neg, hlen_neg, haystack_chr_size);
    blez(hlen_neg, CH1_LOOP);
    j(NOMATCH);
  }

  if ((needle_con_cnt == -1 && needle_isL == haystack_isL) || needle_con_cnt == 2) {
    Label CH1_LOOP;
    BLOCK_COMMENT("string_indexof DO2 {");
    bind(DO2);
    (this->*load_2chr)(ch1, Address(needle), noreg);
    if (needle_con_cnt == 2) {
      subi(result_tmp, haystack_len, 2);
    }
    slli(tmp3, result_tmp, haystack_chr_shift);
    add(haystack, haystack, tmp3);
    neg(hlen_neg, tmp3);
    if (AvoidUnalignedAccesses) {
      // preload first value, then we will read by 1 character per loop, instead of two
      // just shifting previous ch2 right by size of character in bits
      add(tmp3, haystack, hlen_neg);
      (this->*haystack_load_1chr)(ch2, Address(tmp3), noreg);
      slli(ch2, ch2, isLL ? 8 : 16);
    }
    bind(CH1_LOOP);
    add(tmp3, haystack, hlen_neg);
    if (AvoidUnalignedAccesses) {
      srli(ch2, ch2, isLL ? 8 : 16);
      (this->*haystack_load_1chr)(tmp3, Address(tmp3, isLL ? 1 : 2), noreg);
      slli(tmp3, tmp3, isLL ? 8 : 16);
      add(ch2, ch2, tmp3);
    } else {
      (this->*load_2chr)(ch2, Address(tmp3), noreg);
    }
    beq(ch1, ch2, MATCH);
    addi(hlen_neg, hlen_neg, haystack_chr_size);
    blez(hlen_neg, CH1_LOOP);
    j(NOMATCH);
    BLOCK_COMMENT("} string_indexof DO2");
  }

  if ((needle_con_cnt == -1 && needle_isL == haystack_isL) || needle_con_cnt == 3) {
    Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
    BLOCK_COMMENT("string_indexof DO3 {");

    bind(DO3);
    (this->*load_2chr)(first, Address(needle), noreg);
    (this->*needle_load_1chr)(ch1, Address(needle, 2 * needle_chr_size), noreg);
    if (needle_con_cnt == 3) {
      subi(result_tmp, haystack_len, 3);
    }
    slli(hlen_tmp, result_tmp, haystack_chr_shift);
    add(haystack, haystack, hlen_tmp);
    neg(hlen_neg, hlen_tmp);

    bind(FIRST_LOOP);
    add(ch2, haystack, hlen_neg);
    if (AvoidUnalignedAccesses) {
      // we need a temp register, we can safely use hlen_tmp here, which is a synonym for tmp2
      (this->*haystack_load_1chr)(tmp2, Address(ch2, isLL ? 1 : 2), noreg);
      (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
      slli(tmp2, tmp2, isLL ? 8 : 16);
      add(ch2, ch2, tmp2);
    } else {
      (this->*load_2chr)(ch2, Address(ch2), noreg);
    }
    beq(first, ch2, STR1_LOOP);

    bind(STR2_NEXT);
    addi(hlen_neg, hlen_neg, haystack_chr_size);
    blez(hlen_neg, FIRST_LOOP);
    j(NOMATCH);

    bind(STR1_LOOP);
    addi(hlen_tmp, hlen_neg, 2 * haystack_chr_size);
    add(ch2, haystack, hlen_tmp);
    (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
    bne(ch1, ch2, STR2_NEXT);
    j(MATCH);
    BLOCK_COMMENT("} string_indexof DO3");
  }

  if (needle_con_cnt == -1 || needle_con_cnt == 1) {
    Label DO1_LOOP;

    BLOCK_COMMENT("string_indexof DO1 {");
    bind(DO1);
    (this->*needle_load_1chr)(ch1, Address(needle), noreg);
    subi(result_tmp, haystack_len, 1);
    slli(tmp3, result_tmp, haystack_chr_shift);
    add(haystack, haystack, tmp3);
    neg(hlen_neg, tmp3);

    bind(DO1_LOOP);
    add(tmp3, haystack, hlen_neg);
    (this->*haystack_load_1chr)(ch2, Address(tmp3), noreg);
    beq(ch1, ch2, MATCH);
    addi(hlen_neg, hlen_neg, haystack_chr_size);
    blez(hlen_neg, DO1_LOOP);
    BLOCK_COMMENT("} string_indexof DO1");
  }

  bind(NOMATCH);
  mv(result, -1);
  j(DONE);

  bind(MATCH);
  srai(t0, hlen_neg, haystack_chr_shift);
  add(result, result_tmp, t0);

  bind(DONE);
}

// Compare longwords
void C2_MacroAssembler::string_compare_long_same_encoding(Register result, Register str1, Register str2,
                                                          const bool isLL, Register cnt1, Register cnt2,
                                                          Register tmp1, Register tmp2, Register tmp3,
                                                          const int STUB_THRESHOLD, Label *STUB,
                                                          Label *SHORT_STRING, Label *DONE) {
  Label TAIL_CHECK, TAIL, NEXT_WORD, DIFFERENCE;

  const int base_offset = arrayOopDesc::base_offset_in_bytes(T_BYTE);
  assert((base_offset % (UseCompactObjectHeaders ? 4 :
                         (UseCompressedClassPointers ? 8 : 4))) == 0, "Must be");
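  // minCharsInWord: characters per 8-byte word, 8 for Latin1 and 4 for UTF-16.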
  const int minCharsInWord = isLL ? wordSize : wordSize / 2;

  // load first parts of strings and finish initialization while loading
  beq(str1, str2, *DONE);

  // Alignment
  if (AvoidUnalignedAccesses && (base_offset % 8) != 0) {
    lwu(tmp1, Address(str1));
    lwu(tmp2, Address(str2));
    bne(tmp1, tmp2, DIFFERENCE);
    addi(str1, str1, 4);
    addi(str2, str2, 4);
    subi(cnt2, cnt2, minCharsInWord / 2);

    // A very short string
    mv(t0, minCharsInWord);
    ble(cnt2, t0, *SHORT_STRING);
  }

#ifdef ASSERT
  if (AvoidUnalignedAccesses) {
    Label align_ok;
    orr(t0, str1, str2);
    andi(t0, t0, 0x7);
    beqz(t0, align_ok);
    stop("bad alignment");
    bind(align_ok);
  }
#endif

  // load 8 bytes once to compare
  ld(tmp1, Address(str1));
  ld(tmp2, Address(str2));
  mv(t0, STUB_THRESHOLD);
  bge(cnt2, t0, *STUB);
  subi(cnt2, cnt2, minCharsInWord);
  beqz(cnt2, TAIL_CHECK);
  // convert cnt2 from characters to bytes
  if (!isLL) {
    slli(cnt2, cnt2, 1);
  }
  add(str2, str2, cnt2);
  add(str1, str1, cnt2);
  sub(cnt2, zr, cnt2);
  addi(cnt2, cnt2, 8);
  bne(tmp1, tmp2, DIFFERENCE);
  bgez(cnt2, TAIL);

  // main loop
  bind(NEXT_WORD);
  // 8-byte aligned loads when AvoidUnalignedAccesses is enabled
  add(t0, str1, cnt2);
  ld(tmp1, Address(t0));
  add(t0, str2, cnt2);
  ld(tmp2, Address(t0));
  addi(cnt2, cnt2, 8);
  bne(tmp1, tmp2, DIFFERENCE);
  bltz(cnt2, NEXT_WORD);

  bind(TAIL);
  load_long_misaligned(tmp1, Address(str1), tmp3, isLL ? 1 : 2);
  load_long_misaligned(tmp2, Address(str2), tmp3, isLL ? 1 : 2);

  bind(TAIL_CHECK);
  beq(tmp1, tmp2, *DONE);

  // Find the first different characters in the longwords and
  // compute their difference.
  bind(DIFFERENCE);
  xorr(tmp3, tmp1, tmp2);
  // count bits of trailing zero chars
  ctzc_bits(result, tmp3, isLL);
  srl(tmp1, tmp1, result);
  srl(tmp2, tmp2, result);
  if (isLL) {
    zext(tmp1, tmp1, 8);
    zext(tmp2, tmp2, 8);
  } else {
    zext(tmp1, tmp1, 16);
    zext(tmp2, tmp2, 16);
  }
  sub(result, tmp1, tmp2);
  j(*DONE);
}

// Compare longwords
void C2_MacroAssembler::string_compare_long_different_encoding(Register result, Register str1, Register str2,
                                                               bool isLU, Register cnt1, Register cnt2,
                                                               Register tmp1, Register tmp2, Register tmp3,
                                                               const int STUB_THRESHOLD, Label *STUB, Label *DONE) {
  Label TAIL, NEXT_WORD, DIFFERENCE;

  const int base_offset = arrayOopDesc::base_offset_in_bytes(T_BYTE);
  assert((base_offset % (UseCompactObjectHeaders ? 4 :
                         (UseCompressedClassPointers ? 8 : 4))) == 0, "Must be");

  Register strL = isLU ? str1 : str2;
  Register strU = isLU ? str2 : str1;
  Register tmpL = tmp1, tmpU = tmp2;

  // load first parts of strings and finish initialization while loading
  mv(t0, STUB_THRESHOLD);
  bge(cnt2, t0, *STUB);
  lwu(tmpL, Address(strL));
  load_long_misaligned(tmpU, Address(strU), tmp3, (base_offset % 8) != 0 ? 4 : 8);
  subi(cnt2, cnt2, 4);
  add(strL, strL, cnt2);
  sub(cnt1, zr, cnt2);
  slli(cnt2, cnt2, 1);
  add(strU, strU, cnt2);
  inflate_lo32(tmp3, tmpL);
  mv(tmpL, tmp3);
  sub(cnt2, zr, cnt2);
  addi(cnt1, cnt1, 4);
  addi(cnt2, cnt2, 8);
  bne(tmpL, tmpU, DIFFERENCE);
  bgez(cnt2, TAIL);

  // main loop
  bind(NEXT_WORD);
  add(t0, strL, cnt1);
  lwu(tmpL, Address(t0));
  add(t0, strU, cnt2);
  load_long_misaligned(tmpU, Address(t0), tmp3, (base_offset % 8) != 0 ? 4 : 8);
  addi(cnt1, cnt1, 4);
  inflate_lo32(tmp3, tmpL);
  mv(tmpL, tmp3);
  addi(cnt2, cnt2, 8);
  bne(tmpL, tmpU, DIFFERENCE);
  bltz(cnt2, NEXT_WORD);

  bind(TAIL);
  load_int_misaligned(tmpL, Address(strL), tmp3, false);
  load_long_misaligned(tmpU, Address(strU), tmp3, 2);
  inflate_lo32(tmp3, tmpL);
  mv(tmpL, tmp3);
  beq(tmpL, tmpU, *DONE);

  // Find the first different characters in the longwords and
  // compute their difference.
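  // (ctzc_bits counts the trailing zero bits of the XOR, rounded down to a
  // character boundary, so shifting both words right by that count leaves the
  // first differing character in the low bits of each register.)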
  bind(DIFFERENCE);
  xorr(tmp3, tmpL, tmpU);
  // count bits of trailing zero chars
  ctzc_bits(result, tmp3);
  srl(tmpL, tmpL, result);
  srl(tmpU, tmpU, result);
  zext(tmpL, tmpL, 16);
  zext(tmpU, tmpU, 16);
  if (isLU) {
    sub(result, tmpL, tmpU);
  } else {
    sub(result, tmpU, tmpL);
  }
  j(*DONE);
}

// Compare strings.
void C2_MacroAssembler::string_compare(Register str1, Register str2, Register cnt1, Register cnt2,
                                       Register result, Register tmp1, Register tmp2, Register tmp3,
                                       int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, STUB,
        SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
        SHORT_LOOP_START, L;

  const int STUB_THRESHOLD = 64 + 8;
  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  // for L strings, 1 byte for 1 character
  // for U strings, 2 bytes for 1 character
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  int minCharsInWord = isLL ? wordSize : wordSize / 2;

  load_chr_insn str1_load_chr = str1_isL ? (load_chr_insn)&MacroAssembler::lbu : (load_chr_insn)&MacroAssembler::lhu;
  load_chr_insn str2_load_chr = str2_isL ? (load_chr_insn)&MacroAssembler::lbu : (load_chr_insn)&MacroAssembler::lhu;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings, however the result is always in characters.
  if (!str1_isL) {
    sraiw(cnt1, cnt1, 1);
  }
  if (!str2_isL) {
    sraiw(cnt2, cnt2, 1);
  }

  // Compute the minimum of the string lengths and save the difference in result.
  sub(result, cnt1, cnt2);
  bgt(cnt1, cnt2, L);
  mv(cnt2, cnt1);
  bind(L);

  // A very short string
  mv(t0, minCharsInWord);
  ble(cnt2, t0, SHORT_STRING);

  // Compare longwords
  {
    if (str1_isL == str2_isL) { // LL or UU
      string_compare_long_same_encoding(result, str1, str2, isLL, cnt1, cnt2, tmp1, tmp2, tmp3,
                                        STUB_THRESHOLD, &STUB, &SHORT_STRING, &DONE);
    } else { // LU or UL
      string_compare_long_different_encoding(result, str1, str2, isLU, cnt1, cnt2, tmp1, tmp2, tmp3,
                                             STUB_THRESHOLD, &STUB, &DONE);
    }
  }

  bind(STUB);
  RuntimeAddress stub = nullptr;
  switch (ae) {
    case StrIntrinsicNode::LL:
      stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_LL());
      break;
    case StrIntrinsicNode::UU:
      stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_UU());
      break;
    case StrIntrinsicNode::LU:
      stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_LU());
      break;
    case StrIntrinsicNode::UL:
      stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_UL());
      break;
    default:
      ShouldNotReachHere();
  }
  assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
  address call = reloc_call(stub);
  if (call == nullptr) {
    DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL,
                            SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
    ciEnv::current()->record_failure("CodeCache is full");
    return;
  }
  j(DONE);

  bind(SHORT_STRING);
  // Is the minimum length zero?
  beqz(cnt2, DONE);

  // arrange code to do most branches while loading and loading next characters
  // while comparing previous
  (this->*str1_load_chr)(tmp1, Address(str1), t0);
  addi(str1, str1, str1_chr_size);
  subi(cnt2, cnt2, 1);
  beqz(cnt2, SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(str2), t0);
  addi(str2, str2, str2_chr_size);
  j(SHORT_LOOP_START);

  bind(SHORT_LOOP);
  subi(cnt2, cnt2, 1);
  beqz(cnt2, SHORT_LAST);

  bind(SHORT_LOOP_START);
  (this->*str1_load_chr)(tmp2, Address(str1), t0);
  addi(str1, str1, str1_chr_size);
  (this->*str2_load_chr)(t0, Address(str2), t0);
  addi(str2, str2, str2_chr_size);
  bne(tmp1, cnt1, SHORT_LOOP_TAIL);
  subi(cnt2, cnt2, 1);
  beqz(cnt2, SHORT_LAST2);
  (this->*str1_load_chr)(tmp1, Address(str1), t0);
  addi(str1, str1, str1_chr_size);
  (this->*str2_load_chr)(cnt1, Address(str2), t0);
  addi(str2, str2, str2_chr_size);
  beq(tmp2, t0, SHORT_LOOP);
  sub(result, tmp2, t0);
  j(DONE);

  bind(SHORT_LOOP_TAIL);
  sub(result, tmp1, cnt1);
  j(DONE);

  bind(SHORT_LAST2);
  beq(tmp2, t0, DONE);
  sub(result, tmp2, t0);
  j(DONE);

  bind(SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(str2), t0);
  addi(str2, str2, str2_chr_size);

  bind(SHORT_LAST);
  beq(tmp1, cnt1, DONE);
  sub(result, tmp1, cnt1);

  bind(DONE);
  BLOCK_COMMENT("} string_compare");
}

void C2_MacroAssembler::arrays_equals(Register a1, Register a2,
                                      Register tmp1, Register tmp2, Register tmp3,
                                      Register result, int elem_size) {
  assert(elem_size == 1 || elem_size == 2, "must be char or byte");
  assert_different_registers(a1, a2, result, tmp1, tmp2, tmp3, t0);

  int elem_per_word = wordSize / elem_size;
  int log_elem_size = exact_log2(elem_size);
  int length_offset = arrayOopDesc::length_offset_in_bytes();
  int base_offset = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
  assert((base_offset % (UseCompactObjectHeaders ? 4 :
                         (UseCompressedClassPointers ? 8 : 4))) == 0, "Must be");

  Register cnt1 = tmp3;
  Register cnt2 = tmp1; // cnt2 only used in array length compare
  Label DONE, SAME, NEXT_WORD, SHORT, TAIL03, TAIL01;

  BLOCK_COMMENT("arrays_equals {");

  // if (a1 == a2), return true
  beq(a1, a2, SAME);

  mv(result, false);
  // if (a1 == nullptr || a2 == nullptr)
  //     return false;
  beqz(a1, DONE);
  beqz(a2, DONE);
  // if (a1.length != a2.length)
  //      return false;
  lwu(cnt1, Address(a1, length_offset));
  lwu(cnt2, Address(a2, length_offset));
  bne(cnt1, cnt2, DONE);

  la(a1, Address(a1, base_offset));
  la(a2, Address(a2, base_offset));
  // Load 4 bytes once to compare for alignment before main loop.
  if (AvoidUnalignedAccesses && (base_offset % 8) != 0) {
    subi(cnt1, cnt1, elem_per_word / 2);
    bltz(cnt1, TAIL03);
    lwu(tmp1, Address(a1));
    lwu(tmp2, Address(a2));
    addi(a1, a1, 4);
    addi(a2, a2, 4);
    bne(tmp1, tmp2, DONE);
  }

  // Check for short strings, i.e. smaller than wordSize.
  subi(cnt1, cnt1, elem_per_word);
  bltz(cnt1, SHORT);

#ifdef ASSERT
  if (AvoidUnalignedAccesses) {
    Label align_ok;
    orr(t0, a1, a2);
    andi(t0, t0, 0x7);
    beqz(t0, align_ok);
    stop("bad alignment");
    bind(align_ok);
  }
#endif

  // Main 8 byte comparison loop.
  bind(NEXT_WORD); {
    ld(tmp1, Address(a1));
    ld(tmp2, Address(a2));
    subi(cnt1, cnt1, elem_per_word);
    addi(a1, a1, wordSize);
    addi(a2, a2, wordSize);
    bne(tmp1, tmp2, DONE);
  } bgez(cnt1, NEXT_WORD);

  addi(tmp1, cnt1, elem_per_word);
  beqz(tmp1, SAME);

  bind(SHORT);
  test_bit(tmp1, cnt1, 2 - log_elem_size);
  beqz(tmp1, TAIL03); // 0-7 bytes left.
  {
    lwu(tmp1, Address(a1));
    lwu(tmp2, Address(a2));
    addi(a1, a1, 4);
    addi(a2, a2, 4);
    bne(tmp1, tmp2, DONE);
  }

  bind(TAIL03);
  test_bit(tmp1, cnt1, 1 - log_elem_size);
  beqz(tmp1, TAIL01); // 0-3 bytes left.
  {
    lhu(tmp1, Address(a1));
    lhu(tmp2, Address(a2));
    addi(a1, a1, 2);
    addi(a2, a2, 2);
    bne(tmp1, tmp2, DONE);
  }

  bind(TAIL01);
  if (elem_size == 1) { // Only needed when comparing byte arrays.
    test_bit(tmp1, cnt1, 0);
    beqz(tmp1, SAME); // 0-1 bytes left.
    {
      lbu(tmp1, Address(a1));
      lbu(tmp2, Address(a2));
      bne(tmp1, tmp2, DONE);
    }
  }

  bind(SAME);
  mv(result, true);
  // That's it.
  bind(DONE);

  BLOCK_COMMENT("} arrays_equals");
}

// Compare Strings

// For Strings we're passed the address of the first characters in a1 and a2
// and the length in cnt1. There are two implementations.
// For arrays >= 8 bytes, all comparisons (except for the tail) are performed
// 8 bytes at a time. For the tail, we compare a halfword, then a short, and then a byte.
// For strings < 8 bytes, we compare a halfword, then a short, and then a byte.
void C2_MacroAssembler::string_equals(Register a1, Register a2,
                                      Register result, Register cnt1) {
  Label SAME, DONE, SHORT, NEXT_WORD, TAIL03, TAIL01;

  Register tmp1 = t0;
  Register tmp2 = t1;

  assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2);

  int base_offset = arrayOopDesc::base_offset_in_bytes(T_BYTE);
  assert((base_offset % (UseCompactObjectHeaders ? 4 :
                         (UseCompressedClassPointers ? 8 : 4))) == 0, "Must be");

  BLOCK_COMMENT("string_equals {");

  mv(result, false);

  // Load 4 bytes once to compare for alignment before main loop.
  if (AvoidUnalignedAccesses && (base_offset % 8) != 0) {
    subi(cnt1, cnt1, 4);
    bltz(cnt1, TAIL03);
    lwu(tmp1, Address(a1));
    lwu(tmp2, Address(a2));
    addi(a1, a1, 4);
    addi(a2, a2, 4);
    bne(tmp1, tmp2, DONE);
  }

  // Check for short strings, i.e. smaller than wordSize.
  subi(cnt1, cnt1, wordSize);
  bltz(cnt1, SHORT);

#ifdef ASSERT
  if (AvoidUnalignedAccesses) {
    Label align_ok;
    orr(t0, a1, a2);
    andi(t0, t0, 0x7);
    beqz(t0, align_ok);
    stop("bad alignment");
    bind(align_ok);
  }
#endif

  // Main 8 byte comparison loop.
  bind(NEXT_WORD); {
    ld(tmp1, Address(a1));
    ld(tmp2, Address(a2));
    subi(cnt1, cnt1, wordSize);
    addi(a1, a1, wordSize);
    addi(a2, a2, wordSize);
    bne(tmp1, tmp2, DONE);
  } bgez(cnt1, NEXT_WORD);

  addi(tmp1, cnt1, wordSize);
  beqz(tmp1, SAME);

  bind(SHORT);
  // 0-7 bytes left.
  test_bit(tmp1, cnt1, 2);
  beqz(tmp1, TAIL03);
  {
    lwu(tmp1, Address(a1));
    lwu(tmp2, Address(a2));
    addi(a1, a1, 4);
    addi(a2, a2, 4);
    bne(tmp1, tmp2, DONE);
  }

  bind(TAIL03);
  // 0-3 bytes left.
  test_bit(tmp1, cnt1, 1);
  beqz(tmp1, TAIL01);
  {
    lhu(tmp1, Address(a1));
    lhu(tmp2, Address(a2));
    addi(a1, a1, 2);
    addi(a2, a2, 2);
    bne(tmp1, tmp2, DONE);
  }

  bind(TAIL01);
  // 0-1 bytes left.
  test_bit(tmp1, cnt1, 0);
  beqz(tmp1, SAME);
  {
    lbu(tmp1, Address(a1));
    lbu(tmp2, Address(a2));
    bne(tmp1, tmp2, DONE);
  }

  // Arrays are equal.
  bind(SAME);
  mv(result, true);

  // That's it.
  bind(DONE);

  BLOCK_COMMENT("} string_equals");
}

// jdk.internal.util.ArraysSupport.vectorizedHashCode
void C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result,
                                        Register tmp1, Register tmp2, Register tmp3,
                                        Register tmp4, Register tmp5, Register tmp6,
                                        BasicType eltype) {
  assert(!UseRVV, "sanity");
  assert_different_registers(ary, cnt, result, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, t0, t1);

  const int elsize = arrays_hashcode_elsize(eltype);
  const int chunks_end_shift = exact_log2(elsize);

  switch (eltype) {
  case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
  case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
  case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
  case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
  case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
  default:
    ShouldNotReachHere();
  }

  const int stride = 4;
  const Register pow31_4 = tmp1;
  const Register pow31_3 = tmp2;
  const Register pow31_2 = tmp3;
  const Register chunks  = tmp4;
  const Register chunks_end = chunks;

  Label DONE, TAIL, TAIL_LOOP, WIDE_LOOP;

  // result has a value initially

  beqz(cnt, DONE);

  andi(chunks, cnt, ~(stride - 1));
  beqz(chunks, TAIL);

  mv(pow31_4, 923521);           // [31^^4]
  mv(pow31_3,  29791);           // [31^^3]
  mv(pow31_2,    961);           // [31^^2]

  shadd(chunks_end, chunks, ary, t0, chunks_end_shift);
  andi(cnt, cnt, stride - 1);    // don't forget about tail!

  bind(WIDE_LOOP);
  arrays_hashcode_elload(t0,   Address(ary, 0 * elsize), eltype);
  arrays_hashcode_elload(t1,   Address(ary, 1 * elsize), eltype);
  arrays_hashcode_elload(tmp5, Address(ary, 2 * elsize), eltype);
  arrays_hashcode_elload(tmp6, Address(ary, 3 * elsize), eltype);
  mulw(result, result, pow31_4); // 31^^4 * h
  mulw(t0, t0, pow31_3);         // 31^^3 * ary[i+0]
  addw(result, result, t0);
  mulw(t1, t1, pow31_2);         // 31^^2 * ary[i+1]
  addw(result, result, t1);
  slli(t0, tmp5, 5);             // optimize 31^^1 * ary[i+2]
  subw(tmp5, t0, tmp5);          // with ary[i+2]<<5 - ary[i+2]
  addw(result, result, tmp5);
  addw(result, result, tmp6);    // 31^^4 * h + 31^^3 * ary[i+0] + 31^^2 * ary[i+1]
                                 //           + 31^^1 * ary[i+2] + 31^^0 * ary[i+3]
  addi(ary, ary, elsize * stride);
  bne(ary, chunks_end, WIDE_LOOP);
  beqz(cnt, DONE);

  bind(TAIL);
  shadd(chunks_end, cnt, ary, t0, chunks_end_shift);

  bind(TAIL_LOOP);
  arrays_hashcode_elload(t0, Address(ary), eltype);
  slli(t1, result, 5);           // optimize 31 * result
  subw(result, t1, result);      // with result<<5 - result
  addw(result, result, t0);
  addi(ary, ary, elsize);
  bne(ary, chunks_end, TAIL_LOOP);

  bind(DONE);
  BLOCK_COMMENT("} // arrays_hashcode");
}

void C2_MacroAssembler::arrays_hashcode_v(Register ary, Register cnt, Register result,
                                          Register tmp1, Register tmp2, Register tmp3,
                                          BasicType eltype) {
  assert(UseRVV, "sanity");
  assert(StubRoutines::riscv::arrays_hashcode_powers_of_31() != nullptr, "sanity");
  assert_different_registers(ary, cnt, result, tmp1, tmp2, tmp3, t0, t1);

  // The MaxVectorSize should have been set by detecting RVV max vector register
  // size when check UseRVV (i.e. MaxVectorSize == VM_Version::_initial_vector_length).

  // Let's use T_INT as all hashCode calculations eventually deal with ints.
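  //
  // Per chunk of `stride` elements this performs the usual Horner update
  //   h = 31^stride * h + 31^(stride-1)*a[0] + ... + 31^0*a[stride-1]
  // keeping the weighted lane sums in v_sum (vmadd_vx folds the previous
  // v_sum, scaled by 31^stride, into the new lanes) while the scalar result
  // is multiplied by 31^stride each iteration; a final vector reduction adds
  // the lane sums back into result.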
  const int lmul = 2;
  const int stride = MaxVectorSize / sizeof(jint) * lmul;
  const int elsize_bytes = arrays_hashcode_elsize(eltype);
  const int elsize_shift = exact_log2(elsize_bytes);

  switch (eltype) {
  case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode_v(unsigned byte) {"); break;
  case T_CHAR:    BLOCK_COMMENT("arrays_hashcode_v(char) {");          break;
  case T_BYTE:    BLOCK_COMMENT("arrays_hashcode_v(byte) {");          break;
  case T_SHORT:   BLOCK_COMMENT("arrays_hashcode_v(short) {");         break;
  case T_INT:     BLOCK_COMMENT("arrays_hashcode_v(int) {");           break;
  default:
    ShouldNotReachHere();
  }

  const Register pow31_highest = tmp1;
  const Register ary_end = tmp2;
  const Register consumed = tmp3;

  const VectorRegister v_sum = v2;
  const VectorRegister v_src = v4;
  const VectorRegister v_coeffs = v6;
  const VectorRegister v_tmp = v8;

  const address adr_pows31 = StubRoutines::riscv::arrays_hashcode_powers_of_31() + sizeof(jint);

  Label VEC_LOOP, DONE, SCALAR_TAIL, SCALAR_TAIL_LOOP;

  // NB: at this point (a) 'result' already has some value,
  // (b) 'cnt' is not 0 or 1, see java code for details.

  andi(t0, cnt, ~(stride - 1));
  beqz(t0, SCALAR_TAIL);

  la(t1, ExternalAddress(adr_pows31));
  lw(pow31_highest, Address(t1, -1 * sizeof(jint)));

  vsetvli(consumed, cnt, Assembler::e32, Assembler::m2);
  vle32_v(v_coeffs, t1); // 31^^(stride - 1) ... 31^^0
  vmv_v_x(v_sum, x0);

  bind(VEC_LOOP);
  arrays_hashcode_elload_v(v_src, v_tmp, ary, eltype);
  vmul_vv(v_src, v_src, v_coeffs);
  vmadd_vx(v_sum, pow31_highest, v_src);
  mulw(result, result, pow31_highest);
  shadd(ary, consumed, ary, t0, elsize_shift);
  subw(cnt, cnt, consumed);
  andi(t1, cnt, ~(stride - 1));
  bnez(t1, VEC_LOOP);

  vmv_s_x(v_tmp, x0);
  vredsum_vs(v_sum, v_sum, v_tmp);
  vmv_x_s(t0, v_sum);
  addw(result, result, t0);

  beqz(cnt, DONE);

  bind(SCALAR_TAIL);
  shadd(ary_end, cnt, ary, t0, elsize_shift);

  bind(SCALAR_TAIL_LOOP);
  arrays_hashcode_elload(t0, Address(ary), eltype);
  slli(t1, result, 5);      // optimize 31 * result
  subw(result, t1, result); // with result<<5 - result
  addw(result, result, t0);
  addi(ary, ary, elsize_bytes);
  bne(ary, ary_end, SCALAR_TAIL_LOOP);

  bind(DONE);
  BLOCK_COMMENT("} // arrays_hashcode_v");
}

int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
  switch (eltype) {
  case T_BOOLEAN: return sizeof(jboolean);
  case T_BYTE:    return sizeof(jbyte);
  case T_SHORT:   return sizeof(jshort);
  case T_CHAR:    return sizeof(jchar);
  case T_INT:     return sizeof(jint);
  default:
    ShouldNotReachHere();
    return -1;
  }
}

void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
  switch (eltype) {
  // T_BOOLEAN used as surrogate for unsigned byte
  case T_BOOLEAN: lbu(dst, src); break;
  case T_BYTE:    lb(dst, src);  break;
  case T_SHORT:   lh(dst, src);  break;
  case T_CHAR:    lhu(dst, src); break;
  case T_INT:     lw(dst, src);  break;
  default:
    ShouldNotReachHere();
  }
}

void C2_MacroAssembler::arrays_hashcode_elload_v(VectorRegister vdst, VectorRegister vtmp,
                                                 Register src, BasicType eltype) {
  assert_different_registers(vdst, vtmp);
  switch (eltype) {
  case T_BOOLEAN: vle8_v(vtmp, src);  vzext_vf4(vdst, vtmp); break;
  case T_BYTE:    vle8_v(vtmp, src);  vsext_vf4(vdst, vtmp); break;
  case T_CHAR:    vle16_v(vtmp, src); vzext_vf2(vdst, vtmp); break;
  case T_SHORT:   vle16_v(vtmp, src); vsext_vf2(vdst, vtmp); break;
  case T_INT:     vle32_v(vdst, src);                        break;
  default:
    ShouldNotReachHere();
  }
}

typedef void (Assembler::*conditional_branch_insn)(Register op1, Register op2, Label& label, bool is_far);
typedef void (MacroAssembler::*float_conditional_branch_insn)(FloatRegister op1, FloatRegister op2,
                                                              Label& label, bool is_far, bool is_unordered);
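// The two tables below are indexed directly by cmpFlag: the BoolTest
// condition selects an entry within a half, and the unsigned_branch_mask
// (resp. double_branch_mask) bit selects the unsigned (resp. double)
// second half.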
static conditional_branch_insn conditional_branches[] = {
  /* SHORT branches */
  (conditional_branch_insn)&MacroAssembler::beq,
  (conditional_branch_insn)&MacroAssembler::bgt,
  nullptr, // BoolTest::overflow
  (conditional_branch_insn)&MacroAssembler::blt,
  (conditional_branch_insn)&MacroAssembler::bne,
  (conditional_branch_insn)&MacroAssembler::ble,
  nullptr, // BoolTest::no_overflow
  (conditional_branch_insn)&MacroAssembler::bge,

  /* UNSIGNED branches */
  (conditional_branch_insn)&MacroAssembler::beq,
  (conditional_branch_insn)&MacroAssembler::bgtu,
  nullptr,
  (conditional_branch_insn)&MacroAssembler::bltu,
  (conditional_branch_insn)&MacroAssembler::bne,
  (conditional_branch_insn)&MacroAssembler::bleu,
  nullptr,
  (conditional_branch_insn)&MacroAssembler::bgeu
};

static float_conditional_branch_insn float_conditional_branches[] = {
  /* FLOAT SHORT branches */
  (float_conditional_branch_insn)&MacroAssembler::float_beq,
  (float_conditional_branch_insn)&MacroAssembler::float_bgt,
  nullptr, // BoolTest::overflow
  (float_conditional_branch_insn)&MacroAssembler::float_blt,
  (float_conditional_branch_insn)&MacroAssembler::float_bne,
  (float_conditional_branch_insn)&MacroAssembler::float_ble,
  nullptr, // BoolTest::no_overflow
  (float_conditional_branch_insn)&MacroAssembler::float_bge,

  /* DOUBLE SHORT branches */
  (float_conditional_branch_insn)&MacroAssembler::double_beq,
  (float_conditional_branch_insn)&MacroAssembler::double_bgt,
  nullptr,
  (float_conditional_branch_insn)&MacroAssembler::double_blt,
  (float_conditional_branch_insn)&MacroAssembler::double_bne,
  (float_conditional_branch_insn)&MacroAssembler::double_ble,
  nullptr,
  (float_conditional_branch_insn)&MacroAssembler::double_bge
};

void C2_MacroAssembler::cmp_branch(int cmpFlag, Register op1, Register op2, Label& label, bool is_far) {
  assert(cmpFlag >= 0 && cmpFlag < (int)(sizeof(conditional_branches) / sizeof(conditional_branches[0])),
         "invalid conditional branch index");
  (this->*conditional_branches[cmpFlag])(op1, op2, label, is_far);
}

// This function should only be used by C2. It flips the unordered flag for
// the greater-than conditions: C2 uses unordered-lesser instead of
// unordered-greater, and finally commutes the result bits in
// do_one_bytecode().
void C2_MacroAssembler::float_cmp_branch(int cmpFlag, FloatRegister op1, FloatRegister op2, Label& label, bool is_far) {
  assert(cmpFlag >= 0 && cmpFlag < (int)(sizeof(float_conditional_branches) / sizeof(float_conditional_branches[0])),
         "invalid float conditional branch index");
  int booltest_flag = cmpFlag & ~(C2_MacroAssembler::double_branch_mask);
  (this->*float_conditional_branches[cmpFlag])(op1, op2, label, is_far,
    (booltest_flag == (BoolTest::ge) || booltest_flag == (BoolTest::gt)) ? false : true);
}
void C2_MacroAssembler::enc_cmpUEqNeLeGt_imm0_branch(int cmpFlag, Register op1, Label& L, bool is_far) {
  switch (cmpFlag) {
    case BoolTest::eq:
    case BoolTest::le:
      beqz(op1, L, is_far);
      break;
    case BoolTest::ne:
    case BoolTest::gt:
      bnez(op1, L, is_far);
      break;
    default:
      ShouldNotReachHere();
  }
}

void C2_MacroAssembler::enc_cmpEqNe_imm0_branch(int cmpFlag, Register op1, Label& L, bool is_far) {
  switch (cmpFlag) {
    case BoolTest::eq:
      beqz(op1, L, is_far);
      break;
    case BoolTest::ne:
      bnez(op1, L, is_far);
      break;
    default:
      ShouldNotReachHere();
  }
}

void C2_MacroAssembler::enc_cmove(int cmpFlag, Register op1, Register op2, Register dst, Register src) {
  bool is_unsigned = (cmpFlag & unsigned_branch_mask) == unsigned_branch_mask;
  int op_select = cmpFlag & (~unsigned_branch_mask);

  switch (op_select) {
    case BoolTest::eq:
      cmov_eq(op1, op2, dst, src);
      break;
    case BoolTest::ne:
      cmov_ne(op1, op2, dst, src);
      break;
    case BoolTest::le:
      if (is_unsigned) {
        cmov_leu(op1, op2, dst, src);
      } else {
        cmov_le(op1, op2, dst, src);
      }
      break;
    case BoolTest::ge:
      if (is_unsigned) {
        cmov_geu(op1, op2, dst, src);
      } else {
        cmov_ge(op1, op2, dst, src);
      }
      break;
    case BoolTest::lt:
      if (is_unsigned) {
        cmov_ltu(op1, op2, dst, src);
      } else {
        cmov_lt(op1, op2, dst, src);
      }
      break;
    case BoolTest::gt:
      if (is_unsigned) {
        cmov_gtu(op1, op2, dst, src);
      } else {
        cmov_gt(op1, op2, dst, src);
      }
      break;
    default:
      assert(false, "unsupported compare condition");
      ShouldNotReachHere();
  }
}

void C2_MacroAssembler::enc_cmove_cmp_fp(int cmpFlag, FloatRegister op1, FloatRegister op2,
                                         Register dst, Register src, bool is_single) {
  int op_select = cmpFlag & (~unsigned_branch_mask);

  switch (op_select) {
    case BoolTest::eq:
      cmov_cmp_fp_eq(op1, op2, dst, src, is_single);
      break;
    case BoolTest::ne:
      cmov_cmp_fp_ne(op1, op2, dst, src, is_single);
      break;
    case BoolTest::le:
      cmov_cmp_fp_le(op1, op2, dst, src, is_single);
      break;
    case BoolTest::ge:
      cmov_cmp_fp_ge(op1, op2, dst, src, is_single);
      break;
    case BoolTest::lt:
      cmov_cmp_fp_lt(op1, op2, dst, src, is_single);
      break;
    case BoolTest::gt:
      cmov_cmp_fp_gt(op1, op2, dst, src, is_single);
      break;
    default:
      assert(false, "unsupported compare condition");
      ShouldNotReachHere();
  }
}

void C2_MacroAssembler::enc_cmove_fp_cmp(int cmpFlag, Register op1, Register op2,
                                         FloatRegister dst, FloatRegister src, bool is_single) {
  bool is_unsigned = (cmpFlag & unsigned_branch_mask) == unsigned_branch_mask;
  int op_select = cmpFlag & (~unsigned_branch_mask);

  switch (op_select) {
    case BoolTest::eq:
      cmov_fp_eq(op1, op2, dst, src, is_single);
      break;
    case BoolTest::ne:
      cmov_fp_ne(op1, op2, dst, src, is_single);
      break;
    case BoolTest::le:
      if (is_unsigned) {
        cmov_fp_leu(op1, op2, dst, src, is_single);
      } else {
        cmov_fp_le(op1, op2, dst, src, is_single);
      }
      break;
    case BoolTest::ge:
      if (is_unsigned) {
        cmov_fp_geu(op1, op2, dst, src, is_single);
      } else {
        cmov_fp_ge(op1, op2, dst, src, is_single);
      }
      break;
    case BoolTest::lt:
      if (is_unsigned) {
        cmov_fp_ltu(op1, op2, dst, src, is_single);
      } else {
        cmov_fp_lt(op1, op2, dst, src, is_single);
      }
      break;
    case BoolTest::gt:
      if (is_unsigned) {
        cmov_fp_gtu(op1, op2, dst, src, is_single);
      } else {
        cmov_fp_gt(op1, op2, dst, src, is_single);
      }
      break;
    default:
      assert(false, "unsupported compare condition");
      ShouldNotReachHere();
  }
}

void C2_MacroAssembler::enc_cmove_fp_cmp_fp(int cmpFlag,
                                            FloatRegister op1, FloatRegister op2,
                                            FloatRegister dst, FloatRegister src,
                                            bool cmp_single, bool cmov_single) {
  int op_select = cmpFlag & (~unsigned_branch_mask);

  switch (op_select) {
    case BoolTest::eq:
void C2_MacroAssembler::enc_cmove_fp_cmp_fp(int cmpFlag,
                                            FloatRegister op1, FloatRegister op2,
                                            FloatRegister dst, FloatRegister src,
                                            bool cmp_single, bool cmov_single) {
  int op_select = cmpFlag & (~unsigned_branch_mask);

  switch (op_select) {
    case BoolTest::eq:
      cmov_fp_cmp_fp_eq(op1, op2, dst, src, cmp_single, cmov_single);
      break;
    case BoolTest::ne:
      cmov_fp_cmp_fp_ne(op1, op2, dst, src, cmp_single, cmov_single);
      break;
    case BoolTest::le:
      cmov_fp_cmp_fp_le(op1, op2, dst, src, cmp_single, cmov_single);
      break;
    case BoolTest::ge:
      cmov_fp_cmp_fp_ge(op1, op2, dst, src, cmp_single, cmov_single);
      break;
    case BoolTest::lt:
      cmov_fp_cmp_fp_lt(op1, op2, dst, src, cmp_single, cmov_single);
      break;
    case BoolTest::gt:
      cmov_fp_cmp_fp_gt(op1, op2, dst, src, cmp_single, cmov_single);
      break;
    default:
      assert(false, "unsupported compare condition");
      ShouldNotReachHere();
  }
}

// Set dst to NaN if any NaN input.
void C2_MacroAssembler::minmax_fp(FloatRegister dst, FloatRegister src1, FloatRegister src2,
                                  FLOAT_TYPE ft, bool is_min) {
  assert_cond((ft != FLOAT_TYPE::half_precision) || UseZfh);

  Label Done, Compare;

  switch (ft) {
    case FLOAT_TYPE::half_precision:
      fclass_h(t0, src1);
      fclass_h(t1, src2);
      orr(t0, t0, t1);
      andi(t0, t0, FClassBits::nan); // if src1 or src2 is quiet or signaling NaN then return NaN
      beqz(t0, Compare);
      fadd_h(dst, src1, src2);
      j(Done);

      bind(Compare);
      if (is_min) {
        fmin_h(dst, src1, src2);
      } else {
        fmax_h(dst, src1, src2);
      }
      break;
    case FLOAT_TYPE::single_precision:
      fclass_s(t0, src1);
      fclass_s(t1, src2);
      orr(t0, t0, t1);
      andi(t0, t0, FClassBits::nan); // if src1 or src2 is quiet or signaling NaN then return NaN
      beqz(t0, Compare);
      fadd_s(dst, src1, src2);
      j(Done);

      bind(Compare);
      if (is_min) {
        fmin_s(dst, src1, src2);
      } else {
        fmax_s(dst, src1, src2);
      }
      break;
    case FLOAT_TYPE::double_precision:
      fclass_d(t0, src1);
      fclass_d(t1, src2);
      orr(t0, t0, t1);
      andi(t0, t0, FClassBits::nan); // if src1 or src2 is quiet or signaling NaN then return NaN
      beqz(t0, Compare);
      fadd_d(dst, src1, src2);
      j(Done);

      bind(Compare);
      if (is_min) {
        fmin_d(dst, src1, src2);
      } else {
        fmax_d(dst, src1, src2);
      }
      break;
    default:
      ShouldNotReachHere();
  }

  bind(Done);
}

// According to the Java SE specification, for floating-point round operations, if
// the input is NaN, +/-Infinity, or +/-0, the same input is returned as the
// rounded result. This differs from the behavior of the RISC-V fcvt instructions
// (which saturate out-of-range values to the nearest representable min/max value),
// so NaN, +/-Infinity and +/-0 need special handling.
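// Illustrative note (informal): per the RISC-V spec, fcvt.l.d saturates NaN and
// positive overflow to 0x7fff_ffff_ffff_ffff, and negative overflow to
// 0x8000_0000_0000_0000. Adding 1 and clearing bit 0 maps both saturated
// patterns to 0x8000_0000_0000_0000, which round_double_mode below compares
// against to detect a "bad" conversion and return the original src instead.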
void C2_MacroAssembler::round_double_mode(FloatRegister dst, FloatRegister src, int round_mode,
                                          Register tmp1, Register tmp2, Register tmp3) {
  assert_different_registers(dst, src);
  assert_different_registers(tmp1, tmp2, tmp3);

  // Set rounding mode for conversions.
  // Here we use similar modes for the double->long and long->double conversions.
  // A different mode for the long->double conversion would matter only if the long
  // value were not representable as a double; since the long value here is itself
  // the result of a double->long conversion, it is definitely representable.
  RoundingMode rm;
  switch (round_mode) {
    case RoundDoubleModeNode::rmode_ceil:
      rm = RoundingMode::rup;
      break;
    case RoundDoubleModeNode::rmode_floor:
      rm = RoundingMode::rdn;
      break;
    case RoundDoubleModeNode::rmode_rint:
      rm = RoundingMode::rne;
      break;
    default:
      ShouldNotReachHere();
  }

  // tmp1 - register to store the double converted to long
  // tmp2 - register to create the constant for comparison
  // tmp3 - register to store the modified result of the double->long conversion
  Label done, bad_val;

  // Conversion from double to long.
  fcvt_l_d(tmp1, src, rm);

  // Generate constant (tmp2)
  // tmp2 = 100...0000
  addi(tmp2, zr, 1);
  slli(tmp2, tmp2, 63);

  // Prepare converted long (tmp1):
  // as a result of an overflowing conversion we get
  //   tmp1 = 011...1111 or 100...0000.
  // Convert it to: tmp3 = 100...0000
  addi(tmp3, tmp1, 1);
  andi(tmp3, tmp3, -2);
  beq(tmp3, tmp2, bad_val);

  // Conversion from long to double.
  fcvt_d_l(dst, tmp1, rm);

  // Add the sign of the input value to the result for the +/-0 cases.
  fsgnj_d(dst, dst, src);
  j(done);

  // If the conversion overflowed, return src.
  bind(bad_val);
  fmv_d(dst, src);

  bind(done);
}

// According to the Java SE specification, for floating-point signum operations,
// if the input is NaN or a +/-0.0 value we should return it unchanged, otherwise
// return +/-1.0 using the sign of the input.
// one - gives us a floating-point 1.0 (obtained from the matching rule)
// is_double - specifies whether single or double precision operations will be used.
void C2_MacroAssembler::signum_fp(FloatRegister dst, FloatRegister one, bool is_double) {
  Label done;
  is_double ? fclass_d(t0, dst) : fclass_s(t0, dst);

  // check if input is -0, +0, signaling NaN or quiet NaN
  andi(t0, t0, FClassBits::zero | FClassBits::nan);

  bnez(t0, done);

  // use floating-point 1.0 with the sign of the input
  is_double ? fsgnj_d(dst, one, dst) : fsgnj_s(dst, one, dst);
  bind(done);
}

static void float16_to_float_slow_path(C2_MacroAssembler& masm, C2GeneralStub<FloatRegister, Register, Register>& stub) {
#define __ masm.
  FloatRegister dst = stub.data<0>();
  Register src = stub.data<1>();
  Register tmp = stub.data<2>();
  __ bind(stub.entry());

  // The following instructions mainly focus on NaN, as riscv does not handle
  // NaN well with fcvt, but the code also works for Inf at the same time.

  // Construct a NaN in 32 bits from the NaN in 16 bits:
  // we need the payloads of non-canonical NaNs to be preserved.
  __ mv(tmp, 0x7f800000);
  // sign-bit was already set via sign-extension if necessary.
  __ slli(t0, src, 13);
  __ orr(tmp, t0, tmp);
  __ fmv_w_x(dst, tmp);

  __ j(stub.continuation());
#undef __
}
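// Worked example (informal) of the widening done by the stub above: the
// half-precision quiet NaN 0x7e01 (quiet bit set, payload 0x001) shifts left by
// 13 to 0x0fc0_2000; OR-ing in the float exponent mask 0x7f80_0000 yields
// 0x7fc0_2000, a float NaN whose quiet bit and payload match what
// j.l.Float.float16ToFloat specifies.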
// j.l.Float.float16ToFloat
void C2_MacroAssembler::float16_to_float(FloatRegister dst, Register src, Register tmp) {
  auto stub = C2CodeStub::make<FloatRegister, Register, Register>(dst, src, tmp, 20, float16_to_float_slow_path);

  // On riscv, NaN needs special handling, as fcvt does not work in that case.
  // On riscv, Inf does not need special handling, as fcvt can convert it correctly,
  // but we let the slow path handle NaN and Inf together: both are rare cases, and
  // having the slow path handle only the NaN case would sacrifice performance for
  // the normal (non-NaN and non-Inf) cases.

  // check whether it's a NaN or +/- Inf.
  mv(t0, 0x7c00);
  andr(tmp, src, t0);
  // jump to stub processing NaN and Inf cases.
  beq(t0, tmp, stub->entry(), true);

  // non-NaN or non-Inf cases, just use built-in instructions.
  fmv_h_x(dst, src);
  fcvt_s_h(dst, dst);

  bind(stub->continuation());
}

static void float_to_float16_slow_path(C2_MacroAssembler& masm, C2GeneralStub<Register, FloatRegister, Register>& stub) {
#define __ masm.
  Register dst = stub.data<0>();
  FloatRegister src = stub.data<1>();
  Register tmp = stub.data<2>();
  __ bind(stub.entry());

  __ float_to_float16_NaN(dst, src, t0, tmp);

  __ j(stub.continuation());
#undef __
}

// j.l.Float.floatToFloat16
void C2_MacroAssembler::float_to_float16(Register dst, FloatRegister src, FloatRegister ftmp, Register xtmp) {
  auto stub = C2CodeStub::make<Register, FloatRegister, Register>(dst, src, xtmp, 64, float_to_float16_slow_path);

  // On riscv, NaN needs special handling, as fcvt does not work in that case.

  // check whether it's a NaN.
  // replace fclass with feq as performance optimization.
  feq_s(t0, src, src);
  // jump to stub processing NaN cases.
  beqz(t0, stub->entry(), true);

  // non-NaN cases, just use built-in instructions.
  fcvt_h_s(ftmp, src);
  fmv_x_h(dst, ftmp);

  bind(stub->continuation());
}
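// Note (informal): feq.s rd, rs, rs writes 0 exactly when rs is NaN, since a NaN
// never compares equal to itself; this is why the single feq_s above can replace
// a fclass + mask test when only the NaN case needs to be distinguished.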
static void float16_to_float_v_slow_path(C2_MacroAssembler& masm, C2GeneralStub<VectorRegister, VectorRegister, uint>& stub) {
#define __ masm.
  VectorRegister dst = stub.data<0>();
  VectorRegister src = stub.data<1>();
  uint vector_length = stub.data<2>();
  __ bind(stub.entry());

  // The following instructions mainly focus on NaN, as riscv does not handle
  // NaN well with vfwcvt_f_f_v, but the code also works for Inf at the same time.
  //
  // Construct NaNs in 32 bits from the NaNs in 16 bits:
  // we need the payloads of non-canonical NaNs to be preserved.

  // adjust vector type to 2 * SEW.
  __ vsetvli_helper(T_FLOAT, vector_length, Assembler::m1);
  // widen and sign-extend src data.
  __ vsext_vf2(dst, src, Assembler::v0_t);

  __ mv(t0, 0x7f800000);
  // sign-bit was already set via sign-extension if necessary.
  __ vsll_vi(dst, dst, 13, Assembler::v0_t);
  __ vor_vx(dst, dst, t0, Assembler::v0_t);

  __ j(stub.continuation());
#undef __
}

// j.l.Float.float16ToFloat
void C2_MacroAssembler::float16_to_float_v(VectorRegister dst, VectorRegister src, uint vector_length) {
  auto stub = C2CodeStub::make<VectorRegister, VectorRegister, uint>
              (dst, src, vector_length, 24, float16_to_float_v_slow_path);
  assert_different_registers(dst, src);

  // On riscv, NaN needs special handling, as vfwcvt_f_f_v does not work in that case.
  // On riscv, Inf does not need special handling, as vfwcvt_f_f_v can convert it correctly,
  // but we let the slow path handle NaN and Inf together: both are rare cases, and
  // having the slow path handle only the NaN case would sacrifice performance for
  // the normal (non-NaN and non-Inf) cases.

  vsetvli_helper(BasicType::T_SHORT, vector_length, Assembler::mf2);

  // check whether there is a NaN or +/- Inf.
  mv(t0, 0x7c00);
  vand_vx(v0, src, t0);
  // v0 will be used as the mask in the slow path.
  vmseq_vx(v0, v0, t0);
  vcpop_m(t0, v0);

  // For non-NaN or non-Inf cases, just use built-in instructions.
  vfwcvt_f_f_v(dst, src);

  // jump to stub processing NaN and Inf cases if there are any of them in the vector.
  bnez(t0, stub->entry(), true);

  bind(stub->continuation());
}

static void float_to_float16_v_slow_path(C2_MacroAssembler& masm,
                                         C2GeneralStub<VectorRegister, VectorRegister, VectorRegister>& stub) {
#define __ masm.
  VectorRegister dst = stub.data<0>();
  VectorRegister src = stub.data<1>();
  VectorRegister vtmp = stub.data<2>();
  assert_different_registers(dst, src, vtmp);
  __ bind(stub.entry());

  // Active elements (NaNs) are marked in the v0 mask register.
  // lmul was already set to mf2 in float_to_float16_v.

  // Float (32 bits)
  // Bit:  31  30 to 23           22 to 0
  //      +---+------------------+-----------------------------+
  //      | S | Exponent         | Mantissa (Fraction)         |
  //      +---+------------------+-----------------------------+
  //      1 bit  8 bits            23 bits
  //
  // Float16 (16 bits)
  // Bit:  15  14 to 10         9 to 0
  //      +---+----------------+------------------+
  //      | S | Exponent       | Mantissa         |
  //      +---+----------------+------------------+
  //      1 bit  5 bits          10 bits

  const int fp_sign_bits = 1;
  const int fp32_bits = 32;
  const int fp32_mantissa_2nd_part_bits = 9;
  const int fp32_mantissa_3rd_part_bits = 4;
  const int fp16_exponent_bits = 5;
  const int fp16_mantissa_bits = 10;

  // preserve the sign bit and exponent, clear the mantissa.
  __ vnsra_wi(dst, src, fp32_bits - fp_sign_bits - fp16_exponent_bits, Assembler::v0_t);
  __ vsll_vi(dst, dst, fp16_mantissa_bits, Assembler::v0_t);

  // Preserve the high order bit of the float NaN in the
  // binary16 result NaN (tenth bit); OR in the remaining
  // bits into the lower 9 bits of the binary16 significand.
  //   | (doppel & 0x007f_e000) >> 13 // 10 bits
  //   | (doppel & 0x0000_1ff0) >> 4  //  9 bits
  //   | (doppel & 0x0000_000f));     //  4 bits
  //
  // Check j.l.Float.floatToFloat16 for more information.

  // 10 bits
  __ vnsrl_wi(vtmp, src, fp32_mantissa_2nd_part_bits + fp32_mantissa_3rd_part_bits, Assembler::v0_t);
  __ mv(t0, 0x3ff); // retain the first part of the float32 mantissa
  __ vand_vx(vtmp, vtmp, t0, Assembler::v0_t);
  __ vor_vv(dst, dst, vtmp, Assembler::v0_t);
  // 9 bits
  __ vnsrl_wi(vtmp, src, fp32_mantissa_3rd_part_bits, Assembler::v0_t);
  __ mv(t0, 0x1ff); // retain the second part of the float32 mantissa
  __ vand_vx(vtmp, vtmp, t0, Assembler::v0_t);
  __ vor_vv(dst, dst, vtmp, Assembler::v0_t);
  // 4 bits
  // The narrowing shift is necessary to move data from a 32-bit element to a
  // 16-bit element in a vector register.
  __ vnsrl_wi(vtmp, src, 0, Assembler::v0_t);
  __ vand_vi(vtmp, vtmp, 0xf, Assembler::v0_t);
  __ vor_vv(dst, dst, vtmp, Assembler::v0_t);

  __ j(stub.continuation());
#undef __
}

// j.l.Float.floatToFloat16
void C2_MacroAssembler::float_to_float16_v(VectorRegister dst, VectorRegister src, VectorRegister vtmp,
                                           Register tmp, uint vector_length) {
  assert_different_registers(dst, src, vtmp);

  auto stub = C2CodeStub::make<VectorRegister, VectorRegister, VectorRegister>
              (dst, src, vtmp, 56, float_to_float16_v_slow_path);

  // On riscv, NaN needs special handling, as vfncvt_f_f_w does not work in that case.

  vsetvli_helper(BasicType::T_FLOAT, vector_length, Assembler::m1);

  // check whether there is a NaN.
  // replace v_fclass with vmfne_vv as performance optimization.
  vmfne_vv(v0, src, src);
  vcpop_m(t0, v0);

  vsetvli_helper(BasicType::T_SHORT, vector_length, Assembler::mf2, tmp);

  // For non-NaN cases, just use built-in instructions.
  vfncvt_f_f_w(dst, src);

  // jump to stub processing NaN cases.
  bnez(t0, stub->entry(), true);

  bind(stub->continuation());
}
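// Note (informal): in both vector stubs above, every element update is executed
// under the v0 mask (Assembler::v0_t), so only the NaN (or NaN/Inf) lanes
// computed before entering the stub are patched; the remaining lanes keep the
// results that vfwcvt_f_f_v / vfncvt_f_f_w already produced on the fast path.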
void C2_MacroAssembler::signum_fp_v(VectorRegister dst, VectorRegister one, BasicType bt, int vlen) {
  vsetvli_helper(bt, vlen);

  // check if input is -0, +0, signaling NaN or quiet NaN
  vfclass_v(v0, dst);
  mv(t0, FClassBits::zero | FClassBits::nan);
  vand_vx(v0, v0, t0);
  vmseq_vi(v0, v0, 0);

  // use floating-point 1.0 with the sign of the input
  vfsgnj_vv(dst, one, dst, v0_t);
}

// j.l.Math.round(float)
// Returns the closest int to the argument, with ties rounding to positive infinity.
// We need to handle 3 special cases defined by the Java API spec:
//   NaN,
//   float >= Integer.MAX_VALUE,
//   float <= Integer.MIN_VALUE.
void C2_MacroAssembler::java_round_float_v(VectorRegister dst, VectorRegister src, FloatRegister ftmp,
                                           BasicType bt, uint vector_length) {
  // On riscv, no single rounding mode matches the behaviour defined by the Java API
  // spec; every mode mishandles some corner case, e.g.
  //   RNE is the closest one, but it ties to "even", so 1.5 and 2.5 both convert
  //   to 2, instead of 2 and 3 respectively.
  //   RUP does not work either: although the Java API requires "rounding to positive
  //   infinity" for ties, RUP converts both 1.3 and 1.8 to 2, instead of 1 and 2
  //   respectively.
  //
  // The optimal solution for non-NaN cases is:
  //   src + 0.5 => dst, with the rdn rounding mode,
  //   convert dst from float to int, with the rdn rounding mode.
  // This solution also works as expected for float >= Integer.MAX_VALUE and
  // float <= Integer.MIN_VALUE.
  //
  // But we still need to handle NaN explicitly with vector mask instructions.
  //
  // Check MacroAssembler::java_round_float and C2_MacroAssembler::vector_round_sve
  // in aarch64 for more details.

  csrwi(CSR_FRM, C2_MacroAssembler::rdn);
  vsetvli_helper(bt, vector_length);

  // don't rearrange the instruction sequence order without performance testing.
  // check MacroAssembler::java_round_float in riscv64 for more details.
  mv(t0, jint_cast(0.5f));
  fmv_w_x(ftmp, t0);

  // replacing vfclass with feq as performance optimization
  vmfeq_vv(v0, src, src);
  // set dst = 0 in cases of NaN
  vmv_v_x(dst, zr);

  // dst = (src + 0.5) rounded down towards negative infinity
  vfadd_vf(dst, src, ftmp, Assembler::v0_t);
  vfcvt_x_f_v(dst, dst, Assembler::v0_t); // in RoundingMode::rdn

  csrwi(CSR_FRM, C2_MacroAssembler::rne);
}
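// Worked example (informal) for the rdn trick above:
//   round(2.5f):  2.5 + 0.5 = 3.0,  fcvt with rdn -> 3  (tie rounds toward +inf)
//   round(-2.5f): -2.5 + 0.5 = -2.0, fcvt with rdn -> -2 (== Math.round(-2.5f))
//   round(1.8f):  1.8 + 0.5 = 2.3,  fcvt with rdn -> 2
// NaN lanes are masked off above and dst was pre-set to 0, matching
// Math.round(NaN) == 0.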
// java.lang.Math.round(double a)
// Returns the closest long to the argument, with ties rounding to positive infinity.
void C2_MacroAssembler::java_round_double_v(VectorRegister dst, VectorRegister src, FloatRegister ftmp,
                                            BasicType bt, uint vector_length) {
  // check C2_MacroAssembler::java_round_float_v above for more details.
  csrwi(CSR_FRM, C2_MacroAssembler::rdn);
  vsetvli_helper(bt, vector_length);

  mv(t0, julong_cast(0.5));
  fmv_d_x(ftmp, t0);

  // replacing vfclass with feq as performance optimization
  vmfeq_vv(v0, src, src);
  // set dst = 0 in cases of NaN
  vmv_v_x(dst, zr);

  // dst = (src + 0.5) rounded down towards negative infinity
  vfadd_vf(dst, src, ftmp, Assembler::v0_t);
  vfcvt_x_f_v(dst, dst, Assembler::v0_t); // in RoundingMode::rdn

  csrwi(CSR_FRM, C2_MacroAssembler::rne);
}

void C2_MacroAssembler::element_compare(Register a1, Register a2, Register result, Register cnt,
                                        Register tmp1, Register tmp2,
                                        VectorRegister vr1, VectorRegister vr2, VectorRegister vrs,
                                        bool islatin, Label& DONE, Assembler::LMUL lmul) {
  Label loop;
  Assembler::SEW sew = islatin ? Assembler::e8 : Assembler::e16;

  bind(loop);
  vsetvli(tmp1, cnt, sew, lmul);
  vlex_v(vr1, a1, sew);
  vlex_v(vr2, a2, sew);
  vmsne_vv(vrs, vr1, vr2);
  vfirst_m(tmp2, vrs);
  bgez(tmp2, DONE);
  sub(cnt, cnt, tmp1);
  if (!islatin) {
    slli(tmp1, tmp1, 1); // get byte counts
  }
  add(a1, a1, tmp1);
  add(a2, a2, tmp1);
  bnez(cnt, loop);

  mv(result, true);
}

void C2_MacroAssembler::string_equals_v(Register a1, Register a2, Register result, Register cnt) {
  Label DONE;
  Register tmp1 = t0;
  Register tmp2 = t1;

  BLOCK_COMMENT("string_equals_v {");

  mv(result, false);

  element_compare(a1, a2, result, cnt, tmp1, tmp2, v2, v4, v2, true, DONE, Assembler::m2);

  bind(DONE);
  BLOCK_COMMENT("} string_equals_v");
}

// used by C2 ClearArray patterns.
// base: Address of a buffer to be zeroed
// cnt: Count in HeapWords
//
// base, cnt, v4, v5, v6, v7 and t0 are clobbered.
void C2_MacroAssembler::clear_array_v(Register base, Register cnt) {
  Label loop;

  // making zero words
  vsetvli(t0, cnt, Assembler::e64, Assembler::m4);
  vxor_vv(v4, v4, v4);

  bind(loop);
  vsetvli(t0, cnt, Assembler::e64, Assembler::m4);
  vse64_v(v4, base);
  sub(cnt, cnt, t0);
  shadd(base, t0, base, t0, 3);
  bnez(cnt, loop);
}
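// Note (informal): element_compare and clear_array_v above use the standard RVV
// strip-mining pattern: each vsetvli returns vl = min(remaining count, VLMAX),
// the loop body processes vl elements, and the counters/pointers advance by vl
// (scaled to bytes for UTF16 data) until the count reaches zero. Also note that
// on a mismatch element_compare branches to DONE with 'result' left at its
// caller-preset value; only a full match executes the final mv(result, true).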
void C2_MacroAssembler::arrays_equals_v(Register a1, Register a2, Register result, Register cnt1, int elem_size) {
  assert(elem_size == 1 || elem_size == 2, "must be char or byte");
  assert_different_registers(a1, a2, result, cnt1, t0, t1);

  Label DONE;
  Register tmp1 = t0;
  Register tmp2 = t1;
  Register cnt2 = tmp2;
  int length_offset = arrayOopDesc::length_offset_in_bytes();
  int base_offset = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
  assert((base_offset % (UseCompactObjectHeaders ? 4 : (UseCompressedClassPointers ? 8 : 4))) == 0, "Must be");

  BLOCK_COMMENT("arrays_equals_v {");

  // if (a1 == a2), return true
  mv(result, true);
  beq(a1, a2, DONE);

  mv(result, false);
  // if a1 == null or a2 == null, return false
  beqz(a1, DONE);
  beqz(a2, DONE);
  // if (a1.length != a2.length), return false
  lwu(cnt1, Address(a1, length_offset));
  lwu(cnt2, Address(a2, length_offset));
  bne(cnt1, cnt2, DONE);

  la(a1, Address(a1, base_offset));
  la(a2, Address(a2, base_offset));

  element_compare(a1, a2, result, cnt1, tmp1, tmp2, v2, v4, v2, elem_size == 1, DONE, Assembler::m2);

  bind(DONE);

  BLOCK_COMMENT("} arrays_equals_v");
}

void C2_MacroAssembler::string_compare_v(Register str1, Register str2, Register cnt1, Register cnt2,
                                         Register result, Register tmp1, Register tmp2, int encForm) {
  Label DIFFERENCE, DONE, L, loop;
  bool encLL = encForm == StrIntrinsicNode::LL;
  bool encLU = encForm == StrIntrinsicNode::LU;
  bool encUL = encForm == StrIntrinsicNode::UL;

  bool str1_isL = encLL || encLU;
  bool str2_isL = encLL || encUL;

  int minCharsInWord = encLL ? wordSize : wordSize / 2;

  BLOCK_COMMENT("string_compare_v {");

  // for Latin strings, 1 byte for 1 character
  // for UTF16 strings, 2 bytes for 1 character
  if (!str1_isL) sraiw(cnt1, cnt1, 1);
  if (!str2_isL) sraiw(cnt2, cnt2, 1);

  // if the strings compare equal up to the shorter length, the result is the
  // length difference.
  // save the minimum of the string lengths in cnt2.
  sub(result, cnt1, cnt2);
  bgt(cnt1, cnt2, L);
  mv(cnt2, cnt1);
  bind(L);

  // We focus on the optimization of small sized strings.
  // Please check the document below for string size distribution statistics.
  //   https://cr.openjdk.org/~shade/density/string-density-report.pdf
  if (str1_isL == str2_isL) { // LL or UU
    // The construction of v regs and lmul below is based on testing on 2 different
    // boards, with vlen == 128 and vlen == 256 respectively.
    if (!encLL && MaxVectorSize == 16) { // UU
      element_compare(str1, str2, zr, cnt2, tmp1, tmp2, v4, v8, v4, encLL, DIFFERENCE, Assembler::m4);
    } else { // UU with larger MaxVectorSize, or LL
      element_compare(str1, str2, zr, cnt2, tmp1, tmp2, v2, v4, v2, encLL, DIFFERENCE, Assembler::m2);
    }
    j(DONE);
  } else { // LU or UL
    Register strL = encLU ? str1 : str2;
    Register strU = encLU ? str2 : str1;
    VectorRegister vstr1 = encLU ? v8 : v4;
    VectorRegister vstr2 = encLU ? v4 : v8;

    bind(loop);
    vsetvli(tmp1, cnt2, Assembler::e8, Assembler::m2);
    vle8_v(vstr1, strL);
    vsetvli(tmp1, cnt2, Assembler::e16, Assembler::m4);
    vzext_vf2(vstr2, vstr1);
    vle16_v(vstr1, strU);
    vmsne_vv(v4, vstr2, vstr1);
    vfirst_m(tmp2, v4);
    bgez(tmp2, DIFFERENCE);
    sub(cnt2, cnt2, tmp1);
    add(strL, strL, tmp1);
    shadd(strU, tmp1, strU, tmp1, 1);
    bnez(cnt2, loop);
    j(DONE);
  }

  bind(DIFFERENCE);
  slli(tmp1, tmp2, 1);
  add(str1, str1, str1_isL ? tmp2 : tmp1);
  add(str2, str2, str2_isL ? tmp2 : tmp1);
  str1_isL ? lbu(tmp1, Address(str1, 0)) : lhu(tmp1, Address(str1, 0));
  str2_isL ? lbu(tmp2, Address(str2, 0)) : lhu(tmp2, Address(str2, 0));
  sub(result, tmp1, tmp2);

  bind(DONE);

  BLOCK_COMMENT("} string_compare_v");
}

void C2_MacroAssembler::byte_array_inflate_v(Register src, Register dst, Register len, Register tmp) {
  Label loop;
  assert_different_registers(src, dst, len, tmp, t0);

  BLOCK_COMMENT("byte_array_inflate_v {");
  bind(loop);
  vsetvli(tmp, len, Assembler::e8, Assembler::m2);
  vle8_v(v6, src);
  vsetvli(t0, len, Assembler::e16, Assembler::m4);
  vzext_vf2(v4, v6);
  vse16_v(v4, dst);
  sub(len, len, tmp);
  add(src, src, tmp);
  shadd(dst, tmp, dst, tmp, 1);
  bnez(len, loop);
  BLOCK_COMMENT("} byte_array_inflate_v");
}

// Compress char[] array to byte[].
// Intrinsic for java.lang.StringUTF16.compress(char[] src, int srcOff, byte[] dst, int dstOff, int len)
// result: the array length if every element in the array can be encoded,
//         otherwise, the index of the first non-latin1 (> 0xff) character.
void C2_MacroAssembler::char_array_compress_v(Register src, Register dst, Register len,
                                              Register result, Register tmp) {
  encode_iso_array_v(src, dst, len, result, tmp, false);
}
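// Worked example (informal, hypothetical input) for encode_iso_array_v below,
// with ascii == false (latin1 limit 0xff): for the chars { 'a', 'b', 0x20ac, 'c' }
// and len == 4, vmsgtu_vx flags the lane holding 0x20ac (the euro sign),
// vfirst_m returns its index 2, vmsbf_m masks the narrowing store so only the 2
// leading chars are compressed, and the routine exits with result == 2.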
// Intrinsic for
//
// - sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray
//     return the number of characters copied.
// - java/lang/StringUTF16.compress
//     return index of non-latin1 character if copy fails, otherwise 'len'.
//
// This version always returns the number of characters copied. A successful
// copy will complete with the post-condition: 'res' == 'len', while an
// unsuccessful copy will exit with the post-condition: 0 <= 'res' < 'len'.
//
// Clobbers: src, dst, len, result, t0
void C2_MacroAssembler::encode_iso_array_v(Register src, Register dst, Register len,
                                           Register result, Register tmp, bool ascii) {
  Label loop, fail, done;

  BLOCK_COMMENT("encode_iso_array_v {");
  mv(result, 0);

  bind(loop);
  mv(tmp, ascii ? 0x7f : 0xff);
  vsetvli(t0, len, Assembler::e16, Assembler::m2);
  vle16_v(v2, src);

  vmsgtu_vx(v1, v2, tmp);
  vfirst_m(tmp, v1);
  vmsbf_m(v0, v1);
  // compress char to byte
  vsetvli(t0, len, Assembler::e8);
  vncvt_x_x_w(v1, v2, Assembler::v0_t);
  vse8_v(v1, dst, Assembler::v0_t);

  // fail if char > 0x7f/0xff
  bgez(tmp, fail);
  add(result, result, t0);
  add(dst, dst, t0);
  sub(len, len, t0);
  shadd(src, t0, src, t0, 1);
  bnez(len, loop);
  j(done);

  bind(fail);
  add(result, result, tmp);

  bind(done);
  BLOCK_COMMENT("} encode_iso_array_v");
}

void C2_MacroAssembler::count_positives_v(Register ary, Register len, Register result, Register tmp) {
  Label LOOP, SET_RESULT, DONE;

  BLOCK_COMMENT("count_positives_v {");
  assert_different_registers(ary, len, result, tmp);

  mv(result, zr);

  bind(LOOP);
  vsetvli(t0, len, Assembler::e8, Assembler::m4);
  vle8_v(v4, ary);
  vmslt_vx(v4, v4, zr);
  vfirst_m(tmp, v4);
  bgez(tmp, SET_RESULT);
  // if tmp == -1, all bytes are positive
  add(result, result, t0);

  sub(len, len, t0);
  add(ary, ary, t0);
  bnez(len, LOOP);
  j(DONE);

  // add remaining positive bytes count
  bind(SET_RESULT);
  add(result, result, tmp);

  bind(DONE);
  BLOCK_COMMENT("} count_positives_v");
}

void C2_MacroAssembler::string_indexof_char_v(Register str1, Register cnt1,
                                              Register ch, Register result,
                                              Register tmp1, Register tmp2,
                                              bool isL) {
  mv(result, zr);

  Label loop, MATCH, DONE;
  Assembler::SEW sew = isL ? Assembler::e8 : Assembler::e16;
  bind(loop);
  vsetvli(tmp1, cnt1, sew, Assembler::m4);
  vlex_v(v4, str1, sew);
  vmseq_vx(v4, v4, ch);
  vfirst_m(tmp2, v4);
  bgez(tmp2, MATCH); // if equal, return index

  add(result, result, tmp1);
  sub(cnt1, cnt1, tmp1);
  if (!isL) slli(tmp1, tmp1, 1);
  add(str1, str1, tmp1);
  bnez(cnt1, loop);

  mv(result, -1);
  j(DONE);

  bind(MATCH);
  add(result, result, tmp2);

  bind(DONE);
}

// Set dst to NaN if any NaN input.
void C2_MacroAssembler::minmax_fp_v(VectorRegister dst, VectorRegister src1, VectorRegister src2,
                                    BasicType bt, bool is_min, uint vector_length) {
  assert_different_registers(dst, src1, src2);

  vsetvli_helper(bt, vector_length);

  is_min ? vfmin_vv(dst, src1, src2)
         : vfmax_vv(dst, src1, src2);

  vmfne_vv(v0, src1, src1);
  vfadd_vv(dst, src1, src1, Assembler::v0_t);
  vmfne_vv(v0, src2, src2);
  vfadd_vv(dst, src2, src2, Assembler::v0_t);
}
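// Note (informal) on the NaN handling in minmax_fp_v above (and
// minmax_fp_masked_v below): RVV vfmin/vfmax return the non-NaN operand when
// exactly one input is NaN, which does not match Java's Math.min/max semantics.
// The masked vfadd_vv(x, x) passes overwrite every lane where src1 or src2 is
// NaN (vmfne_vv of a register with itself marks exactly its NaN lanes) with
// x + x, which is NaN, so any NaN input propagates to dst.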
// Set dst to NaN if any NaN input.
// The destination vector register elements corresponding to masked-off elements
// are handled with a mask-undisturbed policy.
void C2_MacroAssembler::minmax_fp_masked_v(VectorRegister dst, VectorRegister src1, VectorRegister src2,
                                           VectorRegister vmask, VectorRegister tmp1, VectorRegister tmp2,
                                           BasicType bt, bool is_min, uint vector_length) {
  assert_different_registers(src1, src2, tmp1, tmp2);
  vsetvli_helper(bt, vector_length);

  // Check vector elements of src1 and src2 for NaN.
  vmfeq_vv(tmp1, src1, src1);
  vmfeq_vv(tmp2, src2, src2);

  vmandn_mm(v0, vmask, tmp1);
  vfadd_vv(dst, src1, src1, Assembler::v0_t);
  vmandn_mm(v0, vmask, tmp2);
  vfadd_vv(dst, src2, src2, Assembler::v0_t);

  vmand_mm(tmp2, tmp1, tmp2);
  vmand_mm(v0, vmask, tmp2);
  is_min ? vfmin_vv(dst, src1, src2, Assembler::v0_t)
         : vfmax_vv(dst, src1, src2, Assembler::v0_t);
}

// Set dst to NaN if any NaN input.
void C2_MacroAssembler::reduce_minmax_fp_v(FloatRegister dst,
                                           FloatRegister src1, VectorRegister src2,
                                           VectorRegister tmp1, VectorRegister tmp2,
                                           bool is_double, bool is_min, uint vector_length, VectorMask vm) {
  assert_different_registers(dst, src1);
  assert_different_registers(src2, tmp1, tmp2);

  Label L_done, L_NaN_1, L_NaN_2;
  // Set dst to src1 if src1 is NaN
  is_double ? feq_d(t0, src1, src1)
            : feq_s(t0, src1, src1);
  beqz(t0, L_NaN_2);

  vsetvli_helper(is_double ? T_DOUBLE : T_FLOAT, vector_length);
  vfmv_s_f(tmp2, src1);

  is_min ? vfredmin_vs(tmp1, src2, tmp2, vm)
         : vfredmax_vs(tmp1, src2, tmp2, vm);
  vfmv_f_s(dst, tmp1);

  // Checking NaNs in src2
  vmfne_vv(tmp1, src2, src2, vm);
  vcpop_m(t0, tmp1, vm);
  beqz(t0, L_done);

  bind(L_NaN_1);
  vfredusum_vs(tmp1, src2, tmp2, vm);
  vfmv_f_s(dst, tmp1);
  j(L_done);

  bind(L_NaN_2);
  is_double ? fmv_d(dst, src1)
            : fmv_s(dst, src1);

  bind(L_done);
}

bool C2_MacroAssembler::in_scratch_emit_size() {
  if (ciEnv::current()->task() != nullptr) {
    PhaseOutput* phase_output = Compile::current()->output();
    if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
      return true;
    }
  }
  return MacroAssembler::in_scratch_emit_size();
}

void C2_MacroAssembler::reduce_integral_v(Register dst, Register src1,
                                          VectorRegister src2, VectorRegister tmp,
                                          int opc, BasicType bt, uint vector_length, VectorMask vm) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
  vsetvli_helper(bt, vector_length);
  vmv_s_x(tmp, src1);
  switch (opc) {
    case Op_AddReductionVI:
    case Op_AddReductionVL:
      vredsum_vs(tmp, src2, tmp, vm);
      break;
    case Op_AndReductionV:
      vredand_vs(tmp, src2, tmp, vm);
      break;
    case Op_OrReductionV:
      vredor_vs(tmp, src2, tmp, vm);
      break;
    case Op_XorReductionV:
      vredxor_vs(tmp, src2, tmp, vm);
      break;
    case Op_MaxReductionV:
      vredmax_vs(tmp, src2, tmp, vm);
      break;
    case Op_MinReductionV:
      vredmin_vs(tmp, src2, tmp, vm);
      break;
    default:
      ShouldNotReachHere();
  }
  vmv_x_s(dst, tmp);
}

void C2_MacroAssembler::reduce_mul_integral_v(Register dst, Register src1, VectorRegister src2,
                                              VectorRegister vtmp1, VectorRegister vtmp2,
                                              BasicType bt, uint vector_length, VectorMask vm) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");

  vsetvli_helper(bt, vector_length);

  vector_length /= 2;
  if (vm != Assembler::unmasked) {
    // This behaviour is consistent with the Vector API spec requirement for `reduceLanes`:
    // if no elements are selected, an operation-specific identity value is returned.
    // For MUL, the identity value is one.
    vmv_v_i(vtmp1, 1);
    vmerge_vvm(vtmp2, vtmp1, src2); // vm == v0
    vslidedown_vi(vtmp1, vtmp2, vector_length);
    vsetvli_helper(bt, vector_length);
    vmul_vv(vtmp1, vtmp1, vtmp2);
  } else {
    vslidedown_vi(vtmp1, src2, vector_length);
    vsetvli_helper(bt, vector_length);
    vmul_vv(vtmp1, vtmp1, src2);
  }

  while (vector_length > 1) {
    vector_length /= 2;
    vslidedown_vi(vtmp2, vtmp1, vector_length);
    vsetvli_helper(bt, vector_length);
    vmul_vv(vtmp1, vtmp1, vtmp2);
  }

  vmv_x_s(dst, vtmp1);

  if (bt == T_INT) {
    mulw(dst, dst, src1);
  } else {
    mul(dst, dst, src1);
  }
}

// Set vl and vtype for full and partial vector operations.
// (vma = mu, vta = tu, vill = false)
void C2_MacroAssembler::vsetvli_helper(BasicType bt, uint vector_length, LMUL vlmul, Register tmp) {
  Assembler::SEW sew = Assembler::elemtype_to_sew(bt);
  if (vector_length <= 31) {
    vsetivli(tmp, vector_length, sew, vlmul);
  } else if (vector_length == (MaxVectorSize / type2aelembytes(bt))) {
    vsetvli(tmp, x0, sew, vlmul);
  } else {
    mv(tmp, vector_length);
    vsetvli(tmp, tmp, sew, vlmul);
  }
}
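// Illustrative dispatch (informal) for vsetvli_helper above: a vector_length of
// at most 31 fits the 5-bit immediate of vsetivli; a vector_length equal to
// VLMAX (MaxVectorSize / element size) uses the vsetvli rd != x0, rs1 == x0 form
// that requests VLMAX directly; anything else materializes the AVL in a scratch
// register first. For example, reduce_mul_integral_v above halves an 8-element
// vector as [a,b,c,d,e,f,g,h] -> [a*e,b*f,c*g,d*h] -> 2 lanes -> 1 lane, then
// multiplies the scalar product by src1.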
void C2_MacroAssembler::compare_integral_v(VectorRegister vd, VectorRegister src1, VectorRegister src2,
                                           int cond, BasicType bt, uint vector_length, VectorMask vm) {
  assert(is_integral_type(bt), "unsupported element type");
  assert(vm == Assembler::v0_t ? vd != v0 : true, "should be different registers");

  vsetvli_helper(bt, vector_length);

  if (vm == Assembler::v0_t) {
    vmclr_m(vd);
  }

  switch (cond) {
    case BoolTest::eq:  vmseq_vv(vd, src1, src2, vm);  break;
    case BoolTest::ne:  vmsne_vv(vd, src1, src2, vm);  break;
    case BoolTest::le:  vmsle_vv(vd, src1, src2, vm);  break;
    case BoolTest::ge:  vmsge_vv(vd, src1, src2, vm);  break;
    case BoolTest::lt:  vmslt_vv(vd, src1, src2, vm);  break;
    case BoolTest::gt:  vmsgt_vv(vd, src1, src2, vm);  break;
    case BoolTest::ule: vmsleu_vv(vd, src1, src2, vm); break;
    case BoolTest::uge: vmsgeu_vv(vd, src1, src2, vm); break;
    case BoolTest::ult: vmsltu_vv(vd, src1, src2, vm); break;
    case BoolTest::ugt: vmsgtu_vv(vd, src1, src2, vm); break;
    default:
      assert(false, "unsupported compare condition");
      ShouldNotReachHere();
  }
}

void C2_MacroAssembler::compare_fp_v(VectorRegister vd, VectorRegister src1, VectorRegister src2,
                                     int cond, BasicType bt, uint vector_length, VectorMask vm) {
  assert(is_floating_point_type(bt), "unsupported element type");
  assert(vm == Assembler::v0_t ? vd != v0 : true, "should be different registers");

  vsetvli_helper(bt, vector_length);

  if (vm == Assembler::v0_t) {
    vmclr_m(vd);
  }

  switch (cond) {
    case BoolTest::eq: vmfeq_vv(vd, src1, src2, vm); break;
    case BoolTest::ne: vmfne_vv(vd, src1, src2, vm); break;
    case BoolTest::le: vmfle_vv(vd, src1, src2, vm); break;
    case BoolTest::ge: vmfge_vv(vd, src1, src2, vm); break;
    case BoolTest::lt: vmflt_vv(vd, src1, src2, vm); break;
    case BoolTest::gt: vmfgt_vv(vd, src1, src2, vm); break;
    default:
      assert(false, "unsupported compare condition");
      ShouldNotReachHere();
  }
}

// In Matcher::scalable_predicate_reg_slots,
// we assume each predicate register is one-eighth of the size of
// a scalable vector register, one mask bit per vector byte.
void C2_MacroAssembler::spill_vmask(VectorRegister v, int offset) {
  vsetvli_helper(T_BYTE, MaxVectorSize >> 3);
  add(t0, sp, offset);
  vse8_v(v, t0);
}

void C2_MacroAssembler::unspill_vmask(VectorRegister v, int offset) {
  vsetvli_helper(T_BYTE, MaxVectorSize >> 3);
  add(t0, sp, offset);
  vle8_v(v, t0);
}

void C2_MacroAssembler::integer_extend_v(VectorRegister dst, BasicType dst_bt, uint vector_length,
                                         VectorRegister src, BasicType src_bt, bool is_signed) {
  assert(type2aelembytes(dst_bt) > type2aelembytes(src_bt) &&
         type2aelembytes(dst_bt) <= 8 && type2aelembytes(src_bt) <= 4, "invalid element size");
  assert(dst_bt != T_FLOAT && dst_bt != T_DOUBLE &&
         src_bt != T_FLOAT && src_bt != T_DOUBLE, "unsupported element type");

  // https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#52-vector-operands
  // Overlap is only permitted when the destination EEW is greater than the source
  // EEW, the source EMUL is at least 1, and the overlap is in the highest-numbered
  // part of the destination register group.
  // Since LMUL=1 here, vd and vs cannot be the same.
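  // Informal reading of the rule above: e.g. vzext_vf4 from T_BYTE to T_INT reads
  // src with EEW = SEW/4, i.e. EMUL = 1/4 at LMUL = 1, so the spec's overlap
  // exception (which requires source EMUL >= 1) does not apply and the two
  // registers must differ.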
  assert_different_registers(dst, src);

  vsetvli_helper(dst_bt, vector_length);

  if (is_signed) {
    if (src_bt == T_BYTE) {
      switch (dst_bt) {
        case T_SHORT:
          vsext_vf2(dst, src);
          break;
        case T_INT:
          vsext_vf4(dst, src);
          break;
        case T_LONG:
          vsext_vf8(dst, src);
          break;
        default:
          ShouldNotReachHere();
      }
    } else if (src_bt == T_SHORT) {
      if (dst_bt == T_INT) {
        vsext_vf2(dst, src);
      } else {
        vsext_vf4(dst, src);
      }
    } else if (src_bt == T_INT) {
      vsext_vf2(dst, src);
    }
  } else {
    if (src_bt == T_BYTE) {
      switch (dst_bt) {
        case T_SHORT:
          vzext_vf2(dst, src);
          break;
        case T_INT:
          vzext_vf4(dst, src);
          break;
        case T_LONG:
          vzext_vf8(dst, src);
          break;
        default:
          ShouldNotReachHere();
      }
    } else if (src_bt == T_SHORT) {
      if (dst_bt == T_INT) {
        vzext_vf2(dst, src);
      } else {
        vzext_vf4(dst, src);
      }
    } else if (src_bt == T_INT) {
      vzext_vf2(dst, src);
    }
  }
}

// Vector narrow from src to dst with the specified element sizes.
// The high part of the dst vector will be filled with zero.
void C2_MacroAssembler::integer_narrow_v(VectorRegister dst, BasicType dst_bt, uint vector_length,
                                         VectorRegister src, BasicType src_bt) {
  assert(type2aelembytes(dst_bt) < type2aelembytes(src_bt) &&
         type2aelembytes(dst_bt) <= 4 && type2aelembytes(src_bt) <= 8, "invalid element size");
  assert(dst_bt != T_FLOAT && dst_bt != T_DOUBLE &&
         src_bt != T_FLOAT && src_bt != T_DOUBLE, "unsupported element type");

  mv(t0, vector_length);
  if (src_bt == T_LONG) {
    // https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#117-vector-narrowing-integer-right-shift-instructions
    // Future extensions might add support for versions that narrow to a destination
    // that is 1/4 the width of the source, but currently we can only scale down by
    // 1/2 the width at a time.
    vsetvli(t0, t0, Assembler::e32, Assembler::mf2);
    vncvt_x_x_w(dst, src);
    if (dst_bt == T_SHORT || dst_bt == T_BYTE) {
      vsetvli(t0, t0, Assembler::e16, Assembler::mf2);
      vncvt_x_x_w(dst, dst);
      if (dst_bt == T_BYTE) {
        vsetvli(t0, t0, Assembler::e8, Assembler::mf2);
        vncvt_x_x_w(dst, dst);
      }
    }
  } else if (src_bt == T_INT) {
    // T_SHORT
    vsetvli(t0, t0, Assembler::e16, Assembler::mf2);
    vncvt_x_x_w(dst, src);
    if (dst_bt == T_BYTE) {
      vsetvli(t0, t0, Assembler::e8, Assembler::mf2);
      vncvt_x_x_w(dst, dst);
    }
  } else if (src_bt == T_SHORT) {
    vsetvli(t0, t0, Assembler::e8, Assembler::mf2);
    vncvt_x_x_w(dst, src);
  }
}

#define VFCVT_SAFE(VFLOATCVT)                                                      \
void C2_MacroAssembler::VFLOATCVT##_safe(VectorRegister dst, VectorRegister src) { \
  assert_different_registers(dst, src);                                            \
  vxor_vv(dst, dst, dst);                                                          \
  vmfeq_vv(v0, src, src);                                                          \
  VFLOATCVT(dst, src, Assembler::v0_t);                                            \
}

VFCVT_SAFE(vfcvt_rtz_x_f_v);

#undef VFCVT_SAFE

// Extract a scalar element from a vector at position 'idx'.
// The input elements in src are expected to be of integral type.
void C2_MacroAssembler::extract_v(Register dst, VectorRegister src, BasicType bt,
                                  int idx, VectorRegister tmp) {
  assert(is_integral_type(bt), "unsupported element type");
  assert(idx >= 0, "idx cannot be negative");
  // Only need the first element after the vector slidedown
  vsetvli_helper(bt, 1);
  if (idx == 0) {
    vmv_x_s(dst, src);
  } else if (idx <= 31) {
    vslidedown_vi(tmp, src, idx);
    vmv_x_s(dst, tmp);
  } else {
    mv(t0, idx);
    vslidedown_vx(tmp, src, t0);
    vmv_x_s(dst, tmp);
  }
}
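// Note (informal) applying to both extract helpers: vl is set to 1 since only
// the first element is read after the slidedown; vslidedown_vi takes a 5-bit
// unsigned immediate, so indices above 31 go through the vslidedown_vx form
// with the index materialized in t0.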
// Extract a scalar element from a vector at position 'idx'.
// The input elements in src are expected to be of floating-point type.
void C2_MacroAssembler::extract_fp_v(FloatRegister dst, VectorRegister src, BasicType bt,
                                     int idx, VectorRegister tmp) {
  assert(is_floating_point_type(bt), "unsupported element type");
  assert(idx >= 0, "idx cannot be negative");
  // Only need the first element after the vector slidedown
  vsetvli_helper(bt, 1);
  if (idx == 0) {
    vfmv_f_s(dst, src);
  } else if (idx <= 31) {
    vslidedown_vi(tmp, src, idx);
    vfmv_f_s(dst, tmp);
  } else {
    mv(t0, idx);
    vslidedown_vx(tmp, src, t0);
    vfmv_f_s(dst, tmp);
  }
}