diff --git a/src/hotspot/cpu/riscv/macroAssembler_riscv.cpp b/src/hotspot/cpu/riscv/macroAssembler_riscv.cpp index 68960118076..4adf317cfca 100644 --- a/src/hotspot/cpu/riscv/macroAssembler_riscv.cpp +++ b/src/hotspot/cpu/riscv/macroAssembler_riscv.cpp @@ -1968,6 +1968,22 @@ void MacroAssembler::ror_imm(Register dst, Register src, uint32_t shift, Registe orr(dst, dst, tmp); } +// rotate left with shift bits, 32-bit version +void MacroAssembler::rolw_imm(Register dst, Register src, uint32_t shift, Register tmp) { + if (UseZbb) { + // no roliw available + roriw(dst, src, 32 - shift); + return; + } + + assert_different_registers(dst, tmp); + assert_different_registers(src, tmp); + assert(shift < 32, "shift amount must be < 32"); + srliw(tmp, src, 32 - shift); + slliw(dst, src, shift); + orr(dst, dst, tmp); +} + void MacroAssembler::andi(Register Rd, Register Rn, int64_t imm, Register tmp) { if (is_simm12(imm)) { and_imm12(Rd, Rn, imm); diff --git a/src/hotspot/cpu/riscv/macroAssembler_riscv.hpp b/src/hotspot/cpu/riscv/macroAssembler_riscv.hpp index 2a34823c77a..2344aad3a1b 100644 --- a/src/hotspot/cpu/riscv/macroAssembler_riscv.hpp +++ b/src/hotspot/cpu/riscv/macroAssembler_riscv.hpp @@ -596,7 +596,9 @@ class MacroAssembler: public Assembler { void NAME(Register Rs1, Register Rs2, const address dest) { \ assert_cond(dest != nullptr); \ int64_t offset = dest - pc(); \ - guarantee(is_simm13(offset) && ((offset % 2) == 0), "offset is invalid."); \ + guarantee(is_simm13(offset) && is_even(offset), \ + "offset is invalid: is_simm_13: %s offset: " INT64_FORMAT, \ + BOOL_TO_STR(is_simm13(offset)), offset); \ Assembler::NAME(Rs1, Rs2, offset); \ } \ INSN_ENTRY_RELOC(void, NAME(Register Rs1, Register Rs2, address dest, relocInfo::relocType rtype)) \ @@ -772,6 +774,7 @@ public: void revb(Register Rd, Register Rs, Register tmp1 = t0, Register tmp2 = t1); // reverse bytes in doubleword void ror_imm(Register dst, Register src, uint32_t shift, Register tmp = t0); + void rolw_imm(Register dst, Register src, uint32_t, Register tmp = t0); void andi(Register Rd, Register Rn, int64_t imm, Register tmp = t0); void orptr(Address adr, RegisterOrConstant src, Register tmp1 = t0, Register tmp2 = t1); diff --git a/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp b/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp index 304e0a9c222..41bbb1bdcfd 100644 --- a/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp +++ b/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp @@ -3913,6 +3913,373 @@ class StubGenerator: public StubCodeGenerator { return start; } + // Set of L registers that correspond to a contiguous memory area. + // Each 64-bit register typically corresponds to 2 32-bit integers. + template + class RegCache { + private: + MacroAssembler *_masm; + Register _regs[L]; + + public: + RegCache(MacroAssembler *masm, RegSet rs): _masm(masm) { + assert(rs.size() == L, "%u registers are used to cache %u 4-byte data", rs.size(), 2 * L); + auto it = rs.begin(); + for (auto &r: _regs) { + r = *it; + ++it; + } + } + + void gen_loads(Register base) { + for (uint i = 0; i < L; i += 1) { + __ ld(_regs[i], Address(base, 8 * i)); + } + } + + // Generate code extracting i-th unsigned word (4 bytes). + void get_u32(Register dest, uint i, Register rmask32) { + assert(i < 2 * L, "invalid i: %u", i); + + if (i % 2 == 0) { + __ andr(dest, _regs[i / 2], rmask32); + } else { + __ srli(dest, _regs[i / 2], 32); + } + } + }; + + typedef RegCache<8> BufRegCache; + + // a += rtmp1 + x + ac; + // a = Integer.rotateLeft(a, s) + b; + void m5_FF_GG_HH_II_epilogue(BufRegCache& reg_cache, + Register a, Register b, Register c, Register d, + int k, int s, int t, + Register rtmp1, Register rtmp2, Register rmask32) { + // rtmp1 = rtmp1 + x + ac + reg_cache.get_u32(rtmp2, k, rmask32); + __ addw(rtmp1, rtmp1, rtmp2); + __ li(rtmp2, t); + __ addw(rtmp1, rtmp1, rtmp2); + + // a += rtmp1 + x + ac + __ addw(a, a, rtmp1); + + // a = Integer.rotateLeft(a, s) + b; + __ rolw_imm(a, a, s, rtmp1); + __ addw(a, a, b); + } + + // a += ((b & c) | ((~b) & d)) + x + ac; + // a = Integer.rotateLeft(a, s) + b; + void md5_FF(BufRegCache& reg_cache, + Register a, Register b, Register c, Register d, + int k, int s, int t, + Register rtmp1, Register rtmp2, Register rmask32) { + // rtmp1 = b & c + __ andr(rtmp1, b, c); + + // rtmp2 = (~b) & d + __ notr(rtmp2, b); + __ andr(rtmp2, rtmp2, d); + + // rtmp1 = (b & c) | ((~b) & d) + __ orr(rtmp1, rtmp1, rtmp2); + + m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, + rtmp1, rtmp2, rmask32); + } + + // a += ((b & d) | (c & (~d))) + x + ac; + // a = Integer.rotateLeft(a, s) + b; + void md5_GG(BufRegCache& reg_cache, + Register a, Register b, Register c, Register d, + int k, int s, int t, + Register rtmp1, Register rtmp2, Register rmask32) { + // rtmp1 = b & d + __ andr(rtmp1, b, d); + + // rtmp2 = (c & (~d)) + __ notr(rtmp2, d); + __ andr(rtmp2, rtmp2, c); + + // rtmp1 = (b & d) | (c & (~d)) + __ orr(rtmp1, rtmp1, rtmp2); + + m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, + rtmp1, rtmp2, rmask32); + } + + // a += ((b ^ c) ^ d) + x + ac; + // a = Integer.rotateLeft(a, s) + b; + void md5_HH(BufRegCache& reg_cache, + Register a, Register b, Register c, Register d, + int k, int s, int t, + Register rtmp1, Register rtmp2, Register rmask32) { + // rtmp1 = (b ^ c) ^ d + __ xorr(rtmp1, b, c); + __ xorr(rtmp1, rtmp1, d); + + m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, + rtmp1, rtmp2, rmask32); + } + + // a += (c ^ (b | (~d))) + x + ac; + // a = Integer.rotateLeft(a, s) + b; + void md5_II(BufRegCache& reg_cache, + Register a, Register b, Register c, Register d, + int k, int s, int t, + Register rtmp1, Register rtmp2, Register rmask32) { + // rtmp1 = c ^ (b | (~d)) + __ notr(rtmp2, d); + __ orr(rtmp1, b, rtmp2); + __ xorr(rtmp1, c, rtmp1); + + m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, + rtmp1, rtmp2, rmask32); + } + + // Arguments: + // + // Inputs: + // c_rarg0 - byte[] source+offset + // c_rarg1 - int[] SHA.state + // c_rarg2 - int offset (multi_block == True) + // c_rarg3 - int limit (multi_block == True) + // + // Registers: + // x0 zero (zero) + // x1 ra (return address) + // x2 sp (stack pointer) + // x3 gp (global pointer) + // x4 tp (thread pointer) + // x5 t0 state0 + // x6 t1 state1 + // x7 t2 state2 + // x8 f0/s0 (frame pointer) + // x9 s1 state3 [saved-reg] + // x10 a0 rtmp1 / c_rarg0 + // x11 a1 rtmp2 / c_rarg1 + // x12 a2 a / c_rarg2 + // x13 a3 b / c_rarg3 + // x14 a4 c + // x15 a5 d + // x16 a6 buf + // x17 a7 state + // x18 s2 ofs [saved-reg] (multi_block == True) + // x19 s3 limit [saved-reg] (multi_block == True) + // x20 s4 + // x21 s5 + // x22 s6 mask32 [saved-reg] + // x23 s7 + // x24 s8 buf0 [saved-reg] + // x25 s9 buf1 [saved-reg] + // x26 s10 buf2 [saved-reg] + // x27 s11 buf3 [saved-reg] + // x28 t3 buf4 + // x29 t4 buf5 + // x30 t5 buf6 + // x31 t6 buf7 + address generate_md5_implCompress(bool multi_block, const char *name) { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", name); + address start = __ pc(); + + // rotation constants + const int S11 = 7; + const int S12 = 12; + const int S13 = 17; + const int S14 = 22; + const int S21 = 5; + const int S22 = 9; + const int S23 = 14; + const int S24 = 20; + const int S31 = 4; + const int S32 = 11; + const int S33 = 16; + const int S34 = 23; + const int S41 = 6; + const int S42 = 10; + const int S43 = 15; + const int S44 = 21; + + Register buf_arg = c_rarg0; // a0 + Register state_arg = c_rarg1; // a1 + Register ofs_arg = c_rarg2; // a2 + Register limit_arg = c_rarg3; // a3 + + // we'll copy the args to these registers to free up a0-a3 + // to use for other values manipulated by instructions + // that can be compressed + Register buf = x16; // a6 + Register state = x17; // a7 + Register ofs = x18; // s2 + Register limit = x19; // s3 + + // using x12->15 to allow compressed instructions + Register a = x12; // a2 + Register b = x13; // a3 + Register c = x14; // a4 + Register d = x15; // a5 + + Register state0 = x5; // t0 + Register state1 = x6; // t1 + Register state2 = x7; // t2 + Register state3 = x9; // s1 + + // using x9->x11 to allow compressed instructions + Register rtmp1 = x10; // a0 + Register rtmp2 = x11; // a1 + + const int64_t MASK_32 = 0xffffffff; + Register rmask32 = x22; // s6 + + RegSet reg_cache_saved_regs = RegSet::of(x24, x25, x26, x27); // s8, s9, s10, s11 + RegSet reg_cache_regs; + reg_cache_regs += reg_cache_saved_regs; + reg_cache_regs += RegSet::of(x28, x29, x30, x31); // t3, t4, t5, t6 + BufRegCache reg_cache(_masm, reg_cache_regs); + + RegSet saved_regs; + if (multi_block) { + saved_regs += RegSet::of(ofs, limit); + } + saved_regs += RegSet::of(state3, rmask32); + saved_regs += reg_cache_saved_regs; + + __ push_reg(saved_regs, sp); + + __ mv(buf, buf_arg); + __ mv(state, state_arg); + if (multi_block) { + __ mv(ofs, ofs_arg); + __ mv(limit, limit_arg); + } + __ li(rmask32, MASK_32); + + // to minimize the number of memory operations: + // read the 4 state 4-byte values in pairs, with a single ld, + // and split them into 2 registers + __ ld(state0, Address(state)); + __ srli(state1, state0, 32); + __ andr(state0, state0, rmask32); + __ ld(state2, Address(state, 8)); + __ srli(state3, state2, 32); + __ andr(state2, state2, rmask32); + + Label md5_loop; + __ BIND(md5_loop); + + reg_cache.gen_loads(buf); + + __ mv(a, state0); + __ mv(b, state1); + __ mv(c, state2); + __ mv(d, state3); + + // Round 1 + md5_FF(reg_cache, a, b, c, d, 0, S11, 0xd76aa478, rtmp1, rtmp2, rmask32); + md5_FF(reg_cache, d, a, b, c, 1, S12, 0xe8c7b756, rtmp1, rtmp2, rmask32); + md5_FF(reg_cache, c, d, a, b, 2, S13, 0x242070db, rtmp1, rtmp2, rmask32); + md5_FF(reg_cache, b, c, d, a, 3, S14, 0xc1bdceee, rtmp1, rtmp2, rmask32); + md5_FF(reg_cache, a, b, c, d, 4, S11, 0xf57c0faf, rtmp1, rtmp2, rmask32); + md5_FF(reg_cache, d, a, b, c, 5, S12, 0x4787c62a, rtmp1, rtmp2, rmask32); + md5_FF(reg_cache, c, d, a, b, 6, S13, 0xa8304613, rtmp1, rtmp2, rmask32); + md5_FF(reg_cache, b, c, d, a, 7, S14, 0xfd469501, rtmp1, rtmp2, rmask32); + md5_FF(reg_cache, a, b, c, d, 8, S11, 0x698098d8, rtmp1, rtmp2, rmask32); + md5_FF(reg_cache, d, a, b, c, 9, S12, 0x8b44f7af, rtmp1, rtmp2, rmask32); + md5_FF(reg_cache, c, d, a, b, 10, S13, 0xffff5bb1, rtmp1, rtmp2, rmask32); + md5_FF(reg_cache, b, c, d, a, 11, S14, 0x895cd7be, rtmp1, rtmp2, rmask32); + md5_FF(reg_cache, a, b, c, d, 12, S11, 0x6b901122, rtmp1, rtmp2, rmask32); + md5_FF(reg_cache, d, a, b, c, 13, S12, 0xfd987193, rtmp1, rtmp2, rmask32); + md5_FF(reg_cache, c, d, a, b, 14, S13, 0xa679438e, rtmp1, rtmp2, rmask32); + md5_FF(reg_cache, b, c, d, a, 15, S14, 0x49b40821, rtmp1, rtmp2, rmask32); + + // Round 2 + md5_GG(reg_cache, a, b, c, d, 1, S21, 0xf61e2562, rtmp1, rtmp2, rmask32); + md5_GG(reg_cache, d, a, b, c, 6, S22, 0xc040b340, rtmp1, rtmp2, rmask32); + md5_GG(reg_cache, c, d, a, b, 11, S23, 0x265e5a51, rtmp1, rtmp2, rmask32); + md5_GG(reg_cache, b, c, d, a, 0, S24, 0xe9b6c7aa, rtmp1, rtmp2, rmask32); + md5_GG(reg_cache, a, b, c, d, 5, S21, 0xd62f105d, rtmp1, rtmp2, rmask32); + md5_GG(reg_cache, d, a, b, c, 10, S22, 0x02441453, rtmp1, rtmp2, rmask32); + md5_GG(reg_cache, c, d, a, b, 15, S23, 0xd8a1e681, rtmp1, rtmp2, rmask32); + md5_GG(reg_cache, b, c, d, a, 4, S24, 0xe7d3fbc8, rtmp1, rtmp2, rmask32); + md5_GG(reg_cache, a, b, c, d, 9, S21, 0x21e1cde6, rtmp1, rtmp2, rmask32); + md5_GG(reg_cache, d, a, b, c, 14, S22, 0xc33707d6, rtmp1, rtmp2, rmask32); + md5_GG(reg_cache, c, d, a, b, 3, S23, 0xf4d50d87, rtmp1, rtmp2, rmask32); + md5_GG(reg_cache, b, c, d, a, 8, S24, 0x455a14ed, rtmp1, rtmp2, rmask32); + md5_GG(reg_cache, a, b, c, d, 13, S21, 0xa9e3e905, rtmp1, rtmp2, rmask32); + md5_GG(reg_cache, d, a, b, c, 2, S22, 0xfcefa3f8, rtmp1, rtmp2, rmask32); + md5_GG(reg_cache, c, d, a, b, 7, S23, 0x676f02d9, rtmp1, rtmp2, rmask32); + md5_GG(reg_cache, b, c, d, a, 12, S24, 0x8d2a4c8a, rtmp1, rtmp2, rmask32); + + // Round 3 + md5_HH(reg_cache, a, b, c, d, 5, S31, 0xfffa3942, rtmp1, rtmp2, rmask32); + md5_HH(reg_cache, d, a, b, c, 8, S32, 0x8771f681, rtmp1, rtmp2, rmask32); + md5_HH(reg_cache, c, d, a, b, 11, S33, 0x6d9d6122, rtmp1, rtmp2, rmask32); + md5_HH(reg_cache, b, c, d, a, 14, S34, 0xfde5380c, rtmp1, rtmp2, rmask32); + md5_HH(reg_cache, a, b, c, d, 1, S31, 0xa4beea44, rtmp1, rtmp2, rmask32); + md5_HH(reg_cache, d, a, b, c, 4, S32, 0x4bdecfa9, rtmp1, rtmp2, rmask32); + md5_HH(reg_cache, c, d, a, b, 7, S33, 0xf6bb4b60, rtmp1, rtmp2, rmask32); + md5_HH(reg_cache, b, c, d, a, 10, S34, 0xbebfbc70, rtmp1, rtmp2, rmask32); + md5_HH(reg_cache, a, b, c, d, 13, S31, 0x289b7ec6, rtmp1, rtmp2, rmask32); + md5_HH(reg_cache, d, a, b, c, 0, S32, 0xeaa127fa, rtmp1, rtmp2, rmask32); + md5_HH(reg_cache, c, d, a, b, 3, S33, 0xd4ef3085, rtmp1, rtmp2, rmask32); + md5_HH(reg_cache, b, c, d, a, 6, S34, 0x04881d05, rtmp1, rtmp2, rmask32); + md5_HH(reg_cache, a, b, c, d, 9, S31, 0xd9d4d039, rtmp1, rtmp2, rmask32); + md5_HH(reg_cache, d, a, b, c, 12, S32, 0xe6db99e5, rtmp1, rtmp2, rmask32); + md5_HH(reg_cache, c, d, a, b, 15, S33, 0x1fa27cf8, rtmp1, rtmp2, rmask32); + md5_HH(reg_cache, b, c, d, a, 2, S34, 0xc4ac5665, rtmp1, rtmp2, rmask32); + + // Round 4 + md5_II(reg_cache, a, b, c, d, 0, S41, 0xf4292244, rtmp1, rtmp2, rmask32); + md5_II(reg_cache, d, a, b, c, 7, S42, 0x432aff97, rtmp1, rtmp2, rmask32); + md5_II(reg_cache, c, d, a, b, 14, S43, 0xab9423a7, rtmp1, rtmp2, rmask32); + md5_II(reg_cache, b, c, d, a, 5, S44, 0xfc93a039, rtmp1, rtmp2, rmask32); + md5_II(reg_cache, a, b, c, d, 12, S41, 0x655b59c3, rtmp1, rtmp2, rmask32); + md5_II(reg_cache, d, a, b, c, 3, S42, 0x8f0ccc92, rtmp1, rtmp2, rmask32); + md5_II(reg_cache, c, d, a, b, 10, S43, 0xffeff47d, rtmp1, rtmp2, rmask32); + md5_II(reg_cache, b, c, d, a, 1, S44, 0x85845dd1, rtmp1, rtmp2, rmask32); + md5_II(reg_cache, a, b, c, d, 8, S41, 0x6fa87e4f, rtmp1, rtmp2, rmask32); + md5_II(reg_cache, d, a, b, c, 15, S42, 0xfe2ce6e0, rtmp1, rtmp2, rmask32); + md5_II(reg_cache, c, d, a, b, 6, S43, 0xa3014314, rtmp1, rtmp2, rmask32); + md5_II(reg_cache, b, c, d, a, 13, S44, 0x4e0811a1, rtmp1, rtmp2, rmask32); + md5_II(reg_cache, a, b, c, d, 4, S41, 0xf7537e82, rtmp1, rtmp2, rmask32); + md5_II(reg_cache, d, a, b, c, 11, S42, 0xbd3af235, rtmp1, rtmp2, rmask32); + md5_II(reg_cache, c, d, a, b, 2, S43, 0x2ad7d2bb, rtmp1, rtmp2, rmask32); + md5_II(reg_cache, b, c, d, a, 9, S44, 0xeb86d391, rtmp1, rtmp2, rmask32); + + __ addw(state0, state0, a); + __ addw(state1, state1, b); + __ addw(state2, state2, c); + __ addw(state3, state3, d); + + if (multi_block) { + __ addi(buf, buf, 64); + __ addi(ofs, ofs, 64); + // if (ofs <= limit) goto m5_loop + __ bge(limit, ofs, md5_loop); + __ mv(c_rarg0, ofs); // return ofs + } + + // to minimize the number of memory operations: + // write back the 4 state 4-byte values in pairs, with a single sd + __ andr(state0, state0, rmask32); + __ slli(state1, state1, 32); + __ orr(state0, state0, state1); + __ sd(state0, Address(state)); + __ andr(state2, state2, rmask32); + __ slli(state3, state3, 32); + __ orr(state2, state2, state3); + __ sd(state2, Address(state, 8)); + + __ pop_reg(saved_regs, sp); + __ ret(); + + return (address) start; + } + #if INCLUDE_JFR static void jfr_prologue(address the_pc, MacroAssembler* _masm, Register thread) { @@ -4127,6 +4494,11 @@ class StubGenerator: public StubCodeGenerator { generate_compare_long_strings(); generate_string_indexof_stubs(); + + if (UseMD5Intrinsics) { + StubRoutines::_md5_implCompress = generate_md5_implCompress(false, "md5_implCompress"); + StubRoutines::_md5_implCompressMB = generate_md5_implCompress(true, "md5_implCompressMB"); + } #endif // COMPILER2_OR_JVMCI } diff --git a/src/hotspot/cpu/riscv/vm_version_riscv.cpp b/src/hotspot/cpu/riscv/vm_version_riscv.cpp index d9b46131d66..83f6f38d253 100644 --- a/src/hotspot/cpu/riscv/vm_version_riscv.cpp +++ b/src/hotspot/cpu/riscv/vm_version_riscv.cpp @@ -171,9 +171,8 @@ void VM_Version::initialize() { FLAG_SET_DEFAULT(UseCRC32CIntrinsics, false); } - if (UseMD5Intrinsics) { - warning("MD5 intrinsics are not available on this CPU."); - FLAG_SET_DEFAULT(UseMD5Intrinsics, false); + if (FLAG_IS_DEFAULT(UseMD5Intrinsics)) { + FLAG_SET_DEFAULT(UseMD5Intrinsics, true); } if (UseRVV) {