From fa331d71288f36ba67ee36af4c17ab80e9f2a23d Mon Sep 17 00:00:00 2001 From: Antonios Printezis Date: Wed, 8 Nov 2023 19:55:28 +0000 Subject: [PATCH] 8319184: RISC-V: improve MD5 intrinsic Reviewed-by: fyang, mli --- src/hotspot/cpu/riscv/stubGenerator_riscv.cpp | 249 +++++++++--------- 1 file changed, 127 insertions(+), 122 deletions(-) diff --git a/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp b/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp index 2431df1178d..a2815db2b4e 100644 --- a/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp +++ b/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp @@ -3932,43 +3932,46 @@ class StubGenerator: public StubCodeGenerator { } } - void gen_loads(Register base) { - for (uint i = 0; i < L; i += 1) { - __ ld(_regs[i], Address(base, 8 * i)); - } + // generate load for the i'th register + void gen_load(uint i, Register base) { + assert(i < L, "invalid i: %u", i); + __ ld(_regs[i], Address(base, 8 * i)); } - // Generate code extracting i-th unsigned word (4 bytes). - void get_u32(Register dest, uint i, Register rmask32) { + // add i'th 32-bit integer to dest + void add_u32(const Register dest, uint i, const Register rtmp = t0) { assert(i < 2 * L, "invalid i: %u", i); - if (i % 2 == 0) { - __ andr(dest, _regs[i / 2], rmask32); + if (is_even(i)) { + // Use the bottom 32 bits. No need to mask off the top 32 bits + // as addw will do the right thing. + __ addw(dest, dest, _regs[i / 2]); } else { - __ srli(dest, _regs[i / 2], 32); + // Use the top 32 bits by right-shifting them. + __ srli(rtmp, _regs[i / 2], 32); + __ addw(dest, dest, rtmp); } } }; typedef RegCache<8> BufRegCache; - // a += rtmp1 + x + ac; + // a += value + x + ac; // a = Integer.rotateLeft(a, s) + b; void m5_FF_GG_HH_II_epilogue(BufRegCache& reg_cache, Register a, Register b, Register c, Register d, int k, int s, int t, - Register rtmp1, Register rtmp2, Register rmask32) { - // rtmp1 = rtmp1 + x + ac - reg_cache.get_u32(rtmp2, k, rmask32); - __ addw(rtmp1, rtmp1, rtmp2); - __ mv(rtmp2, t); - __ addw(rtmp1, rtmp1, rtmp2); + Register value) { + // a += ac + __ addw(a, a, t, t1); - // a += rtmp1 + x + ac - __ addw(a, a, rtmp1); + // a += x; + reg_cache.add_u32(a, k); + // a += value; + __ addw(a, a, value); // a = Integer.rotateLeft(a, s) + b; - __ rolw_imm(a, a, s, rtmp1); + __ rolw_imm(a, a, s); __ addw(a, a, b); } @@ -3977,7 +3980,7 @@ class StubGenerator: public StubCodeGenerator { void md5_FF(BufRegCache& reg_cache, Register a, Register b, Register c, Register d, int k, int s, int t, - Register rtmp1, Register rtmp2, Register rmask32) { + Register rtmp1, Register rtmp2) { // rtmp1 = b & c __ andr(rtmp1, b, c); @@ -3987,8 +3990,7 @@ class StubGenerator: public StubCodeGenerator { // rtmp1 = (b & c) | ((~b) & d) __ orr(rtmp1, rtmp1, rtmp2); - m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, - rtmp1, rtmp2, rmask32); + m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1); } // a += ((b & d) | (c & (~d))) + x + ac; @@ -3996,7 +3998,7 @@ class StubGenerator: public StubCodeGenerator { void md5_GG(BufRegCache& reg_cache, Register a, Register b, Register c, Register d, int k, int s, int t, - Register rtmp1, Register rtmp2, Register rmask32) { + Register rtmp1, Register rtmp2) { // rtmp1 = b & d __ andr(rtmp1, b, d); @@ -4006,8 +4008,7 @@ class StubGenerator: public StubCodeGenerator { // rtmp1 = (b & d) | (c & (~d)) __ orr(rtmp1, rtmp1, rtmp2); - m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, - rtmp1, rtmp2, rmask32); + m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1); } // a += ((b ^ c) ^ d) + x + ac; @@ -4015,13 +4016,12 @@ class StubGenerator: public StubCodeGenerator { void md5_HH(BufRegCache& reg_cache, Register a, Register b, Register c, Register d, int k, int s, int t, - Register rtmp1, Register rtmp2, Register rmask32) { + Register rtmp1, Register rtmp2) { // rtmp1 = (b ^ c) ^ d - __ xorr(rtmp1, b, c); - __ xorr(rtmp1, rtmp1, d); + __ xorr(rtmp2, b, c); + __ xorr(rtmp1, rtmp2, d); - m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, - rtmp1, rtmp2, rmask32); + m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1); } // a += (c ^ (b | (~d))) + x + ac; @@ -4029,13 +4029,12 @@ class StubGenerator: public StubCodeGenerator { void md5_II(BufRegCache& reg_cache, Register a, Register b, Register c, Register d, int k, int s, int t, - Register rtmp1, Register rtmp2, Register rmask32) { + Register rtmp1, Register rtmp2) { // rtmp1 = c ^ (b | (~d)) - __ orn(rtmp1, b, d); - __ xorr(rtmp1, c, rtmp1); + __ orn(rtmp2, b, d); + __ xorr(rtmp1, c, rtmp2); - m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, - rtmp1, rtmp2, rmask32); + m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1); } // Arguments: @@ -4052,11 +4051,11 @@ class StubGenerator: public StubCodeGenerator { // x2 sp (stack pointer) // x3 gp (global pointer) // x4 tp (thread pointer) - // x5 t0 state0 - // x6 t1 state1 - // x7 t2 state2 + // x5 t0 (tmp register) + // x6 t1 (tmp register) + // x7 t2 state0 // x8 f0/s0 (frame pointer) - // x9 s1 state3 [saved-reg] + // x9 s1 // x10 a0 rtmp1 / c_rarg0 // x11 a1 rtmp2 / c_rarg1 // x12 a2 a / c_rarg2 @@ -4067,9 +4066,9 @@ class StubGenerator: public StubCodeGenerator { // x17 a7 state // x18 s2 ofs [saved-reg] (multi_block == True) // x19 s3 limit [saved-reg] (multi_block == True) - // x20 s4 - // x21 s5 - // x22 s6 mask32 [saved-reg] + // x20 s4 state1 [saved-reg] + // x21 s5 state2 [saved-reg] + // x22 s6 state3 [saved-reg] // x23 s7 // x24 s8 buf0 [saved-reg] // x25 s9 buf1 [saved-reg] @@ -4102,6 +4101,8 @@ class StubGenerator: public StubCodeGenerator { const int S43 = 15; const int S44 = 21; + const int64_t mask32 = 0xffffffff; + Register buf_arg = c_rarg0; // a0 Register state_arg = c_rarg1; // a1 Register ofs_arg = c_rarg2; // a2 @@ -4121,18 +4122,15 @@ class StubGenerator: public StubCodeGenerator { Register c = x14; // a4 Register d = x15; // a5 - Register state0 = x5; // t0 - Register state1 = x6; // t1 - Register state2 = x7; // t2 - Register state3 = x9; // s1 + Register state0 = x7; // t2 + Register state1 = x20; // s4 + Register state2 = x21; // s5 + Register state3 = x22; // s6 - // using x9->x11 to allow compressed instructions + // using x10->x11 to allow compressed instructions Register rtmp1 = x10; // a0 Register rtmp2 = x11; // a1 - const int64_t MASK_32 = 0xffffffff; - Register rmask32 = x22; // s6 - RegSet reg_cache_saved_regs = RegSet::of(x24, x25, x26, x27); // s8, s9, s10, s11 RegSet reg_cache_regs; reg_cache_regs += reg_cache_saved_regs; @@ -4143,7 +4141,7 @@ class StubGenerator: public StubCodeGenerator { if (multi_block) { saved_regs += RegSet::of(ofs, limit); } - saved_regs += RegSet::of(state3, rmask32); + saved_regs += RegSet::of(state1, state2, state3); saved_regs += reg_cache_saved_regs; __ push_reg(saved_regs, sp); @@ -4154,99 +4152,105 @@ class StubGenerator: public StubCodeGenerator { __ mv(ofs, ofs_arg); __ mv(limit, limit_arg); } - __ mv(rmask32, MASK_32); // to minimize the number of memory operations: // read the 4 state 4-byte values in pairs, with a single ld, // and split them into 2 registers + __ mv(t0, mask32); __ ld(state0, Address(state)); __ srli(state1, state0, 32); - __ andr(state0, state0, rmask32); + __ andr(state0, state0, t0); __ ld(state2, Address(state, 8)); __ srli(state3, state2, 32); - __ andr(state2, state2, rmask32); + __ andr(state2, state2, t0); Label md5_loop; __ BIND(md5_loop); - reg_cache.gen_loads(buf); - __ mv(a, state0); __ mv(b, state1); __ mv(c, state2); __ mv(d, state3); // Round 1 - md5_FF(reg_cache, a, b, c, d, 0, S11, 0xd76aa478, rtmp1, rtmp2, rmask32); - md5_FF(reg_cache, d, a, b, c, 1, S12, 0xe8c7b756, rtmp1, rtmp2, rmask32); - md5_FF(reg_cache, c, d, a, b, 2, S13, 0x242070db, rtmp1, rtmp2, rmask32); - md5_FF(reg_cache, b, c, d, a, 3, S14, 0xc1bdceee, rtmp1, rtmp2, rmask32); - md5_FF(reg_cache, a, b, c, d, 4, S11, 0xf57c0faf, rtmp1, rtmp2, rmask32); - md5_FF(reg_cache, d, a, b, c, 5, S12, 0x4787c62a, rtmp1, rtmp2, rmask32); - md5_FF(reg_cache, c, d, a, b, 6, S13, 0xa8304613, rtmp1, rtmp2, rmask32); - md5_FF(reg_cache, b, c, d, a, 7, S14, 0xfd469501, rtmp1, rtmp2, rmask32); - md5_FF(reg_cache, a, b, c, d, 8, S11, 0x698098d8, rtmp1, rtmp2, rmask32); - md5_FF(reg_cache, d, a, b, c, 9, S12, 0x8b44f7af, rtmp1, rtmp2, rmask32); - md5_FF(reg_cache, c, d, a, b, 10, S13, 0xffff5bb1, rtmp1, rtmp2, rmask32); - md5_FF(reg_cache, b, c, d, a, 11, S14, 0x895cd7be, rtmp1, rtmp2, rmask32); - md5_FF(reg_cache, a, b, c, d, 12, S11, 0x6b901122, rtmp1, rtmp2, rmask32); - md5_FF(reg_cache, d, a, b, c, 13, S12, 0xfd987193, rtmp1, rtmp2, rmask32); - md5_FF(reg_cache, c, d, a, b, 14, S13, 0xa679438e, rtmp1, rtmp2, rmask32); - md5_FF(reg_cache, b, c, d, a, 15, S14, 0x49b40821, rtmp1, rtmp2, rmask32); + reg_cache.gen_load(0, buf); + md5_FF(reg_cache, a, b, c, d, 0, S11, 0xd76aa478, rtmp1, rtmp2); + md5_FF(reg_cache, d, a, b, c, 1, S12, 0xe8c7b756, rtmp1, rtmp2); + reg_cache.gen_load(1, buf); + md5_FF(reg_cache, c, d, a, b, 2, S13, 0x242070db, rtmp1, rtmp2); + md5_FF(reg_cache, b, c, d, a, 3, S14, 0xc1bdceee, rtmp1, rtmp2); + reg_cache.gen_load(2, buf); + md5_FF(reg_cache, a, b, c, d, 4, S11, 0xf57c0faf, rtmp1, rtmp2); + md5_FF(reg_cache, d, a, b, c, 5, S12, 0x4787c62a, rtmp1, rtmp2); + reg_cache.gen_load(3, buf); + md5_FF(reg_cache, c, d, a, b, 6, S13, 0xa8304613, rtmp1, rtmp2); + md5_FF(reg_cache, b, c, d, a, 7, S14, 0xfd469501, rtmp1, rtmp2); + reg_cache.gen_load(4, buf); + md5_FF(reg_cache, a, b, c, d, 8, S11, 0x698098d8, rtmp1, rtmp2); + md5_FF(reg_cache, d, a, b, c, 9, S12, 0x8b44f7af, rtmp1, rtmp2); + reg_cache.gen_load(5, buf); + md5_FF(reg_cache, c, d, a, b, 10, S13, 0xffff5bb1, rtmp1, rtmp2); + md5_FF(reg_cache, b, c, d, a, 11, S14, 0x895cd7be, rtmp1, rtmp2); + reg_cache.gen_load(6, buf); + md5_FF(reg_cache, a, b, c, d, 12, S11, 0x6b901122, rtmp1, rtmp2); + md5_FF(reg_cache, d, a, b, c, 13, S12, 0xfd987193, rtmp1, rtmp2); + reg_cache.gen_load(7, buf); + md5_FF(reg_cache, c, d, a, b, 14, S13, 0xa679438e, rtmp1, rtmp2); + md5_FF(reg_cache, b, c, d, a, 15, S14, 0x49b40821, rtmp1, rtmp2); // Round 2 - md5_GG(reg_cache, a, b, c, d, 1, S21, 0xf61e2562, rtmp1, rtmp2, rmask32); - md5_GG(reg_cache, d, a, b, c, 6, S22, 0xc040b340, rtmp1, rtmp2, rmask32); - md5_GG(reg_cache, c, d, a, b, 11, S23, 0x265e5a51, rtmp1, rtmp2, rmask32); - md5_GG(reg_cache, b, c, d, a, 0, S24, 0xe9b6c7aa, rtmp1, rtmp2, rmask32); - md5_GG(reg_cache, a, b, c, d, 5, S21, 0xd62f105d, rtmp1, rtmp2, rmask32); - md5_GG(reg_cache, d, a, b, c, 10, S22, 0x02441453, rtmp1, rtmp2, rmask32); - md5_GG(reg_cache, c, d, a, b, 15, S23, 0xd8a1e681, rtmp1, rtmp2, rmask32); - md5_GG(reg_cache, b, c, d, a, 4, S24, 0xe7d3fbc8, rtmp1, rtmp2, rmask32); - md5_GG(reg_cache, a, b, c, d, 9, S21, 0x21e1cde6, rtmp1, rtmp2, rmask32); - md5_GG(reg_cache, d, a, b, c, 14, S22, 0xc33707d6, rtmp1, rtmp2, rmask32); - md5_GG(reg_cache, c, d, a, b, 3, S23, 0xf4d50d87, rtmp1, rtmp2, rmask32); - md5_GG(reg_cache, b, c, d, a, 8, S24, 0x455a14ed, rtmp1, rtmp2, rmask32); - md5_GG(reg_cache, a, b, c, d, 13, S21, 0xa9e3e905, rtmp1, rtmp2, rmask32); - md5_GG(reg_cache, d, a, b, c, 2, S22, 0xfcefa3f8, rtmp1, rtmp2, rmask32); - md5_GG(reg_cache, c, d, a, b, 7, S23, 0x676f02d9, rtmp1, rtmp2, rmask32); - md5_GG(reg_cache, b, c, d, a, 12, S24, 0x8d2a4c8a, rtmp1, rtmp2, rmask32); + md5_GG(reg_cache, a, b, c, d, 1, S21, 0xf61e2562, rtmp1, rtmp2); + md5_GG(reg_cache, d, a, b, c, 6, S22, 0xc040b340, rtmp1, rtmp2); + md5_GG(reg_cache, c, d, a, b, 11, S23, 0x265e5a51, rtmp1, rtmp2); + md5_GG(reg_cache, b, c, d, a, 0, S24, 0xe9b6c7aa, rtmp1, rtmp2); + md5_GG(reg_cache, a, b, c, d, 5, S21, 0xd62f105d, rtmp1, rtmp2); + md5_GG(reg_cache, d, a, b, c, 10, S22, 0x02441453, rtmp1, rtmp2); + md5_GG(reg_cache, c, d, a, b, 15, S23, 0xd8a1e681, rtmp1, rtmp2); + md5_GG(reg_cache, b, c, d, a, 4, S24, 0xe7d3fbc8, rtmp1, rtmp2); + md5_GG(reg_cache, a, b, c, d, 9, S21, 0x21e1cde6, rtmp1, rtmp2); + md5_GG(reg_cache, d, a, b, c, 14, S22, 0xc33707d6, rtmp1, rtmp2); + md5_GG(reg_cache, c, d, a, b, 3, S23, 0xf4d50d87, rtmp1, rtmp2); + md5_GG(reg_cache, b, c, d, a, 8, S24, 0x455a14ed, rtmp1, rtmp2); + md5_GG(reg_cache, a, b, c, d, 13, S21, 0xa9e3e905, rtmp1, rtmp2); + md5_GG(reg_cache, d, a, b, c, 2, S22, 0xfcefa3f8, rtmp1, rtmp2); + md5_GG(reg_cache, c, d, a, b, 7, S23, 0x676f02d9, rtmp1, rtmp2); + md5_GG(reg_cache, b, c, d, a, 12, S24, 0x8d2a4c8a, rtmp1, rtmp2); // Round 3 - md5_HH(reg_cache, a, b, c, d, 5, S31, 0xfffa3942, rtmp1, rtmp2, rmask32); - md5_HH(reg_cache, d, a, b, c, 8, S32, 0x8771f681, rtmp1, rtmp2, rmask32); - md5_HH(reg_cache, c, d, a, b, 11, S33, 0x6d9d6122, rtmp1, rtmp2, rmask32); - md5_HH(reg_cache, b, c, d, a, 14, S34, 0xfde5380c, rtmp1, rtmp2, rmask32); - md5_HH(reg_cache, a, b, c, d, 1, S31, 0xa4beea44, rtmp1, rtmp2, rmask32); - md5_HH(reg_cache, d, a, b, c, 4, S32, 0x4bdecfa9, rtmp1, rtmp2, rmask32); - md5_HH(reg_cache, c, d, a, b, 7, S33, 0xf6bb4b60, rtmp1, rtmp2, rmask32); - md5_HH(reg_cache, b, c, d, a, 10, S34, 0xbebfbc70, rtmp1, rtmp2, rmask32); - md5_HH(reg_cache, a, b, c, d, 13, S31, 0x289b7ec6, rtmp1, rtmp2, rmask32); - md5_HH(reg_cache, d, a, b, c, 0, S32, 0xeaa127fa, rtmp1, rtmp2, rmask32); - md5_HH(reg_cache, c, d, a, b, 3, S33, 0xd4ef3085, rtmp1, rtmp2, rmask32); - md5_HH(reg_cache, b, c, d, a, 6, S34, 0x04881d05, rtmp1, rtmp2, rmask32); - md5_HH(reg_cache, a, b, c, d, 9, S31, 0xd9d4d039, rtmp1, rtmp2, rmask32); - md5_HH(reg_cache, d, a, b, c, 12, S32, 0xe6db99e5, rtmp1, rtmp2, rmask32); - md5_HH(reg_cache, c, d, a, b, 15, S33, 0x1fa27cf8, rtmp1, rtmp2, rmask32); - md5_HH(reg_cache, b, c, d, a, 2, S34, 0xc4ac5665, rtmp1, rtmp2, rmask32); + md5_HH(reg_cache, a, b, c, d, 5, S31, 0xfffa3942, rtmp1, rtmp2); + md5_HH(reg_cache, d, a, b, c, 8, S32, 0x8771f681, rtmp1, rtmp2); + md5_HH(reg_cache, c, d, a, b, 11, S33, 0x6d9d6122, rtmp1, rtmp2); + md5_HH(reg_cache, b, c, d, a, 14, S34, 0xfde5380c, rtmp1, rtmp2); + md5_HH(reg_cache, a, b, c, d, 1, S31, 0xa4beea44, rtmp1, rtmp2); + md5_HH(reg_cache, d, a, b, c, 4, S32, 0x4bdecfa9, rtmp1, rtmp2); + md5_HH(reg_cache, c, d, a, b, 7, S33, 0xf6bb4b60, rtmp1, rtmp2); + md5_HH(reg_cache, b, c, d, a, 10, S34, 0xbebfbc70, rtmp1, rtmp2); + md5_HH(reg_cache, a, b, c, d, 13, S31, 0x289b7ec6, rtmp1, rtmp2); + md5_HH(reg_cache, d, a, b, c, 0, S32, 0xeaa127fa, rtmp1, rtmp2); + md5_HH(reg_cache, c, d, a, b, 3, S33, 0xd4ef3085, rtmp1, rtmp2); + md5_HH(reg_cache, b, c, d, a, 6, S34, 0x04881d05, rtmp1, rtmp2); + md5_HH(reg_cache, a, b, c, d, 9, S31, 0xd9d4d039, rtmp1, rtmp2); + md5_HH(reg_cache, d, a, b, c, 12, S32, 0xe6db99e5, rtmp1, rtmp2); + md5_HH(reg_cache, c, d, a, b, 15, S33, 0x1fa27cf8, rtmp1, rtmp2); + md5_HH(reg_cache, b, c, d, a, 2, S34, 0xc4ac5665, rtmp1, rtmp2); // Round 4 - md5_II(reg_cache, a, b, c, d, 0, S41, 0xf4292244, rtmp1, rtmp2, rmask32); - md5_II(reg_cache, d, a, b, c, 7, S42, 0x432aff97, rtmp1, rtmp2, rmask32); - md5_II(reg_cache, c, d, a, b, 14, S43, 0xab9423a7, rtmp1, rtmp2, rmask32); - md5_II(reg_cache, b, c, d, a, 5, S44, 0xfc93a039, rtmp1, rtmp2, rmask32); - md5_II(reg_cache, a, b, c, d, 12, S41, 0x655b59c3, rtmp1, rtmp2, rmask32); - md5_II(reg_cache, d, a, b, c, 3, S42, 0x8f0ccc92, rtmp1, rtmp2, rmask32); - md5_II(reg_cache, c, d, a, b, 10, S43, 0xffeff47d, rtmp1, rtmp2, rmask32); - md5_II(reg_cache, b, c, d, a, 1, S44, 0x85845dd1, rtmp1, rtmp2, rmask32); - md5_II(reg_cache, a, b, c, d, 8, S41, 0x6fa87e4f, rtmp1, rtmp2, rmask32); - md5_II(reg_cache, d, a, b, c, 15, S42, 0xfe2ce6e0, rtmp1, rtmp2, rmask32); - md5_II(reg_cache, c, d, a, b, 6, S43, 0xa3014314, rtmp1, rtmp2, rmask32); - md5_II(reg_cache, b, c, d, a, 13, S44, 0x4e0811a1, rtmp1, rtmp2, rmask32); - md5_II(reg_cache, a, b, c, d, 4, S41, 0xf7537e82, rtmp1, rtmp2, rmask32); - md5_II(reg_cache, d, a, b, c, 11, S42, 0xbd3af235, rtmp1, rtmp2, rmask32); - md5_II(reg_cache, c, d, a, b, 2, S43, 0x2ad7d2bb, rtmp1, rtmp2, rmask32); - md5_II(reg_cache, b, c, d, a, 9, S44, 0xeb86d391, rtmp1, rtmp2, rmask32); + md5_II(reg_cache, a, b, c, d, 0, S41, 0xf4292244, rtmp1, rtmp2); + md5_II(reg_cache, d, a, b, c, 7, S42, 0x432aff97, rtmp1, rtmp2); + md5_II(reg_cache, c, d, a, b, 14, S43, 0xab9423a7, rtmp1, rtmp2); + md5_II(reg_cache, b, c, d, a, 5, S44, 0xfc93a039, rtmp1, rtmp2); + md5_II(reg_cache, a, b, c, d, 12, S41, 0x655b59c3, rtmp1, rtmp2); + md5_II(reg_cache, d, a, b, c, 3, S42, 0x8f0ccc92, rtmp1, rtmp2); + md5_II(reg_cache, c, d, a, b, 10, S43, 0xffeff47d, rtmp1, rtmp2); + md5_II(reg_cache, b, c, d, a, 1, S44, 0x85845dd1, rtmp1, rtmp2); + md5_II(reg_cache, a, b, c, d, 8, S41, 0x6fa87e4f, rtmp1, rtmp2); + md5_II(reg_cache, d, a, b, c, 15, S42, 0xfe2ce6e0, rtmp1, rtmp2); + md5_II(reg_cache, c, d, a, b, 6, S43, 0xa3014314, rtmp1, rtmp2); + md5_II(reg_cache, b, c, d, a, 13, S44, 0x4e0811a1, rtmp1, rtmp2); + md5_II(reg_cache, a, b, c, d, 4, S41, 0xf7537e82, rtmp1, rtmp2); + md5_II(reg_cache, d, a, b, c, 11, S42, 0xbd3af235, rtmp1, rtmp2); + md5_II(reg_cache, c, d, a, b, 2, S43, 0x2ad7d2bb, rtmp1, rtmp2); + md5_II(reg_cache, b, c, d, a, 9, S44, 0xeb86d391, rtmp1, rtmp2); __ addw(state0, state0, a); __ addw(state1, state1, b); @@ -4263,11 +4267,12 @@ class StubGenerator: public StubCodeGenerator { // to minimize the number of memory operations: // write back the 4 state 4-byte values in pairs, with a single sd - __ andr(state0, state0, rmask32); + __ mv(t0, mask32); + __ andr(state0, state0, t0); __ slli(state1, state1, 32); __ orr(state0, state0, state1); __ sd(state0, Address(state)); - __ andr(state2, state2, rmask32); + __ andr(state2, state2, t0); __ slli(state3, state3, 32); __ orr(state2, state2, state3); __ sd(state2, Address(state, 8));