diff --git a/src/hotspot/cpu/aarch64/assembler_aarch64.hpp b/src/hotspot/cpu/aarch64/assembler_aarch64.hpp
index a8f378e524f..25b5fcd6b4a 100644
--- a/src/hotspot/cpu/aarch64/assembler_aarch64.hpp
+++ b/src/hotspot/cpu/aarch64/assembler_aarch64.hpp
@@ -3150,6 +3150,20 @@ private:
     f(1, 21), rf(Vm, 16), f(0b111000, 15, 10), rf(Vn, 5), rf(Vd, 0);
   }
 
+  // Vector by element variant of UMULL
+  void _umullv(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn,
+               SIMD_Arrangement Tb, FloatRegister Vm, SIMD_RegVariant Ts, int lane) {
+    starti;
+    int size = (Ta == T4S) ? 0b01 : 0b10;
+    int q = (Tb == T4H || Tb == T2S) ? 0 : 1;
+    int h = (size == 0b01) ? ((lane >> 2) & 1) : ((lane >> 1) & 1);
+    int l = (size == 0b01) ? ((lane >> 1) & 1) : (lane & 1);
+    int m = lane & 1;
+    assert(size == 0b10 ? lane < 4 : lane < 8, "invalid lane index");
+    f(0, 31), f(q, 30), f(1, 29), f(0b01111, 28, 24), f(size, 23, 22), f(l, 21), f(m, 20);
+    rf(Vm, 16), f(0b1010, 15, 12), f(h, 11), f(0, 10), rf(Vn, 5), rf(Vd, 0);
+  }
+
 public:
   void pmull(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn, FloatRegister Vm, SIMD_Arrangement Tb) {
     assert(Tb == T1D || Tb == T8B, "pmull assumes T1D or T8B as the second size specifier");
@@ -3161,6 +3175,23 @@ public:
     _pmull(Vd, Ta, Vn, Vm, Tb);
   }
 
+  // Vector by element variant of UMULL
+  void umullv(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn,
+              SIMD_Arrangement Tb, FloatRegister Vm, SIMD_RegVariant Ts, int lane) {
+    assert(Tb == T8B || Tb == T4H || Tb == T2S, "umullv assumes T8B, T4H, or T2S as the Tb size specifier");
+    assert(Ts == H || Ts == S, "umullv assumes H or S as the RegVariant for Ts");
+    assert(Ts == H ? Tb == T4H : Tb == T2S, "umullv assumes Tb is T4H when Ts is H, and T2S when Ts is S");
+    _umullv(Vd, Ta, Vn, Tb, Vm, Ts, lane);
+  }
+
+  void umull2v(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn,
+               SIMD_Arrangement Tb, FloatRegister Vm, SIMD_RegVariant Ts, int lane) {
+    assert(Tb == T16B || Tb == T8H || Tb == T4S, "umull2v assumes T16B, T8H, or T4S as the size specifier");
+    assert(Ts == H || Ts == S, "umull2v assumes H or S as the RegVariant for Ts");
+    assert(Ts == H ? Tb == T8H : Tb == T4S, "umull2v assumes Tb is T8H when Ts is H, and T4S when Ts is S");
+    _umullv(Vd, Ta, Vn, Tb, Vm, Ts, lane);
+  }
+
   void uqxtn(FloatRegister Vd, SIMD_Arrangement Tb, FloatRegister Vn, SIMD_Arrangement Ta) {
     starti;
     int size_b = (int)Tb >> 1;
diff --git a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp
index 3a8b8764567..e93d528b5cb 100644
--- a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp
+++ b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp
@@ -7140,6 +7140,15 @@ class StubGenerator: public StubCodeGenerator {
     return start;
   }
 
+  // P256 Montgomery Multiplication.
+ // Implements the method protected void mult(long[] a, long[] b, long[] r) {} + // of the sun.security.util.math.intpoly.MontgomeryIntegerPolynomialP256 class + // + // a (long[5]) = c_rarg0 + // b (long[5]) = c_rarg1 + // r (long[5]) = c_rarg2 + // + // Note that each arg represents a 256-bit integer broken into 52-bit limbs address generate_intpoly_montgomeryMult_P256() { __ align(CodeEntryAlignment); @@ -7152,172 +7161,230 @@ class StubGenerator: public StubCodeGenerator { const Register b = c_rarg1; const Register result = c_rarg2; + //Omit 3rd limb of modulus since it is 0 static const int64_t modulus[5] = { - 0x000fffffffffffffL, 0x00000fffffffffffL, 0x0000000000000000L, - 0x0000001000000000L, 0x0000ffffffff0000L + 0x000fffffffffffffL, 0x00000fffffffffffL, + 0x0000001000000000L, 0x0000ffffffff0000L }; - Register c_ptr = r9; - Register a_i = r10; - Register c_idx = r10; //c_idx is not used at the same time as a_i - Register limb_mask_scalar = r11; - Register b_j = r12; - Register mod_j = r12; - Register mod_ptr = r13; - Register mul_tmp = r14; - Register n = r15; + int shift1 = 12; // 64 - bits per limb + int shift2 = 52; // bits per limb - FloatRegister low_01 = v16; - FloatRegister low_23 = v17; - FloatRegister low_4x = v18; - FloatRegister high_01 = v19; - FloatRegister high_23 = v20; - FloatRegister high_4x = v21; - FloatRegister modmul_low = v22; - FloatRegister modmul_high = v23; - FloatRegister c_01 = v24; - FloatRegister c_23 = v25; - FloatRegister limb_mask = v28; - FloatRegister tmp = v29; + // GPRs that are used throughout loop + Register b_j = r3; + Register mod_ptr = r4; + Register limb_mask_scalar = r5; + Register c_ptr = r6; - int shift1 = 12; - int shift2 = 52; //bits per limb + // These neon registers remain constant through the main loop + FloatRegister limb_mask = v0; + FloatRegister mask_32_vec = v1; + FloatRegister b_lows = v2; + FloatRegister b_highs = v3; + FloatRegister mod_lows = v4; + FloatRegister mod_highs = v5; // Push callee saved registers on to the stack RegSet callee_saved = RegSet::range(r19, r28); __ push(callee_saved, sp); - + // Allocate space on the stack for carry values and zero memory __ sub(sp, sp, 80); __ mov(c_ptr, sp); - __ eor(a_i, a_i, a_i); + __ eor(b_j, b_j, b_j); //Create a 0 reg to clear memory for (int i = 0; i < 10; i++) { - __ str(a_i, Address(sp, i * 8)); + __ str(b_j, Address(sp, i * 8)); } // Calculate limb mask - __ mov(limb_mask_scalar, -UCONST64(1) >> (64 - shift2)); - __ dup(limb_mask, __ T2D, limb_mask_scalar); + __ mov(limb_mask_scalar, -UCONST64(1) >> (64 - shift2)); + __ dup(limb_mask, __ T2D, limb_mask_scalar); - // Get pointer for modulus + // Calculate 32-bit mask + { + Register mask_32 = r7; + __ mov(mask_32, (UCONST64(1) << 32) - 1); + __ dup(mask_32_vec, __ T2D, mask_32); + } + + // Load modulus and input array b __ lea(mod_ptr, ExternalAddress((address)modulus)); + __ ld2(b_lows, b_highs, __ T4S, Address(b)); + __ ld2(mod_lows, mod_highs, __ T4S, Address(mod_ptr)); + __ ldr(b_j, Address(b, 32)); for (int i = 0; i < 5; i++) { - // Load a_i into scalar_mult register and increment by 64 bits - __ ldr(a_i, Address(a, i * 8)); + Register c_idx = r10; + Register mul_tmp = r11; + Register scalar_ai = r12; - // Iterate through b, multiplying each limb by a_i + FloatRegister A = v6; + FloatRegister B = v7; + FloatRegister C = v8; + FloatRegister D = v16; + FloatRegister a_i = v17; + FloatRegister n = v18; + FloatRegister middle = v19; + FloatRegister tmp = v20; + FloatRegister modmul_low = v21; + FloatRegister 
modmul_high = v22; + FloatRegister c_01 = v23; + FloatRegister c_23 = v24; + FloatRegister low_34 = v25; + FloatRegister low_01 = v26; + FloatRegister low_23 = v27; + FloatRegister low_4x = v28; + FloatRegister high_01 = v29; + FloatRegister high_23 = v30; + FloatRegister high_4x = v31; + + // Load a_i and increment by 8 bytes + __ ldr(scalar_ai, a); + __ ld1(a_i, __ D, 0, __ post(a, 8)); + + // Start computing final multiply with GPR since it is not + // worth it to vectorize a single mult + __ mul(mul_tmp, scalar_ai, b_j); + __ mov(low_4x, Assembler::D, 0, mul_tmp); + __ umulh(mul_tmp, scalar_ai, b_j); + __ mov(high_4x, Assembler::D, 0, mul_tmp); + + // Iterate through b, multiplying each limb by a_i // storing low and high parts in separate vectors. // Compute high[i] = high[i] << shift1 | (low[i] >>> shift2) // and low[i] &= LIMB_MASK - __ ldr(b_j, Address(b)); - __ mul(mul_tmp, a_i, b_j); - __ mov(low_01, Assembler::D, 0, mul_tmp); - __ umulh(mul_tmp, a_i, b_j); - __ mov(high_01, Assembler::D, 0, mul_tmp); - __ ldr(b_j, Address(b, 8)); - __ mul(mul_tmp, a_i, b_j); - __ mov(low_01, Assembler::D, 1, mul_tmp); - __ umulh(mul_tmp, a_i, b_j); - __ mov(high_01, Assembler::D, 1, mul_tmp); - + // Calculus low_01 and high_01 + __ umullv(A, __ T2D, b_lows, __ T2S, a_i, __ S, 0); + __ umullv(B, __ T2D, b_highs, __ T2S, a_i, __ S, 0); + __ umullv(C, __ T2D, b_lows, __ T2S, a_i, __ S, 1); + __ umullv(D, __ T2D, b_highs, __ T2S, a_i, __ S, 1); + + __ andr(middle, __ T16B, B, mask_32_vec); + __ ushr(tmp, __ T2D, A, 32); + __ addv(middle, __ T2D, middle, tmp); + __ addv(middle, __ T2D, middle, C); + + __ shl(low_01, __ T2D, middle, 32); + __ andr(tmp, __ T16B, A, mask_32_vec); + __ orr(low_01, __ T16B, low_01, tmp); + + __ ushr(high_01, __ T2D, middle, 32); + __ addv(high_01, __ T2D, high_01, D); + __ ushr(tmp, __ T2D, B, 32); + __ addv(high_01, __ T2D, high_01, tmp); + __ shl(high_01, __ T2D, high_01, shift1); __ ushr(tmp, __ T2D, low_01, shift2); __ orr(high_01, __ T16B, high_01, tmp); - __ andr(low_01, __ T2D, low_01, limb_mask); - - __ ldr(b_j, Address(b, 16)); - __ mul(mul_tmp, a_i, b_j); - __ mov(low_23, Assembler::D, 0, mul_tmp); - __ umulh(mul_tmp, a_i, b_j); - __ mov(high_23, Assembler::D, 0, mul_tmp); + __ andr(low_01, __ T16B, low_01, limb_mask); - __ ldr(b_j, Address(b, 24)); - __ mul(mul_tmp, a_i, b_j); - __ mov(low_23, Assembler::D, 1, mul_tmp); - __ umulh(mul_tmp, a_i, b_j); - __ mov(high_23, Assembler::D, 1, mul_tmp); + // Calculate low_23 and high_23 + __ umull2v(A, __ T2D, b_lows, __ T4S, a_i, __ S, 0); + __ umull2v(B, __ T2D, b_highs, __ T4S, a_i, __ S, 0); + __ umull2v(C, __ T2D, b_lows, __ T4S, a_i, __ S, 1); + __ umull2v(D, __ T2D, b_highs, __ T4S, a_i, __ S, 1); + + __ andr(middle, __ T16B, B, mask_32_vec); + __ ushr(tmp, __ T2D, A, 32); + __ addv(middle, __ T2D, middle, tmp); + __ addv(middle, __ T2D, middle, C); + + __ shl(low_23, __ T2D, middle, 32); + __ andr(tmp, __ T16B, A, mask_32_vec); + __ orr(low_23, __ T16B, low_23, tmp); + + __ ushr(high_23, __ T2D, middle, 32); + __ addv(high_23, __ T2D, high_23, D); + __ ushr(tmp, __ T2D, B, 32); + __ addv(high_23, __ T2D, high_23, tmp); __ shl(high_23, __ T2D, high_23, shift1); __ ushr(tmp, __ T2D, low_23, shift2); __ orr(high_23, __ T16B, high_23, tmp); - __ andr(low_23, __ T2D, low_23, limb_mask); - - __ ldr(b_j, Address(b, 32)); - __ mul(mul_tmp, a_i, b_j); - __ mov(low_4x, Assembler::D, 0, mul_tmp); - __ umulh(mul_tmp, a_i, b_j); - __ mov(high_4x, Assembler::D, 0, mul_tmp); + __ andr(low_23, __ T16B, low_23, limb_mask); + // 
Finish computing high_4x __ shl(high_4x, __ T2D, high_4x, shift1); __ ushr(tmp, __ T2D, low_4x, shift2); __ orr(high_4x, __ T16B, high_4x, tmp); - __ andr(low_4x, __ T2D, low_4x, limb_mask); - - // Load c_i and perform + __ andr(low_4x, __ T16B, low_4x, limb_mask); + // low_0 += c_i // n = low_0 & limb_mask __ eor(c_01, __ T2D, c_01, c_01); __ ld1(c_01, __ D, 0, c_ptr); __ addv(low_01, __ T2D, low_01, c_01); - __ mov(n, low_01, __ D, 0); - __ andr(n, n, limb_mask_scalar); - - // Iterate through the modulus, multiplying each limb by n and + __ andr(n, __ T16B, low_01, limb_mask); + + // Iterate through the modulus, multiplying each limb by n and // storing low and high parts in separate vectors. // Compute high += modmul_high << shift1 | (modmul_low >>> shift2); // and low += modmul_low & LIMB_MASK - __ ldr(mod_j, Address(mod_ptr)); - __ mul(mul_tmp, n, mod_j); - __ mov(modmul_low, Assembler::D, 0, mul_tmp); - __ umulh(mul_tmp, n, mod_j); - __ mov(modmul_high, Assembler::D, 0, mul_tmp); - __ ldr(mod_j, Address(mod_ptr, 8)); - __ mul(mul_tmp, n, mod_j); - __ mov(modmul_low, Assembler::D, 1, mul_tmp); - __ umulh(mul_tmp, n, mod_j); - __ mov(modmul_high, Assembler::D, 1, mul_tmp); + // Calculate modmul_low and modmul_high for modulus[0] and modulus[1] + __ umullv(A, __ T2D, mod_lows, __ T2S, n, __ S, 0); + __ umullv(B, __ T2D, mod_highs, __ T2S, n, __ S, 0); + __ umullv(C, __ T2D, mod_lows, __ T2S, n, __ S, 1); + __ umullv(D, __ T2D, mod_highs, __ T2S, n, __ S, 1); + + __ andr(middle, __ T16B, B, mask_32_vec); + __ ushr(tmp, __ T2D, A, 32); + __ addv(middle, __ T2D, middle, tmp); + __ addv(middle, __ T2D, middle, C); + + __ shl(modmul_low, __ T2D, middle, 32); + __ andr(tmp, __ T16B, A, mask_32_vec); + __ orr(modmul_low, __ T16B, modmul_low, tmp); + + __ ushr(modmul_high, __ T2D, middle, 32); + __ addv(modmul_high, __ T2D, modmul_high, D); + __ ushr(tmp, __ T2D, B, 32); + __ addv(modmul_high, __ T2D, modmul_high, tmp); __ shl(modmul_high, __ T2D, modmul_high, shift1); __ ushr(tmp, __ T2D, modmul_low, shift2); __ orr(modmul_high, __ T16B, modmul_high, tmp); __ addv(high_01, __ T2D, high_01, modmul_high); - __ andr(modmul_low, __ T2D, modmul_low, limb_mask); + __ andr(modmul_low, __ T16B, modmul_low, limb_mask); __ addv(low_01, __ T2D, low_01, modmul_low); - __ ldr(mod_j, Address(mod_ptr, 16)); - __ mul(mul_tmp, n, mod_j); - __ mov(modmul_low, Assembler::D, 0, mul_tmp); - __ umulh(mul_tmp, n, mod_j); - __ mov(modmul_high, Assembler::D, 0, mul_tmp); + // Calculate modmul_low and modmul_high for modulus[3] and modulus[4]. 
+ // Can omit modulus[2] since it is 0 + __ umull2v(A, __ T2D, mod_lows, __ T4S, n, __ S, 0); + __ umull2v(B, __ T2D, mod_highs, __ T4S, n, __ S, 0); + __ umull2v(C, __ T2D, mod_lows, __ T4S, n, __ S, 1); + __ umull2v(D, __ T2D, mod_highs, __ T4S, n, __ S, 1); - __ ldr(mod_j, Address(mod_ptr, 24)); - __ mul(mul_tmp, n, mod_j); - __ mov(modmul_low, Assembler::D, 1, mul_tmp); - __ umulh(mul_tmp, n, mod_j); - __ mov(modmul_high, Assembler::D, 1, mul_tmp); + __ andr(middle, __ T16B, B, mask_32_vec); + __ ushr(tmp, __ T2D, A, 32); + __ addv(middle, __ T2D, middle, tmp); + __ addv(middle, __ T2D, middle, C); + + __ shl(modmul_low, __ T2D, middle, 32); + __ andr(tmp, __ T16B, A, mask_32_vec); + __ orr(modmul_low, __ T16B, modmul_low, tmp); + + __ ushr(modmul_high, __ T2D, middle, 32); + __ addv(modmul_high, __ T2D, modmul_high, D); + __ ushr(tmp, __ T2D, B, 32); + __ addv(modmul_high, __ T2D, modmul_high, tmp); __ shl(modmul_high, __ T2D, modmul_high, shift1); __ ushr(tmp, __ T2D, modmul_low, shift2); __ orr(modmul_high, __ T16B, modmul_high, tmp); - __ addv(high_23, __ T2D, high_23, modmul_high); - __ andr(modmul_low, __ T2D, modmul_low, limb_mask); - __ addv(low_23, __ T2D, low_23, modmul_low); + __ andr(modmul_low, __ T16B, modmul_low, limb_mask); - __ ldr(mod_j, Address(mod_ptr, 32)); - __ mul(mul_tmp, n, mod_j); - __ mov(modmul_low, Assembler::D, 0, mul_tmp); - __ umulh(mul_tmp, n, mod_j); - __ mov(modmul_high, Assembler::D, 0, mul_tmp); + //Need to shift around vectors to get right layout bc of no modulus[2] + __ ins(low_34, __ D, low_23, 0, 1); + __ ins(low_34, __ D, low_4x, 1, 0); + __ addv(low_34, __ T2D, low_34, modmul_low); - __ shl(modmul_high, __ T2D, modmul_high, shift1); - __ ushr(tmp, __ T2D, modmul_low, shift2); - __ orr(modmul_high, __ T16B, modmul_high, tmp); - __ addv(high_4x, __ T2D, high_4x, modmul_high); - __ andr(modmul_low, __ T2D, modmul_low, limb_mask); - __ addv(low_4x, __ T2D, low_4x, modmul_low); + __ eor(tmp, __ T16B, tmp, tmp); + __ ins(tmp, __ D, modmul_high, 1, 0); // tmp = [0, nn3] + __ addv(high_23, __ T2D, high_23, tmp); + __ ins(tmp, __ D, modmul_high, 0, 1); // tmp = [nn4, nn3] + __ addv(high_4x, __ T2D, high_4x, tmp); // Compute carry values // c_i+1 += low_1 + high_0 + (low_0 >>> shift2) @@ -7333,20 +7400,18 @@ class StubGenerator: public StubCodeGenerator { // Add high values to c __ addv(c_01, __ T2D, c_01, high_01); __ addv(c_23, __ T2D, c_23, high_23); + __ addv(c_23, __ T2D, c_23, low_34); // Reorder low vectors to enable simd ops - // clear tmp_4x and put low_0 in first lane __ ins(tmp, __ D, low_01, 0, 1); __ ins(tmp, __ D, low_23, 1, 0); __ addv(c_01, __ T2D, c_01, tmp); - __ ins(tmp, __ D, low_23, 0, 1); - __ ins(tmp, __ D, low_4x, 1, 0); - __ addv(c_23, __ T2D, c_23, tmp); + // clear tmp_4x and put low_0 in first lane // Shift low_0 and add to c_i+1 __ ushr(low_01, __ T2D, low_01, shift2); __ eor(tmp, __ T16B, tmp, tmp); //zero out tmp - __ ins(tmp, __ D, low_01, 0, 0); + __ ins(tmp, __ D, low_01, 0, 0); __ addv(c_01, __ T2D, c_01, tmp); // Write back carry values to stack @@ -7354,8 +7419,8 @@ class StubGenerator: public StubCodeGenerator { } // Final carry propagate and write result - - Register tmp_0 = r10; + Register mod_j = r3; // b_j is not used after loop + Register tmp = r6; // c_ptr is not used after loop Register c0 = r19; Register c1 = r20; Register c2 = r21; @@ -7374,14 +7439,14 @@ class StubGenerator: public StubCodeGenerator { // c8 += (c7 >>> BITS_PER_LIMB); // c9 += (c8 >>> BITS_PER_LIMB); - __ lsr(tmp_0, c5, shift2); - __ add(c6, 
c6, tmp_0); - __ lsr(tmp_0, c6, shift2); - __ add(c7, c7, tmp_0); - __ lsr(tmp_0, c7, shift2); - __ add(c8, c8, tmp_0); - __ lsr(tmp_0, c8, shift2); - __ add(c9, c9, tmp_0); + __ lsr(tmp, c5, shift2); + __ add(c6, c6, tmp); + __ lsr(tmp, c6, shift2); + __ add(c7, c7, tmp); + __ lsr(tmp, c7, shift2); + __ add(c8, c8, tmp); + __ lsr(tmp, c8, shift2); + __ add(c9, c9, tmp); __ andr(c5, c5, limb_mask_scalar); __ andr(c6, c6, limb_mask_scalar); @@ -7391,41 +7456,40 @@ class StubGenerator: public StubCodeGenerator { // c0 = c5 - modulus[0]; // c1 = c6 - modulus[1] + (c0 >> BITS_PER_LIMB); // c0 &= LIMB_MASK; - // c2 = c7 + (c1 >> BITS_PER_LIMB); + // c2 = c7 + (c1 >> BITS_PER_LIMB); // c1 &= LIMB_MASK; // c3 = c8 - modulus[3] + (c2 >> BITS_PER_LIMB); // c2 &= LIMB_MASK; // c4 = c9 - modulus[4] + (c3 >> BITS_PER_LIMB); // c3 &= LIMB_MASK; - __ ldr(mod_j, Address(mod_ptr)); __ sub(c0, c5, mod_j); __ ldr(mod_j, Address(mod_ptr, 8)); __ sub(c1, c6, mod_j); - __ asr(tmp_0, c0, shift2); - __ add(c1, c1, tmp_0); + __ asr(tmp, c0, shift2); + __ add(c1, c1, tmp); - __ ldr(mod_j, Address(mod_ptr, 16)); + // Modulus[2] is zero __ asr(c2, c1, shift2); __ add(c2, c2, c7); - __ ldr(mod_j, Address(mod_ptr, 24)); + __ ldr(mod_j, Address(mod_ptr, 16)); __ sub(c3, c8, mod_j); - __ asr(tmp_0, c2, shift2); - __ add(c3, c3, tmp_0); + __ asr(tmp, c2, shift2); + __ add(c3, c3, tmp); - __ ldr(mod_j, Address(mod_ptr, 32)); + __ ldr(mod_j, Address(mod_ptr, 24)); __ sub(c4, c9, mod_j); - __ asr(tmp_0, c3, shift2); - __ add(c4, c4, tmp_0); + __ asr(tmp, c3, shift2); + __ add(c4, c4, tmp); // Apply limb mask __ andr(c0, c0, limb_mask_scalar); __ andr(c1, c1, limb_mask_scalar); __ andr(c2, c2, limb_mask_scalar); __ andr(c3, c3, limb_mask_scalar); - + // Final write back // mask = c4 >> 63 // r[0] = ((c5 & mask) | (c0 & ~mask)); @@ -7434,46 +7498,45 @@ class StubGenerator: public StubCodeGenerator { // r[3] = ((c8 & mask) | (c3 & ~mask)); // r[4] = ((c9 & mask) | (c4 & ~mask)); - Register res_0 = r9; - Register res_1 = r10; - Register res_2 = r11; - Register res_3 = r12; - Register res_4 = r13; - Register mask = r14; - Register nmask = r15; - Register tmp_1 = r19; + Register res_0 = r11; + Register res_1 = r12; + Register res_2 = r13; + Register res_3 = r14; + Register res_4 = r15; + Register mask = r7; + Register nmask = r10; - RegSet res = RegSet::range(r9, r13); + RegSet res = RegSet::range(r11, r15); __ asr(mask, c4, 63); __ mvn(nmask, mask); __ andr(res_0, c5, mask); - __ andr(tmp_1, c0, nmask); - __ orr(res_0, res_0, tmp_1); + __ andr(tmp, c0, nmask); + __ orr(res_0, res_0, tmp); __ andr(res_1, c6, mask); - __ andr(tmp_1, c1, nmask); - __ orr(res_1, res_1, tmp_1); + __ andr(tmp, c1, nmask); + __ orr(res_1, res_1, tmp); __ andr(res_2, c7, mask); - __ andr(tmp_1, c2, nmask); - __ orr(res_2, res_2, tmp_1); + __ andr(tmp, c2, nmask); + __ orr(res_2, res_2, tmp); __ andr(res_3, c8, mask); - __ andr(tmp_1, c3, nmask); - __ orr(res_3, res_3, tmp_1); + __ andr(tmp, c3, nmask); + __ orr(res_3, res_3, tmp); __ andr(res_4, c9, mask); - __ andr(tmp_1, c4, nmask); - __ orr(res_4, res_4, tmp_1); + __ andr(tmp, c4, nmask); + __ orr(res_4, res_4, tmp); __ str(res_0, result); __ str(res_1, Address(result, 8)); __ str(res_2, Address(result, 16)); __ str(res_3, Address(result, 24)); __ str(res_4, Address(result, 32)); - + // End intrinsic call __ pop(callee_saved, sp); __ leave(); // required for proper stackwalking of RuntimeStub frame
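
Note on the UMULL partial-product scheme: Advanced SIMD has no 64x64-bit element multiply, so the stub loads each 64-bit limb as two 32-bit halves (via ld2 into the *_lows/*_highs vectors) and rebuilds each 128-bit product from the four UMULL/UMULL2 partial products A, B, C and D, recombining them through the "middle" vector. Below is a minimal scalar sketch of that recombination, not part of the patch; it assumes a GCC/Clang-style __int128 for the reference check, and mul64x64 plus the sample values are illustrative names only.

    // Scalar model of the vectorized 64x64->128 multiply used in the stub.
    // A, B, C, D and middle correspond to the NEON registers of the same
    // names above; this helper is illustrative and not part of the patch.
    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    static void mul64x64(uint64_t b_limb, uint64_t a_limb, uint64_t& lo, uint64_t& hi) {
      const uint64_t mask32 = 0xffffffffu;
      uint64_t A = (b_limb & mask32) * (a_limb & mask32); // lo(b) * lo(a)
      uint64_t B = (b_limb >> 32)    * (a_limb & mask32); // hi(b) * lo(a)
      uint64_t C = (b_limb & mask32) * (a_limb >> 32);    // lo(b) * hi(a)
      uint64_t D = (b_limb >> 32)    * (a_limb >> 32);    // hi(b) * hi(a)

      // middle cannot overflow: (2^32-1) + (2^32-2) + (2^32-1)^2 < 2^64
      uint64_t middle = (B & mask32) + (A >> 32) + C;

      lo = (middle << 32) | (A & mask32);
      hi = (middle >> 32) + (B >> 32) + D;
    }

    int main() {
      uint64_t b_limb = 0x000fedcba9876543ULL; // sample 52-bit limbs
      uint64_t a_limb = 0x000123456789abcdULL;
      uint64_t lo, hi;
      mul64x64(b_limb, a_limb, lo, hi);
      unsigned __int128 ref = (unsigned __int128)b_limb * a_limb;
      assert(lo == (uint64_t)ref && hi == (uint64_t)(ref >> 64));
      printf("hi=%016llx lo=%016llx\n", (unsigned long long)hi, (unsigned long long)lo);
      return 0;
    }

The stub performs this recombination on two limbs per T2D vector, then folds each 128-bit product into the 52-bit limb representation with high = (hi << shift1) | (lo >> shift2) and lo &= LIMB_MASK, as in the comments above.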