diff --git a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp index f89b6e2d579..cae69ac4621 100644 --- a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp @@ -8654,6 +8654,123 @@ class StubGenerator: public StubCodeGenerator { return start; } + /** + * Arithmetic polynomial multiplication in Curve25519. The algorithm mimics + * the version in the IntegerPolynomial25519 class, including the use of all + * columns (no folding method). + * + * Arguments: + * + * Inputs: + * c_rarg0 - long[] aLimbs + * c_rarg1 - long[] bLimbs + * + * Output: + * c_rarg2 - long[] rLimbs result + */ + address generate_intpoly_mult_25519() { + StubId stub_id = StubId::stubgen_intpoly_mult_25519_id; + int entry_count = StubInfo::entry_count(stub_id); + assert(entry_count == 1, "sanity check"); + address start = load_archive_data(stub_id); + if (start != nullptr) { + return start; + } + __ align(CodeEntryAlignment); + StubCodeMark mark(this, stub_id); + start = __ pc(); + __ enter(); + + // Register Map + const Register aLimbs = c_rarg0; // r0 + const Register bLimbs = c_rarg1; // r1 + const Register rLimbs = c_rarg2; // r2 + + Register c[] = {r3, r4, r5, r6, r7, r8, r9, r10, r11, r12}; + Register a = r13; + Register b = r14; + Register term = r15; + Register low = r16; + Register high = r17; + + const int32_t limbs = 5; + const int32_t bpl = 51; + const int32_t rem = 64 - bpl; + const int32_t TERM = 19; + const int32_t columns = limbs * 2; + const uint64_t mask = (uint64_t) -1 >> rem; + const uint64_t CARRY_ADD = (uint64_t) 1 << (bpl - 1); + + __ mov(term, TERM); + for (int i = 0; i < columns; i++) { + __ mov(c[i], zr); + } + + // Perform high/low multiplication with signed 5x51 bit limbs + for (int i = 0; i < limbs; i++) { + __ ldr(b, Address(bLimbs, i * 8)); + for (int j = 0; j < limbs; j++) { + __ ldr(a, Address(aLimbs, j * 8)); + __ smulh(high, a, b); + __ mul(low, a, b); + __ extr(high, high, low, bpl); + __ andr(low, low, mask); + __ add(c[i + j], c[i + j], low); + __ add(c[i + j + 1], c[i + j + 1], high); + } + } + + for (int i = 0; i < limbs; i++) { + __ mul(c[i + 5], c[i + 5], term); + __ add(c[i], c[i], c[i + 5]); + } + + // Carry-add with reduction from high limb + Register tmp = low; + Register carry_add = high; + __ mov(carry_add, CARRY_ADD); + + // Limb 3 + __ add(tmp, c[3], carry_add); + __ asr(tmp, tmp, bpl); + __ add(c[4], c[4], tmp); + __ lsl(tmp, tmp, bpl); + __ sub(c[3], c[3], tmp); + + // Limb 4 + __ add(tmp, c[4], carry_add); + __ asr(tmp, tmp, bpl); + + // Reduce high order limb and fold back into low order limb + __ mul(term, tmp, term); + __ add(c[0], c[0], term); + + __ lsl(tmp, tmp, bpl); + __ sub(c[4], c[4], tmp); + + // Limbs 0 - 3 + for (int i = 0; i < (limbs - 1); i++) { + __ add(tmp, c[i], carry_add); + __ asr(tmp, tmp, bpl); + __ add(c[i + 1], c[i + 1], tmp); + __ lsl(tmp, tmp, bpl); + __ sub(c[i], c[i], tmp); + } + + for (int i = 0; i < limbs; i++) { + __ str(c[i], Address(rLimbs, i * 8)); + } + + __ mov(r0, 0); + __ leave(); // required for proper stackwalking of RuntimeStub frame + __ ret(lr); + + // record the stub entry and end + store_archive_data(stub_id, start, __ pc()); + + return start; + } + void bcax5(Register a0, Register a1, Register a2, Register a3, Register a4, Register tmp0, Register tmp1, Register tmp2) { __ bic(tmp0, a2, a1); // for a0 @@ -13791,6 +13908,15 @@ class StubGenerator: public StubCodeGenerator { StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks(); } + // The difference between AArch64 vs. x86_64 intrinsics implementation + // include the lack of square() intrinsics; usage caused a 3.3% performance + // degradation due to the efficiencies of the symmetric squaring shape in + // Java vs. the inefficiencies of the leaf calls and the additional cycles + // required for 64 bit multiplication in AArch64. + if (UseIntPoly25519Intrinsics) { + StubRoutines::_intpoly_mult_25519 = generate_intpoly_mult_25519(); + } + // generate Adler32 intrinsics code if (UseAdler32Intrinsics) { StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32(); diff --git a/src/hotspot/cpu/aarch64/vm_version_aarch64.cpp b/src/hotspot/cpu/aarch64/vm_version_aarch64.cpp index e746447e013..5462ccf2a76 100644 --- a/src/hotspot/cpu/aarch64/vm_version_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/vm_version_aarch64.cpp @@ -661,6 +661,10 @@ void VM_Version::initialize() { FLAG_SET_DEFAULT(UsePoly1305Intrinsics, true); } + if (FLAG_IS_DEFAULT(UseIntPoly25519Intrinsics)) { + FLAG_SET_DEFAULT(UseIntPoly25519Intrinsics, true); + } + if (FLAG_IS_DEFAULT(UseVectorizedHashCodeIntrinsic)) { FLAG_SET_DEFAULT(UseVectorizedHashCodeIntrinsic, true); } diff --git a/src/hotspot/share/code/aotCodeCache.hpp b/src/hotspot/share/code/aotCodeCache.hpp index 448bab6fbc2..c65b9cb23d1 100644 --- a/src/hotspot/share/code/aotCodeCache.hpp +++ b/src/hotspot/share/code/aotCodeCache.hpp @@ -291,6 +291,7 @@ public: do_var(bool, UseCRC32Intrinsics) \ do_var(bool, UseDilithiumIntrinsics) \ do_var(bool, UseGHASHIntrinsics) \ + do_var(bool, UseIntPoly25519Intrinsics) \ do_var(bool, UseKyberIntrinsics) \ do_var(bool, UseMD5Intrinsics) \ do_var(bool, UsePoly1305Intrinsics) \