diff --git a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp index 24930367319..a5113f994df 100644 --- a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp @@ -7046,6 +7046,171 @@ class StubGenerator: public StubCodeGenerator { return start; } + // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers + // are represented as long[5], with BITS_PER_LIMB = 26. + // Pack five 26-bit limbs into three 64-bit registers. + void pack_26(Register dest0, Register dest1, Register dest2, Register src) { + __ ldp(dest0, rscratch1, Address(src, 0)); // 26 bits + __ add(dest0, dest0, rscratch1, Assembler::LSL, 26); // 26 bits + __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong))); + __ add(dest0, dest0, rscratch1, Assembler::LSL, 52); // 12 bits + + __ add(dest1, zr, rscratch1, Assembler::LSR, 12); // 14 bits + __ add(dest1, dest1, rscratch2, Assembler::LSL, 14); // 26 bits + __ ldr(rscratch1, Address(src, 4 * sizeof (jlong))); + __ add(dest1, dest1, rscratch1, Assembler::LSL, 40); // 24 bits + + if (dest2->is_valid()) { + __ add(dest2, zr, rscratch1, Assembler::LSR, 24); // 2 bits + } else { +#ifdef ASSERT + Label OK; + __ cmp(zr, rscratch1, Assembler::LSR, 24); // 2 bits + __ br(__ EQ, OK); + __ stop("high bits of Poly1305 integer should be zero"); + __ should_not_reach_here(); + __ bind(OK); +#endif + } + } + + // As above, but return only a 128-bit integer, packed into two + // 64-bit registers. + void pack_26(Register dest0, Register dest1, Register src) { + pack_26(dest0, dest1, noreg, src); + } + + // Multiply and multiply-accumulate unsigned 64-bit registers. + void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) { + __ mul(prod_lo, n, m); + __ umulh(prod_hi, n, m); + } + void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) { + wide_mul(rscratch1, rscratch2, n, m); + __ adds(sum_lo, sum_lo, rscratch1); + __ adc(sum_hi, sum_hi, rscratch2); + } + + // Poly1305, RFC 7539 + + // See https://loup-vaillant.fr/tutorials/poly1305-design for a + // description of the tricks used to simplify and accelerate this + // computation. + + address generate_poly1305_processBlocks() { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "poly1305_processBlocks"); + address start = __ pc(); + Label here; + __ enter(); + RegSet callee_saved = RegSet::range(r19, r28); + __ push(callee_saved, sp); + + RegSetIterator regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin(); + + // Arguments + const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs; + + // R_n is the 128-bit randomly-generated key, packed into two + // registers. The caller passes this key to us as long[5], with + // BITS_PER_LIMB = 26. + const Register R_0 = *++regs, R_1 = *++regs; + pack_26(R_0, R_1, r_start); + + // RR_n is (R_n >> 2) * 5 + const Register RR_0 = *++regs, RR_1 = *++regs; + __ lsr(RR_0, R_0, 2); + __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2); + __ lsr(RR_1, R_1, 2); + __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2); + + // U_n is the current checksum + const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs; + pack_26(U_0, U_1, U_2, acc_start); + + static constexpr int BLOCK_LENGTH = 16; + Label DONE, LOOP; + + __ cmp(length, checked_cast(BLOCK_LENGTH)); + __ br(Assembler::LT, DONE); { + __ bind(LOOP); + + // S_n is to be the sum of U_n and the next block of data + const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs; + __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize)); + __ adds(S_0, U_0, S_0); + __ adcs(S_1, U_1, S_1); + __ adc(S_2, U_2, zr); + __ add(S_2, S_2, 1); + + const Register U_0HI = *++regs, U_1HI = *++regs; + + // NB: this logic depends on some of the special properties of + // Poly1305 keys. In particular, because we know that the top + // four bits of R_0 and R_1 are zero, we can add together + // partial products without any risk of needing to propagate a + // carry out. + wide_mul(U_0, U_0HI, S_0, R_0); wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0); + wide_mul(U_1, U_1HI, S_0, R_1); wide_madd(U_1, U_1HI, S_1, R_0); wide_madd(U_1, U_1HI, S_2, RR_1); + __ andr(U_2, R_0, 3); + __ mul(U_2, S_2, U_2); + + // Recycle registers S_0, S_1, S_2 + regs = (regs.remaining() + S_0 + S_1 + S_2).begin(); + + // Partial reduction mod 2**130 - 5 + __ adds(U_1, U_0HI, U_1); + __ adc(U_2, U_1HI, U_2); + // Sum now in U_2:U_1:U_0. + // Dead: U_0HI, U_1HI. + regs = (regs.remaining() + U_0HI + U_1HI).begin(); + + // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps + + // First, U_2:U_1:U_0 += (U_2 >> 2) + __ lsr(rscratch1, U_2, 2); + __ andr(U_2, U_2, (u8)3); + __ adds(U_0, U_0, rscratch1); + __ adcs(U_1, U_1, zr); + __ adc(U_2, U_2, zr); + // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2 + __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2); + __ adcs(U_1, U_1, zr); + __ adc(U_2, U_2, zr); + + __ sub(length, length, checked_cast(BLOCK_LENGTH)); + __ cmp(length, checked_cast(BLOCK_LENGTH)); + __ br(~ Assembler::LT, LOOP); + } + + // Further reduce modulo 2^130 - 5 + __ lsr(rscratch1, U_2, 2); + __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5 + __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5 + __ adcs(U_1, U_1, zr); + __ andr(U_2, U_2, (u1)3); + __ adc(U_2, U_2, zr); + + // Unpack the sum into five 26-bit limbs and write to memory. + __ ubfiz(rscratch1, U_0, 0, 26); + __ ubfx(rscratch2, U_0, 26, 26); + __ stp(rscratch1, rscratch2, Address(acc_start)); + __ ubfx(rscratch1, U_0, 52, 12); + __ bfi(rscratch1, U_1, 12, 14); + __ ubfx(rscratch2, U_1, 14, 26); + __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong))); + __ ubfx(rscratch1, U_1, 40, 24); + __ bfi(rscratch1, U_2, 24, 3); + __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong))); + + __ bind(DONE); + __ pop(callee_saved, sp); + __ leave(); + __ ret(lr); + + return start; + } + #if INCLUDE_JFR static void jfr_prologue(address the_pc, MacroAssembler* _masm, Register thread) { @@ -8133,6 +8298,10 @@ class StubGenerator: public StubCodeGenerator { StubRoutines::aarch64::_spin_wait = generate_spin_wait(); + if (UsePoly1305Intrinsics) { + StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks(); + } + #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS) generate_atomic_entry_points(); diff --git a/src/hotspot/cpu/aarch64/vm_version_aarch64.cpp b/src/hotspot/cpu/aarch64/vm_version_aarch64.cpp index 26aaf75ab81..f33e64b3af0 100644 --- a/src/hotspot/cpu/aarch64/vm_version_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/vm_version_aarch64.cpp @@ -565,6 +565,10 @@ void VM_Version::initialize() { if (FLAG_IS_DEFAULT(AlignVector)) { AlignVector = AvoidUnalignedAccesses; } + + if (FLAG_IS_DEFAULT(UsePoly1305Intrinsics)) { + FLAG_SET_DEFAULT(UsePoly1305Intrinsics, true); + } #endif _spin_wait = get_spin_wait_desc(); diff --git a/src/hotspot/share/asm/register.hpp b/src/hotspot/share/asm/register.hpp index a1544281702..fab548ed36d 100644 --- a/src/hotspot/share/asm/register.hpp +++ b/src/hotspot/share/asm/register.hpp @@ -184,6 +184,10 @@ public: return *this; } + RegSetIterator& operator=(const RegSetIterator& mit) { + _regs= mit._regs; + return *this; + } bool operator==(const RegSetIterator& rhs) const { return _regs.bits() == rhs._regs.bits(); } @@ -194,6 +198,10 @@ public: RegImpl operator*() { return _regs.first(); } + + AbstractRegSet remaining() const { + return _regs; + } }; template diff --git a/test/micro/org/openjdk/bench/javax/crypto/full/Poly1305DigestBench.java b/test/micro/org/openjdk/bench/javax/crypto/full/Poly1305DigestBench.java index 15613557fb9..529ce9a2b32 100644 --- a/test/micro/org/openjdk/bench/javax/crypto/full/Poly1305DigestBench.java +++ b/test/micro/org/openjdk/bench/javax/crypto/full/Poly1305DigestBench.java @@ -82,9 +82,11 @@ public class Poly1305DigestBench extends CryptoBase { } @Setup - public void setup() { + public void setup() throws Throwable { setupProvider(); data = fillRandom(new byte[SET_SIZE][dataSize]); + byte[] d = data[0]; + polyEngineInit.invoke(polyObj, new SecretKeySpec(d, 0, 32, "Poly1305"), null); } @Benchmark @@ -100,6 +102,17 @@ public class Poly1305DigestBench extends CryptoBase { } } + @Benchmark + public void updateBytes() { + try { + byte[] d = data[index]; + // index = (index +1) % SET_SIZE; + polyEngineUpdate.invoke(polyObj, d, 0, d.length); + } catch (Throwable ex) { + throw new RuntimeException(ex); + } + } + @Benchmark public byte[] digestBuffer() { try {