8296411: AArch64: Accelerated Poly1305 intrinsics

Reviewed-by: redestad, adinn
Andrew Haley 2023-06-02 14:28:44 +00:00
parent 59d9d9fcb9
commit dc21e8aa83
4 changed files with 195 additions and 1 deletion


@@ -7046,6 +7046,171 @@ class StubGenerator: public StubCodeGenerator {
return start;
}
// In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
// are represented as long[5], with BITS_PER_LIMB = 26.
// Pack five 26-bit limbs into three 64-bit registers.
void pack_26(Register dest0, Register dest1, Register dest2, Register src) {
__ ldp(dest0, rscratch1, Address(src, 0)); // 26 bits
__ add(dest0, dest0, rscratch1, Assembler::LSL, 26); // 26 bits
__ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong)));
__ add(dest0, dest0, rscratch1, Assembler::LSL, 52); // 12 bits
__ add(dest1, zr, rscratch1, Assembler::LSR, 12); // 14 bits
__ add(dest1, dest1, rscratch2, Assembler::LSL, 14); // 26 bits
__ ldr(rscratch1, Address(src, 4 * sizeof (jlong)));
__ add(dest1, dest1, rscratch1, Assembler::LSL, 40); // 24 bits
if (dest2->is_valid()) {
__ add(dest2, zr, rscratch1, Assembler::LSR, 24); // 2 bits
} else {
#ifdef ASSERT
Label OK;
__ cmp(zr, rscratch1, Assembler::LSR, 24); // 2 bits
__ br(__ EQ, OK);
__ stop("high bits of Poly1305 integer should be zero");
__ should_not_reach_here();
__ bind(OK);
#endif
}
}
// As above, but return only a 128-bit integer, packed into two
// 64-bit registers.
void pack_26(Register dest0, Register dest1, Register src) {
pack_26(dest0, dest1, noreg, src);
}
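To see what these helpers compute, here is a plain-C sketch of the same packing (an illustration only, not part of the patch; it assumes the limb layout described in the comment above): five 26-bit limbs l[0..4] are recombined into one little-endian 130-bit value spread over two or three 64-bit words.
#include <cstdint>
static void pack_26_model(uint64_t &d0, uint64_t &d1, uint64_t &d2,
                          const uint64_t l[5]) {
  d0 = l[0] | (l[1] << 26) | (l[2] << 52);         // bits 0..63
  d1 = (l[2] >> 12) | (l[3] << 14) | (l[4] << 40); // bits 64..127
  d2 = l[4] >> 24;                                 // bits 128..129
}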
// Multiply and multiply-accumulate unsigned 64-bit registers.
void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
__ mul(prod_lo, n, m);
__ umulh(prod_hi, n, m);
}
void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) {
wide_mul(rscratch1, rscratch2, n, m);
__ adds(sum_lo, sum_lo, rscratch1);
__ adc(sum_hi, sum_hi, rscratch2);
}
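The two helpers above are a 64x64-to-128-bit multiply and multiply-accumulate; a C-level model (illustration only, assuming compiler support for the non-standard unsigned __int128 type) makes the register roles explicit:
#include <cstdint>
static void wide_mul_model(uint64_t &lo, uint64_t &hi, uint64_t n, uint64_t m) {
  unsigned __int128 p = (unsigned __int128)n * m;
  lo = (uint64_t)p;          // mul:   low 64 bits of the product
  hi = (uint64_t)(p >> 64);  // umulh: high 64 bits of the product
}
static void wide_madd_model(uint64_t &lo, uint64_t &hi, uint64_t n, uint64_t m) {
  unsigned __int128 s = ((unsigned __int128)hi << 64 | lo)
                      + (unsigned __int128)n * m;
  lo = (uint64_t)s;          // adds
  hi = (uint64_t)(s >> 64);  // adc
}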
// Poly1305, RFC 7539
// See https://loup-vaillant.fr/tutorials/poly1305-design for a
// description of the tricks used to simplify and accelerate this
// computation.
address generate_poly1305_processBlocks() {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", "poly1305_processBlocks");
address start = __ pc();
Label here;
__ enter();
RegSet callee_saved = RegSet::range(r19, r28);
__ push(callee_saved, sp);
RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin();
// Arguments
const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs;
// R_n is the 128-bit randomly-generated key, packed into two
// registers. The caller passes this key to us as long[5], with
// BITS_PER_LIMB = 26.
const Register R_0 = *++regs, R_1 = *++regs;
pack_26(R_0, R_1, r_start);
// RR_n is (R_n >> 2) * 5
const Register RR_0 = *++regs, RR_1 = *++regs;
__ lsr(RR_0, R_0, 2);
__ add(RR_0, RR_0, RR_0, Assembler::LSL, 2);
__ lsr(RR_1, R_1, 2);
__ add(RR_1, RR_1, RR_1, Assembler::LSL, 2);
// U_n is the current checksum
const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
pack_26(U_0, U_1, U_2, acc_start);
static constexpr int BLOCK_LENGTH = 16;
Label DONE, LOOP;
__ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
__ br(Assembler::LT, DONE); {
__ bind(LOOP);
// S_n is to be the sum of U_n and the next block of data
const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
__ ldp(S_0, S_1, __ post(input_start, 2 * wordSize));
__ adds(S_0, U_0, S_0);
__ adcs(S_1, U_1, S_1);
__ adc(S_2, U_2, zr);
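// Add the Poly1305 padding bit: a full 16-byte block is processed as a
// 129-bit number with an extra 1 appended at bit 128 (RFC 7539).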
__ add(S_2, S_2, 1);
const Register U_0HI = *++regs, U_1HI = *++regs;
// NB: this logic depends on some of the special properties of
// Poly1305 keys. In particular, because we know that the top
// four bits of R_0 and R_1 are zero, we can add together
// partial products without any risk of needing to propagate a
// carry out.
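// The folding itself relies on 2^130 == 5 (mod 2^130 - 5). Key clamping
// makes R_1 a multiple of 4, so R_1 * 2^128 == RR_1 and R_1 * 2^192 ==
// RR_1 * 2^64 (mod 2^130 - 5). R_0 is not a multiple of 4, so
// R_0 * 2^128 == RR_0 + (R_0 & 3) * 2^128; that last piece is the
// S_2 * (R_0 & 3) term computed into U_2 below.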
wide_mul(U_0, U_0HI, S_0, R_0); wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0);
wide_mul(U_1, U_1HI, S_0, R_1); wide_madd(U_1, U_1HI, S_1, R_0); wide_madd(U_1, U_1HI, S_2, RR_1);
__ andr(U_2, R_0, 3);
__ mul(U_2, S_2, U_2);
// Recycle registers S_0, S_1, S_2
regs = (regs.remaining() + S_0 + S_1 + S_2).begin();
// Partial reduction mod 2**130 - 5
__ adds(U_1, U_0HI, U_1);
__ adc(U_2, U_1HI, U_2);
// Sum now in U_2:U_1:U_0.
// Dead: U_0HI, U_1HI.
regs = (regs.remaining() + U_0HI + U_1HI).begin();
// U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps
// First, U_2:U_1:U_0 += (U_2 >> 2)
__ lsr(rscratch1, U_2, 2);
__ andr(U_2, U_2, (u8)3);
__ adds(U_0, U_0, rscratch1);
__ adcs(U_1, U_1, zr);
__ adc(U_2, U_2, zr);
// Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
__ adds(U_0, U_0, rscratch1, Assembler::LSL, 2);
__ adcs(U_1, U_1, zr);
__ adc(U_2, U_2, zr);
__ sub(length, length, checked_cast<u1>(BLOCK_LENGTH));
__ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
__ br(~ Assembler::LT, LOOP);
}
// Further reduce modulo 2^130 - 5
__ lsr(rscratch1, U_2, 2);
__ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5
__ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5
__ adcs(U_1, U_1, zr);
__ andr(U_2, U_2, (u1)3);
__ adc(U_2, U_2, zr);
// Unpack the sum into five 26-bit limbs and write to memory.
__ ubfiz(rscratch1, U_0, 0, 26);
__ ubfx(rscratch2, U_0, 26, 26);
__ stp(rscratch1, rscratch2, Address(acc_start));
__ ubfx(rscratch1, U_0, 52, 12);
__ bfi(rscratch1, U_1, 12, 14);
__ ubfx(rscratch2, U_1, 14, 26);
__ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong)));
__ ubfx(rscratch1, U_1, 40, 24);
__ bfi(rscratch1, U_2, 24, 3);
__ str(rscratch1, Address(acc_start, 4 * sizeof (jlong)));
__ bind(DONE);
__ pop(callee_saved, sp);
__ leave();
__ ret(lr);
return start;
}
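For reference, the sketch below (an illustration only, not part of the patch; hypothetical names, assuming compiler support for unsigned __int128 and a little-endian host) processes one 16-byte block the same way the stub does: add the block plus the 2^128 padding bit to the accumulator, multiply by the clamped key R with the high-order terms folded via RR_n = (R_n >> 2) * 5, then apply the same partial reduction modulo 2^130 - 5.
#include <cstdint>
#include <cstring>
typedef unsigned __int128 u128;
// u[3] = 130-bit accumulator, r[2] = clamped 128-bit key, block = 16 bytes.
static void poly1305_block_model(uint64_t u[3], const uint64_t r[2],
                                 const uint8_t block[16]) {
  const uint64_t rr0 = (r[0] >> 2) * 5;   // RR_0
  const uint64_t rr1 = (r[1] >> 2) * 5;   // RR_1
  // S = U + block + 2^128 (the padding bit)
  uint64_t b0, b1;
  memcpy(&b0, block, 8);                  // little-endian loads, as ldp does
  memcpy(&b1, block + 8, 8);
  u128 t = (u128)u[0] + b0;
  uint64_t s0 = (uint64_t)t;
  t = (u128)u[1] + b1 + (t >> 64);
  uint64_t s1 = (uint64_t)t;
  uint64_t s2 = u[2] + (uint64_t)(t >> 64) + 1;
  // 128-bit column sums, high-order terms already folded through RR_n
  u128 c0 = (u128)s0 * r[0] + (u128)s1 * rr1 + (u128)s2 * rr0;
  u128 c1 = (u128)s0 * r[1] + (u128)s1 * r[0] + (u128)s2 * rr1;
  uint64_t u2 = s2 * (r[0] & 3);
  // Partial reduction mod 2^130 - 5
  c1 += (uint64_t)(c0 >> 64);
  u2 += (uint64_t)(c1 >> 64);
  uint64_t u0 = (uint64_t)c0, u1 = (uint64_t)c1;
  uint64_t f = u2 >> 2;                   // fold (U_2 >> 2) * 5 back in
  u2 &= 3;
  t = (u128)u0 + f;          u0 = (uint64_t)t;
  t = (u128)u1 + (t >> 64);  u1 = (uint64_t)t;  u2 += (uint64_t)(t >> 64);
  t = (u128)u0 + (f << 2);   u0 = (uint64_t)t;
  t = (u128)u1 + (t >> 64);  u1 = (uint64_t)t;  u2 += (uint64_t)(t >> 64);
  u[0] = u0; u[1] = u1; u[2] = u2;
}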
#if INCLUDE_JFR
static void jfr_prologue(address the_pc, MacroAssembler* _masm, Register thread) {
@@ -8133,6 +8298,10 @@ class StubGenerator: public StubCodeGenerator {
StubRoutines::aarch64::_spin_wait = generate_spin_wait();
if (UsePoly1305Intrinsics) {
StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
}
#if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
generate_atomic_entry_points();


@@ -565,6 +565,10 @@ void VM_Version::initialize() {
if (FLAG_IS_DEFAULT(AlignVector)) {
AlignVector = AvoidUnalignedAccesses;
}
if (FLAG_IS_DEFAULT(UsePoly1305Intrinsics)) {
FLAG_SET_DEFAULT(UsePoly1305Intrinsics, true);
}
#endif
_spin_wait = get_spin_wait_desc();


@@ -184,6 +184,10 @@ public:
return *this;
}
RegSetIterator<RegImpl>& operator=(const RegSetIterator<RegImpl>& mit) {
_regs = mit._regs;
return *this;
}
bool operator==(const RegSetIterator& rhs) const {
return _regs.bits() == rhs._regs.bits();
}
@@ -194,6 +198,10 @@ public:
RegImpl operator*() {
return _regs.first();
}
AbstractRegSet<RegImpl> remaining() const {
return _regs;
}
};
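The new copy-assignment operator and remaining() are what let the Poly1305 stub above hand registers back to its allocator mid-function; a minimal sketch of that idiom (hypothetical register choices, restating the pattern used for S_0..S_2 in the stub):
// Take registers from an iterator, then return one to the pool once it is dead.
RegSetIterator<Register> regs = (RegSet::range(r19, r28) - rscratch1).begin();
Register a = *regs, b = *++regs;         // allocate two registers
// ... use a and b ...
regs = (regs.remaining() + a).begin();   // a is dead again: recycle it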
template <class RegImpl>


@@ -82,9 +82,11 @@ public class Poly1305DigestBench extends CryptoBase {
}
@Setup
public void setup() throws Throwable {
setupProvider();
data = fillRandom(new byte[SET_SIZE][dataSize]);
byte[] d = data[0];
polyEngineInit.invoke(polyObj, new SecretKeySpec(d, 0, 32, "Poly1305"), null);
}
@Benchmark
@@ -100,6 +102,17 @@ public class Poly1305DigestBench extends CryptoBase {
}
}
@Benchmark
public void updateBytes() {
try {
byte[] d = data[index];
// index = (index +1) % SET_SIZE;
polyEngineUpdate.invoke(polyObj, d, 0, d.length);
} catch (Throwable ex) {
throw new RuntimeException(ex);
}
}
@Benchmark
public byte[] digestBuffer() {
try {