mirror of
https://github.com/openjdk/jdk.git
synced 2026-02-22 16:26:05 +00:00
8296411: AArch64: Accelerated Poly1305 intrinsics
Reviewed-by: redestad, adinn
This commit is contained in:
parent
59d9d9fcb9
commit
dc21e8aa83
@ -7046,6 +7046,171 @@ class StubGenerator: public StubCodeGenerator {
|
||||
return start;
|
||||
}
|
||||
|
||||
// In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
|
||||
// are represented as long[5], with BITS_PER_LIMB = 26.
|
||||
// Pack five 26-bit limbs into three 64-bit registers.
|
||||
void pack_26(Register dest0, Register dest1, Register dest2, Register src) {
|
||||
__ ldp(dest0, rscratch1, Address(src, 0)); // 26 bits
|
||||
__ add(dest0, dest0, rscratch1, Assembler::LSL, 26); // 26 bits
|
||||
__ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong)));
|
||||
__ add(dest0, dest0, rscratch1, Assembler::LSL, 52); // 12 bits
|
||||
|
||||
__ add(dest1, zr, rscratch1, Assembler::LSR, 12); // 14 bits
|
||||
__ add(dest1, dest1, rscratch2, Assembler::LSL, 14); // 26 bits
|
||||
__ ldr(rscratch1, Address(src, 4 * sizeof (jlong)));
|
||||
__ add(dest1, dest1, rscratch1, Assembler::LSL, 40); // 24 bits
|
||||
|
||||
if (dest2->is_valid()) {
|
||||
__ add(dest2, zr, rscratch1, Assembler::LSR, 24); // 2 bits
|
||||
} else {
|
||||
#ifdef ASSERT
|
||||
Label OK;
|
||||
__ cmp(zr, rscratch1, Assembler::LSR, 24); // 2 bits
|
||||
__ br(__ EQ, OK);
|
||||
__ stop("high bits of Poly1305 integer should be zero");
|
||||
__ should_not_reach_here();
|
||||
__ bind(OK);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
// As above, but return only a 128-bit integer, packed into two
|
||||
// 64-bit registers.
|
||||
void pack_26(Register dest0, Register dest1, Register src) {
|
||||
pack_26(dest0, dest1, noreg, src);
|
||||
}
|
||||
|
||||
// Multiply and multiply-accumulate unsigned 64-bit registers.
|
||||
void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
|
||||
__ mul(prod_lo, n, m);
|
||||
__ umulh(prod_hi, n, m);
|
||||
}
|
||||
void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) {
|
||||
wide_mul(rscratch1, rscratch2, n, m);
|
||||
__ adds(sum_lo, sum_lo, rscratch1);
|
||||
__ adc(sum_hi, sum_hi, rscratch2);
|
||||
}
|
||||
|
||||
// Poly1305, RFC 7539
|
||||
|
||||
// See https://loup-vaillant.fr/tutorials/poly1305-design for a
|
||||
// description of the tricks used to simplify and accelerate this
|
||||
// computation.
|
||||
|
||||
address generate_poly1305_processBlocks() {
|
||||
__ align(CodeEntryAlignment);
|
||||
StubCodeMark mark(this, "StubRoutines", "poly1305_processBlocks");
|
||||
address start = __ pc();
|
||||
Label here;
|
||||
__ enter();
|
||||
RegSet callee_saved = RegSet::range(r19, r28);
|
||||
__ push(callee_saved, sp);
|
||||
|
||||
RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin();
|
||||
|
||||
// Arguments
|
||||
const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs;
|
||||
|
||||
// R_n is the 128-bit randomly-generated key, packed into two
|
||||
// registers. The caller passes this key to us as long[5], with
|
||||
// BITS_PER_LIMB = 26.
|
||||
const Register R_0 = *++regs, R_1 = *++regs;
|
||||
pack_26(R_0, R_1, r_start);
|
||||
|
||||
// RR_n is (R_n >> 2) * 5
|
||||
const Register RR_0 = *++regs, RR_1 = *++regs;
|
||||
__ lsr(RR_0, R_0, 2);
|
||||
__ add(RR_0, RR_0, RR_0, Assembler::LSL, 2);
|
||||
__ lsr(RR_1, R_1, 2);
|
||||
__ add(RR_1, RR_1, RR_1, Assembler::LSL, 2);
|
||||
|
||||
// U_n is the current checksum
|
||||
const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
|
||||
pack_26(U_0, U_1, U_2, acc_start);
|
||||
|
||||
static constexpr int BLOCK_LENGTH = 16;
|
||||
Label DONE, LOOP;
|
||||
|
||||
__ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
|
||||
__ br(Assembler::LT, DONE); {
|
||||
__ bind(LOOP);
|
||||
|
||||
// S_n is to be the sum of U_n and the next block of data
|
||||
const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
|
||||
__ ldp(S_0, S_1, __ post(input_start, 2 * wordSize));
|
||||
__ adds(S_0, U_0, S_0);
|
||||
__ adcs(S_1, U_1, S_1);
|
||||
__ adc(S_2, U_2, zr);
|
||||
__ add(S_2, S_2, 1);
|
||||
|
||||
const Register U_0HI = *++regs, U_1HI = *++regs;
|
||||
|
||||
// NB: this logic depends on some of the special properties of
|
||||
// Poly1305 keys. In particular, because we know that the top
|
||||
// four bits of R_0 and R_1 are zero, we can add together
|
||||
// partial products without any risk of needing to propagate a
|
||||
// carry out.
|
||||
wide_mul(U_0, U_0HI, S_0, R_0); wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0);
|
||||
wide_mul(U_1, U_1HI, S_0, R_1); wide_madd(U_1, U_1HI, S_1, R_0); wide_madd(U_1, U_1HI, S_2, RR_1);
|
||||
__ andr(U_2, R_0, 3);
|
||||
__ mul(U_2, S_2, U_2);
|
||||
|
||||
// Recycle registers S_0, S_1, S_2
|
||||
regs = (regs.remaining() + S_0 + S_1 + S_2).begin();
|
||||
|
||||
// Partial reduction mod 2**130 - 5
|
||||
__ adds(U_1, U_0HI, U_1);
|
||||
__ adc(U_2, U_1HI, U_2);
|
||||
// Sum now in U_2:U_1:U_0.
|
||||
// Dead: U_0HI, U_1HI.
|
||||
regs = (regs.remaining() + U_0HI + U_1HI).begin();
|
||||
|
||||
// U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps
|
||||
|
||||
// First, U_2:U_1:U_0 += (U_2 >> 2)
|
||||
__ lsr(rscratch1, U_2, 2);
|
||||
__ andr(U_2, U_2, (u8)3);
|
||||
__ adds(U_0, U_0, rscratch1);
|
||||
__ adcs(U_1, U_1, zr);
|
||||
__ adc(U_2, U_2, zr);
|
||||
// Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
|
||||
__ adds(U_0, U_0, rscratch1, Assembler::LSL, 2);
|
||||
__ adcs(U_1, U_1, zr);
|
||||
__ adc(U_2, U_2, zr);
|
||||
|
||||
__ sub(length, length, checked_cast<u1>(BLOCK_LENGTH));
|
||||
__ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
|
||||
__ br(~ Assembler::LT, LOOP);
|
||||
}
|
||||
|
||||
// Further reduce modulo 2^130 - 5
|
||||
__ lsr(rscratch1, U_2, 2);
|
||||
__ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5
|
||||
__ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5
|
||||
__ adcs(U_1, U_1, zr);
|
||||
__ andr(U_2, U_2, (u1)3);
|
||||
__ adc(U_2, U_2, zr);
|
||||
|
||||
// Unpack the sum into five 26-bit limbs and write to memory.
|
||||
__ ubfiz(rscratch1, U_0, 0, 26);
|
||||
__ ubfx(rscratch2, U_0, 26, 26);
|
||||
__ stp(rscratch1, rscratch2, Address(acc_start));
|
||||
__ ubfx(rscratch1, U_0, 52, 12);
|
||||
__ bfi(rscratch1, U_1, 12, 14);
|
||||
__ ubfx(rscratch2, U_1, 14, 26);
|
||||
__ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong)));
|
||||
__ ubfx(rscratch1, U_1, 40, 24);
|
||||
__ bfi(rscratch1, U_2, 24, 3);
|
||||
__ str(rscratch1, Address(acc_start, 4 * sizeof (jlong)));
|
||||
|
||||
__ bind(DONE);
|
||||
__ pop(callee_saved, sp);
|
||||
__ leave();
|
||||
__ ret(lr);
|
||||
|
||||
return start;
|
||||
}
|
||||
|
||||
#if INCLUDE_JFR
|
||||
|
||||
static void jfr_prologue(address the_pc, MacroAssembler* _masm, Register thread) {
|
||||
@ -8133,6 +8298,10 @@ class StubGenerator: public StubCodeGenerator {
|
||||
|
||||
StubRoutines::aarch64::_spin_wait = generate_spin_wait();
|
||||
|
||||
if (UsePoly1305Intrinsics) {
|
||||
StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
|
||||
}
|
||||
|
||||
#if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
|
||||
|
||||
generate_atomic_entry_points();
|
||||
|
||||
@ -565,6 +565,10 @@ void VM_Version::initialize() {
|
||||
if (FLAG_IS_DEFAULT(AlignVector)) {
|
||||
AlignVector = AvoidUnalignedAccesses;
|
||||
}
|
||||
|
||||
if (FLAG_IS_DEFAULT(UsePoly1305Intrinsics)) {
|
||||
FLAG_SET_DEFAULT(UsePoly1305Intrinsics, true);
|
||||
}
|
||||
#endif
|
||||
|
||||
_spin_wait = get_spin_wait_desc();
|
||||
|
||||
@ -184,6 +184,10 @@ public:
|
||||
return *this;
|
||||
}
|
||||
|
||||
RegSetIterator<RegImpl>& operator=(const RegSetIterator<RegImpl>& mit) {
|
||||
_regs= mit._regs;
|
||||
return *this;
|
||||
}
|
||||
bool operator==(const RegSetIterator& rhs) const {
|
||||
return _regs.bits() == rhs._regs.bits();
|
||||
}
|
||||
@ -194,6 +198,10 @@ public:
|
||||
RegImpl operator*() {
|
||||
return _regs.first();
|
||||
}
|
||||
|
||||
AbstractRegSet<RegImpl> remaining() const {
|
||||
return _regs;
|
||||
}
|
||||
};
|
||||
|
||||
template <class RegImpl>
|
||||
|
||||
@ -82,9 +82,11 @@ public class Poly1305DigestBench extends CryptoBase {
|
||||
}
|
||||
|
||||
@Setup
|
||||
public void setup() {
|
||||
public void setup() throws Throwable {
|
||||
setupProvider();
|
||||
data = fillRandom(new byte[SET_SIZE][dataSize]);
|
||||
byte[] d = data[0];
|
||||
polyEngineInit.invoke(polyObj, new SecretKeySpec(d, 0, 32, "Poly1305"), null);
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
@ -100,6 +102,17 @@ public class Poly1305DigestBench extends CryptoBase {
|
||||
}
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void updateBytes() {
|
||||
try {
|
||||
byte[] d = data[index];
|
||||
// index = (index +1) % SET_SIZE;
|
||||
polyEngineUpdate.invoke(polyObj, d, 0, d.length);
|
||||
} catch (Throwable ex) {
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public byte[] digestBuffer() {
|
||||
try {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user