mirror of
https://github.com/openjdk/jdk.git
synced 2026-04-19 03:17:40 +00:00
8134869: AARCH64: GHASH intrinsic is not optimal
Rewrite intrinsic to make better use of SIMD instructions Reviewed-by: kvn
This commit is contained in:
parent
7c7692856a
commit
ef62a6daab
@ -1210,7 +1210,7 @@ public:
|
||||
|
||||
INSN(ldrs, 0b00, 1);
|
||||
INSN(ldrd, 0b01, 1);
|
||||
INSN(ldrq, 0x10, 1);
|
||||
INSN(ldrq, 0b10, 1);
|
||||
|
||||
#undef INSN
|
||||
|
||||
@ -2285,13 +2285,13 @@ public:
|
||||
#undef INSN
|
||||
|
||||
// Table vector lookup
|
||||
#define INSN(NAME, op) \
|
||||
void NAME(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn, unsigned registers, FloatRegister Vm) { \
|
||||
starti; \
|
||||
assert(T == T8B || T == T16B, "invalid arrangement"); \
|
||||
assert(0 < registers && registers <= 4, "invalid number of registers"); \
|
||||
f(0, 31), f((int)T & 1, 30), f(0b001110000, 29, 21), rf(Vm, 16), f(0, 15); \
|
||||
f(registers - 1, 14, 13), f(op, 12),f(0b00, 11, 10), rf(Vn, 5), rf(Vd, 0); \
|
||||
#define INSN(NAME, op) \
|
||||
void NAME(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn, unsigned registers, FloatRegister Vm) { \
|
||||
starti; \
|
||||
assert(T == T8B || T == T16B, "invalid arrangement"); \
|
||||
assert(0 < registers && registers <= 4, "invalid number of registers"); \
|
||||
f(0, 31), f((int)T & 1, 30), f(0b001110000, 29, 21), rf(Vm, 16), f(0, 15); \
|
||||
f(registers - 1, 14, 13), f(op, 12),f(0b00, 11, 10), rf(Vn, 5), rf(Vd, 0); \
|
||||
}
|
||||
|
||||
INSN(tbl, 0);
|
||||
@ -2299,6 +2299,7 @@ public:
|
||||
|
||||
#undef INSN
|
||||
|
||||
// AdvSIMD two-reg misc
|
||||
#define INSN(NAME, U, opcode) \
|
||||
void NAME(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn) { \
|
||||
starti; \
|
||||
@ -2316,10 +2317,19 @@ public:
|
||||
|
||||
#define ASSERTION (T == T8B || T == T16B || T == T4H || T == T8H)
|
||||
INSN(rev32, 1, 0b00000);
|
||||
private:
|
||||
INSN(_rbit, 1, 0b00101);
|
||||
public:
|
||||
|
||||
#undef ASSERTION
|
||||
|
||||
#define ASSERTION (T == T8B || T == T16B)
|
||||
INSN(rev16, 0, 0b00001);
|
||||
// RBIT only allows T8B and T16B but encodes them oddly. Argh...
// Public wrapper for the vector RBIT instruction: it checks the
// arrangement and then forwards to the private _rbit emitter with a
// re-encoded arrangement value.
|
||||
void rbit(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn) {
|
||||
// ASSERTION is the (T == T8B || T == T16B) macro in effect at this point.
assert((ASSERTION), MSG);
|
||||
// '&' binds tighter than '|', so this is (T & 1) | 0b010: bit 0 of T is
// kept and the 0b010 pattern is forced into the value handed to _rbit.
// NOTE(review): bit 0 of T presumably distinguishes T8B from T16B --
// confirm against the SIMD_Arrangement enum.
_rbit(Vd, SIMD_Arrangement(T & 1 | 0b010), Vn);
|
||||
}
|
||||
#undef ASSERTION
|
||||
|
||||
#undef MSG
|
||||
|
||||
@ -2364,7 +2364,7 @@ class StubGenerator: public StubCodeGenerator {
|
||||
* c_rarg3 - int* table
|
||||
*
|
||||
* Output:
|
||||
* rax - int crc result
|
||||
* r0 - int crc result
|
||||
*/
|
||||
address generate_updateBytesCRC32C() {
|
||||
assert(UseCRC32CIntrinsics, "what are we doing here?");
|
||||
@ -2435,6 +2435,69 @@ class StubGenerator: public StubCodeGenerator {
|
||||
return start;
|
||||
}
|
||||
|
||||
// Emits the instruction sequence for a carry-less (polynomial)
// 128*128 -> 256-bit multiplication of the values held in a and b.
// The product is left in the register pair <result_hi:result_lo>.
// tmp1..tmp4 are all written as scratch; a, b and a1_xor_a0 are only
// used as source operands here.
void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
|
||||
FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
|
||||
FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
|
||||
// Karatsuba multiplication performs a 128*128 -> 256-bit
|
||||
// multiplication in three 128-bit multiplications and a few
|
||||
// additions.
|
||||
//
|
||||
// (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
|
||||
// (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
|
||||
//
|
||||
// Inputs:
|
||||
//
|
||||
// A0 in a.d[0] (subkey)
|
||||
// A1 in a.d[1]
|
||||
// (A1+A0) in a1_xor_a0.d[0]
|
||||
//
|
||||
// B0 in b.d[0] (state)
|
||||
// B1 in b.d[1]
//
// Outputs:
//
// upper 128 bits of the product in result_hi
// lower 128 bits of the product in result_lo
//
// "+" above is addition of polynomials over GF(2), i.e. XOR (eor).
|
||||
|
||||
// tmp1 = b with its two 64-bit halves swapped (EXT of b:b at byte 8);
// the eor two instructions later turns tmp1.d[0] into B1+B0.
__ ext(tmp1, __ T16B, b, b, 0x08);
|
||||
__ pmull2(result_hi, __ T1Q, b, a, __ T2D); // A1*B1
|
||||
__ eor(tmp1, __ T16B, tmp1, b); // (B1+B0)
|
||||
__ pmull(result_lo, __ T1Q, b, a, __ T1D); // A0*B0
|
||||
__ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
|
||||
|
||||
// tmp4 = C0:D1, i.e. the middle 128 bits of the concatenation
// result_hi:result_lo (EXT at byte 8).
__ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
|
||||
__ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
|
||||
// tmp2 = E + (C0:D1) + (C+D) = (C0+C1+D1+E1):(D1+C0+D0+E0), which are
// the two middle 64-bit words of the product in the formula above.
__ eor(tmp2, __ T16B, tmp2, tmp4);
|
||||
__ eor(tmp2, __ T16B, tmp2, tmp3);
|
||||
|
||||
// Register pair <result_hi:result_lo> holds the result of carry-less multiplication
// NOTE(review): the trailing ins() arguments are presumably
// (destination index, source index), so that result_hi.d[0] = tmp2.d[1]
// and result_lo.d[1] = tmp2.d[0]; that matches the word layout in the
// formula above -- confirm against the assembler's ins() declaration.
|
||||
__ ins(result_hi, __ D, tmp2, 0, 1);
|
||||
__ ins(result_lo, __ D, tmp2, 1, 0);
|
||||
}
|
||||
|
||||
// Emits the instruction sequence that reduces the 256-bit value <hi:lo>
// modulo the GCM field polynomial and leaves the 128-bit remainder in
// result.
//
// result - destination; also used as the scratch register t0
// lo, hi - lower and upper 128 bits of the value to reduce; both are
// overwritten
// p - the low-order bits of the field polynomial; the pmull2/pmull
// pair below reads it from both 64-bit halves
// z - supplies the zero bytes shifted in by the two ext instructions;
// NOTE(review): presumably must be all-zero on entry -- confirm
// against the caller
// t1 - scratch
//
// NOTE(review): result (as t0) is written by the first pmull2 while lo
// and hi are still needed afterwards, so result should not alias lo, hi,
// p or z.
void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
|
||||
FloatRegister p, FloatRegister z, FloatRegister t1) {
|
||||
const FloatRegister t0 = result;
|
||||
|
||||
// The GCM field polynomial f is z^128 + p(z), where p =
|
||||
// z^7+z^2+z+1.
|
||||
//
|
||||
// z^128 === -p(z) (mod (z^128 + p(z)))
|
||||
//
|
||||
// so, given that the product we're reducing is
|
||||
// a == lo + hi * z^128
|
||||
// substituting,
|
||||
// === lo - hi * p(z) (mod (z^128 + p(z)))
|
||||
//
|
||||
// we reduce by multiplying hi by p(z) and subtracting the result
|
||||
// from (i.e. XORing it with) lo. Because p has no nonzero high
|
||||
// bits we can do this with two 64-bit multiplications, lo*p and
|
||||
// hi*p.
//
// NOTE(review): in the code below the two multiplications take the
// upper 64-bit half of the register hi (pmull2) and then its lower
// 64-bit half (pmull); "lo*p and hi*p" above appears to refer to those
// two halves rather than to the registers lo and hi.
|
||||
|
||||
// t0 = (upper 64 bits of hi) * p
__ pmull2(t0, __ T1Q, hi, p, __ T2D);
|
||||
// t1 = upper half of t0 moved down to the low 64 bits, with bytes
// from z above it; XORing it in folds that part into hi.d[0].
__ ext(t1, __ T16B, t0, z, 8);
|
||||
__ eor(hi, __ T16B, hi, t1);
|
||||
// t1 = lower half of t0 moved up to the high 64 bits, with bytes
// from z below it; XORing it in folds that part into lo.d[1].
__ ext(t1, __ T16B, z, t0, 8);
|
||||
__ eor(lo, __ T16B, lo, t1);
|
||||
// t0 = (lower 64 bits of hi, including what was folded in above) * p
__ pmull(t0, __ T1Q, hi, p, __ T1D);
|
||||
// result = lo with the second partial product XORed in.
__ eor(result, __ T16B, lo, t0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Arguments:
|
||||
*
|
||||
@ -2448,10 +2511,27 @@ class StubGenerator: public StubCodeGenerator {
|
||||
* Updated state at c_rarg0
|
||||
*/
|
||||
address generate_ghash_processBlocks() {
|
||||
__ align(CodeEntryAlignment);
|
||||
Label L_ghash_loop, L_exit;
|
||||
// Bafflingly, GCM uses little-endian for the byte order, but
|
||||
// big-endian for the bit order. For example, the polynomial 1 is
|
||||
// represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
|
||||
//
|
||||
// So, we must either reverse the bytes in each word and do
|
||||
// everything big-endian or reverse the bits in each byte and do
|
||||
// it little-endian. On AArch64 it's more idiomatic to reverse
|
||||
// the bits in each byte (we have an instruction, RBIT, to do
|
||||
// that) and keep the data in little-endian bit order throughout the
|
||||
// calculation, bit-reversing the inputs and outputs.
|
||||
|
||||
StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
|
||||
__ align(wordSize * 2);
|
||||
address p = __ pc();
|
||||
__ emit_int64(0x87); // The low-order bits of the field
|
||||
// polynomial (i.e. p = z^7+z^2+z+1)
|
||||
// repeated in the low and high parts of a
|
||||
// 128-bit vector
|
||||
__ emit_int64(0x87);
|
||||
|
||||
__ align(CodeEntryAlignment);
|
||||
address start = __ pc();
|
||||
|
||||
Register state = c_rarg0;
|
||||
@ -2462,104 +2542,43 @@ class StubGenerator: public StubCodeGenerator {
|
||||
FloatRegister vzr = v30;
|
||||
__ eor(vzr, __ T16B, vzr, vzr); // zero register
|
||||
|
||||
__ mov(v26, __ T16B, 1);
|
||||
__ mov(v27, __ T16B, 63);
|
||||
__ mov(v28, __ T16B, 62);
|
||||
__ mov(v29, __ T16B, 57);
|
||||
__ ldrq(v0, Address(state));
|
||||
__ ldrq(v1, Address(subkeyH));
|
||||
|
||||
__ ldrq(v6, Address(state));
|
||||
__ ldrq(v16, Address(subkeyH));
|
||||
__ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH
|
||||
__ rbit(v0, __ T16B, v0);
|
||||
__ rev64(v1, __ T16B, v1);
|
||||
__ rbit(v1, __ T16B, v1);
|
||||
|
||||
__ ext(v0, __ T16B, v6, v6, 0x08);
|
||||
__ ext(v1, __ T16B, v16, v16, 0x08);
|
||||
__ eor(v16, __ T16B, v16, v1);
|
||||
__ ldrq(v26, p);
|
||||
|
||||
__ bind(L_ghash_loop);
|
||||
__ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
|
||||
__ eor(v16, __ T16B, v16, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
|
||||
|
||||
__ ldrq(v2, Address(__ post(data, 0x10)));
|
||||
__ rev64(v2, __ T16B, v2); // swap data
|
||||
{
|
||||
Label L_ghash_loop;
|
||||
__ bind(L_ghash_loop);
|
||||
|
||||
__ ext(v6, __ T16B, v0, v0, 0x08);
|
||||
__ eor(v6, __ T16B, v6, v2);
|
||||
__ ext(v2, __ T16B, v6, v6, 0x08);
|
||||
__ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
|
||||
// reversing each byte
|
||||
__ rbit(v2, __ T16B, v2);
|
||||
__ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state
|
||||
|
||||
__ pmull2(v7, __ T1Q, v2, v1, __ T2D); // A1*B1
|
||||
__ eor(v6, __ T16B, v6, v2);
|
||||
__ pmull(v5, __ T1Q, v2, v1, __ T1D); // A0*B0
|
||||
__ pmull(v20, __ T1Q, v6, v16, __ T1D); // (A1 + A0)(B1 + B0)
|
||||
// Multiply state in v2 by subkey in v1
|
||||
ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
|
||||
/*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
|
||||
/*temps*/v6, v20, v18, v21);
|
||||
// Reduce v7:v5 by the field polynomial
|
||||
ghash_reduce(v0, v5, v7, v26, vzr, v20);
|
||||
|
||||
__ ext(v21, __ T16B, v5, v7, 0x08);
|
||||
__ eor(v18, __ T16B, v7, v5); // A1*B1 xor A0*B0
|
||||
__ eor(v20, __ T16B, v20, v21);
|
||||
__ eor(v20, __ T16B, v20, v18);
|
||||
__ sub(blocks, blocks, 1);
|
||||
__ cbnz(blocks, L_ghash_loop);
|
||||
}
|
||||
|
||||
// Registers pair <v7:v5> holds the result of carry-less multiplication
|
||||
__ ins(v7, __ D, v20, 0, 1);
|
||||
__ ins(v5, __ D, v20, 1, 0);
|
||||
// The bit-reversed result is at this point in v0
|
||||
__ rev64(v1, __ T16B, v0);
|
||||
__ rbit(v1, __ T16B, v1);
|
||||
|
||||
// Result of the multiplication is shifted by one bit position
|
||||
// [X3:X2:X1:X0] = [X3:X2:X1:X0] << 1
|
||||
__ ushr(v18, __ T2D, v5, -63 & 63);
|
||||
__ ins(v25, __ D, v18, 1, 0);
|
||||
__ ins(v25, __ D, vzr, 0, 0);
|
||||
__ ushl(v5, __ T2D, v5, v26);
|
||||
__ orr(v5, __ T16B, v5, v25);
|
||||
|
||||
__ ushr(v19, __ T2D, v7, -63 & 63);
|
||||
__ ins(v19, __ D, v19, 1, 0);
|
||||
__ ins(v19, __ D, v18, 0, 1);
|
||||
__ ushl(v7, __ T2D, v7, v26);
|
||||
__ orr(v6, __ T16B, v7, v19);
|
||||
|
||||
__ ins(v24, __ D, v5, 0, 1);
|
||||
|
||||
// A = X0 << 63
|
||||
__ ushl(v21, __ T2D, v5, v27);
|
||||
|
||||
// A = X0 << 62
|
||||
__ ushl(v22, __ T2D, v5, v28);
|
||||
|
||||
// A = X0 << 57
|
||||
__ ushl(v23, __ T2D, v5, v29);
|
||||
|
||||
// D = X1^A^B^C
|
||||
__ eor(v21, __ T16B, v21, v22);
|
||||
__ eor(v21, __ T16B, v21, v23);
|
||||
__ eor(v21, __ T16B, v21, v24);
|
||||
__ ins(v5, __ D, v21, 1, 0);
|
||||
|
||||
// [E1:E0] = [D:X0] >> 1
|
||||
__ ushr(v20, __ T2D, v5, -1 & 63);
|
||||
__ ushl(v18, __ T2D, v5, v27);
|
||||
__ ext(v25, __ T16B, v18, vzr, 0x08);
|
||||
__ orr(v19, __ T16B, v20, v25);
|
||||
|
||||
__ eor(v7, __ T16B, v5, v19);
|
||||
|
||||
// [F1:F0] = [D:X0] >> 2
|
||||
__ ushr(v20, __ T2D, v5, -2 & 63);
|
||||
__ ushl(v18, __ T2D, v5, v28);
|
||||
__ ins(v25, __ D, v18, 0, 1);
|
||||
__ orr(v19, __ T16B, v20, v25);
|
||||
|
||||
__ eor(v7, __ T16B, v7, v19);
|
||||
|
||||
// [G1:G0] = [D:X0] >> 7
|
||||
__ ushr(v20, __ T2D, v5, -7 & 63);
|
||||
__ ushl(v18, __ T2D, v5, v29);
|
||||
__ ins(v25, __ D, v18, 0, 1);
|
||||
__ orr(v19, __ T16B, v20, v25);
|
||||
|
||||
// [H1:H0] = [D^E1^F1^G1:X0^E0^F0^G0]
|
||||
__ eor(v7, __ T16B, v7, v19);
|
||||
|
||||
// Result = [H1:H0]^[X3:X2]
|
||||
__ eor(v0, __ T16B, v7, v6);
|
||||
|
||||
__ subs(blocks, blocks, 1);
|
||||
__ cbnz(blocks, L_ghash_loop);
|
||||
|
||||
__ ext(v1, __ T16B, v0, v0, 0x08);
|
||||
__ st1(v1, __ T16B, state);
|
||||
__ ret(lr);
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user