8165381: Update for x86 SHA512 using AVX2

Add intrinsics for x86 AVX2 architecture with no SHA instructions.

Reviewed-by: kvn
Authored by Smita Kamath, 2016-10-21 10:16:09 -07:00; committed by Vladimir Kozlov
parent cebdce2234
commit 2bc0337093
11 changed files with 668 additions and 9 deletions
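The stubs below implement the SHA-512 compression function with AVX2 vector instructions plus BMI2 rorxq, so the intrinsic can run on CPUs that lack the SHA extensions. A quick way to confirm the flag on a given machine (assuming a JDK build that contains this change):

java -XX:+PrintFlagsFinal -version | grep UseSHA512Intrinsics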

src/cpu/x86/vm/assembler_x86.cpp

@@ -3298,6 +3298,15 @@ void Assembler::vperm2i128(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8) {
emit_int8(imm8);
}
void Assembler::vperm2f128(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8) {
assert(VM_Version::supports_avx(), "");
InstructionAttr attributes(AVX_256bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
emit_int8(0x06);
emit_int8(0xC0 | encode);
emit_int8(imm8);
}
void Assembler::pause() {
emit_int8((unsigned char)0xF3);
@@ -7359,7 +7368,7 @@ void Assembler::cmppd(XMMRegister dst, XMMRegister nds, XMMRegister src, int cop, int vector_len) {
emit_int8((unsigned char)(0xF & cop));
}
-void Assembler::vpblendd(XMMRegister dst, XMMRegister nds, XMMRegister src1, XMMRegister src2, int vector_len) {
+void Assembler::blendvpd(XMMRegister dst, XMMRegister nds, XMMRegister src1, XMMRegister src2, int vector_len) {
assert(VM_Version::supports_avx(), "");
assert(!VM_Version::supports_evex(), "");
InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
@@ -7370,6 +7379,15 @@ void Assembler::vpblendd(XMMRegister dst, XMMRegister nds, XMMRegister src1, XMMRegister src2, int vector_len) {
emit_int8((unsigned char)(0xF0 & src2_enc<<4));
}
void Assembler::vpblendd(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8, int vector_len) {
assert(VM_Version::supports_avx2(), "");
InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
emit_int8((unsigned char)0x02);
emit_int8((unsigned char)(0xC0 | encode));
emit_int8((unsigned char)imm8);
}
void Assembler::shlxl(Register dst, Register src1, Register src2) {
assert(VM_Version::supports_bmi2(), "");
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);

src/cpu/x86/vm/assembler_x86.hpp

@@ -1550,6 +1550,7 @@ private:
void vpermq(XMMRegister dst, XMMRegister src, int imm8, int vector_len);
void vpermq(XMMRegister dst, XMMRegister src, int imm8);
void vperm2i128(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8);
void vperm2f128(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8);
void pause();
@@ -2105,7 +2106,8 @@ private:
// AVX support for vectorized conditional move (double). The following two instructions are used only as a pair.
void cmppd(XMMRegister dst, XMMRegister nds, XMMRegister src, int cop, int vector_len);
-void vpblendd(XMMRegister dst, XMMRegister nds, XMMRegister src1, XMMRegister src2, int vector_len);
+void blendvpd(XMMRegister dst, XMMRegister nds, XMMRegister src1, XMMRegister src2, int vector_len);
+void vpblendd(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8, int vector_len);
protected:
// Next instructions require address alignment 16 bytes SSE mode.

src/cpu/x86/vm/macroAssembler_x86.cpp

@@ -4309,6 +4309,15 @@ void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
}
}
void MacroAssembler::vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
if (reachable(src)) {
Assembler::vpand(dst, nds, as_Address(src), vector_len);
} else {
lea(rscratch1, src);
Assembler::vpand(dst, nds, Address(rscratch1, 0), vector_len);
}
}
void MacroAssembler::vpbroadcastw(XMMRegister dst, XMMRegister src) {
int dst_enc = dst->encoding();
int src_enc = src->encoding();

src/cpu/x86/vm/macroAssembler_x86.hpp

@@ -943,6 +943,23 @@ class MacroAssembler: public Assembler {
bool multi_block, XMMRegister shuf_mask);
#endif
#ifdef _LP64
private:
void sha512_AVX2_one_round_compute(Register old_h, Register a, Register b, Register c, Register d,
Register e, Register f, Register g, Register h, int iteration);
void sha512_AVX2_one_round_and_schedule(XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
Register a, Register b, Register c, Register d, Register e, Register f,
Register g, Register h, int iteration);
void addmq(int disp, Register r1, Register r2);
public:
void sha512_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
Register buf, Register state, Register ofs, Register limit, Register rsp, bool multi_block,
XMMRegister shuf_mask);
#endif
void fast_sha1(XMMRegister abcd, XMMRegister e0, XMMRegister e1, XMMRegister msg0,
XMMRegister msg1, XMMRegister msg2, XMMRegister msg3, XMMRegister shuf_mask,
Register buf, Register state, Register ofs, Register limit, Register rsp,
@@ -1177,6 +1194,10 @@ public:
void vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
void vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { Assembler::vpand(dst, nds, src, vector_len); }
void vpand(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { Assembler::vpand(dst, nds, src, vector_len); }
void vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len);
void vpbroadcastw(XMMRegister dst, XMMRegister src);
void vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);

src/cpu/x86/vm/macroAssembler_x86_sha.cpp

@@ -674,6 +674,11 @@ void MacroAssembler::addm(int disp, Register r1, Register r2) {
movl(Address(r1, disp), r2);
}
void MacroAssembler::addmq(int disp, Register r1, Register r2) {
addq(r2, Address(r1, disp));
movq(Address(r1, disp), r2);
}
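// For reference: addmq(disp, r1, r2) computes r2 += [r1 + disp] and stores the
// sum back, i.e. it folds a working variable into the in-memory digest state;
// see the addmq(8 * 0, CTX, a) ... addmq(8 * 7, CTX, h) calls in sha512_AVX2 below.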
void MacroAssembler::sha256_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
Register buf, Register state, Register ofs, Register limit, Register rsp,
@@ -1026,4 +1031,488 @@ bind(compute_size1);
bind(compute_size_end1);
}
}
void MacroAssembler::sha512_AVX2_one_round_compute(Register old_h, Register a, Register b, Register c,
Register d, Register e, Register f, Register g, Register h,
int iteration)
{
const Register& y0 = r13;
const Register& y1 = r14;
const Register& y2 = r15;
#ifdef _WIN64
const Register& y3 = rcx;
#else
const Register& y3 = rdi;
#endif
const Register& T1 = r12;
if (iteration % 4 > 0) {
addq(old_h, y2); //h = k + w + h + S0 + S1 + CH = t1 + S0;
}
movq(y2, f); //y2 = f; CH
rorxq(y0, e, 41); //y0 = e >> 41; S1A
rorxq(y1, e, 18); //y1 = e >> 18; S1B
xorq(y2, g); //y2 = f^g; CH
xorq(y0, y1); //y0 = (e >> 41) ^ (e >> 18); S1
rorxq(y1, e, 14); //y1 = (e >> 14); S1
andq(y2, e); //y2 = (f^g)&e; CH
if (iteration % 4 > 0 ) {
addq(old_h, y3); //h = t1 + S0 + MAJ
}
xorq(y0, y1); //y0 = (e >> 41) ^ (e >> 18) ^ (e >> 14); S1
rorxq(T1, a, 34); //T1 = a >> 34; S0B
xorq(y2, g); //y2 = CH = ((f^g)&e) ^g; CH
rorxq(y1, a, 39); //y1 = a >> 39; S0A
movq(y3, a); //y3 = a; MAJA
xorq(y1, T1); //y1 = (a >> 39) ^ (a >> 34); S0
rorxq(T1, a, 28); //T1 = (a >> 28); S0
addq(h, Address(rsp, (8 * iteration))); //h = k + w + h; --
orq(y3, c); //y3 = a | c; MAJA
xorq(y1, T1); //y1 = (a >> 39) ^ (a >> 34) ^ (a >> 28); S0
movq(T1, a); //T1 = a; MAJB
andq(y3, b); //y3 = (a | c)&b; MAJA
andq(T1, c); //T1 = a&c; MAJB
addq(y2, y0); //y2 = S1 + CH; --
addq(d, h); //d = k + w + h + d; --
orq(y3, T1); //y3 = MAJ = (a | c)&b) | (a&c); MAJ
addq(h, y1); //h = k + w + h + S0; --
addq(d, y2); //d = k + w + h + d + S1 + CH = d + t1; --
if (iteration % 4 == 3) {
addq(h, y2); //h = k + w + h + S0 + S1 + CH = t1 + S0; --
addq(h, y3); //h = t1 + S0 + MAJ; --
}
}
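// For reference (not emitted code): one scalar SHA-512 round per FIPS 180-4,
// which the interleaved instructions above compute with rotated register roles:
//   T1 = h + S1(e) + Ch(e, f, g) + K[t] + W[t];
//   T2 = S0(a) + Maj(a, b, c);
//   d += T1; h = T1 + T2;
// with S1(e) = ror64(e, 14) ^ ror64(e, 18) ^ ror64(e, 41)
// and  S0(a) = ror64(a, 28) ^ ror64(a, 34) ^ ror64(a, 39).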
void MacroAssembler::sha512_AVX2_one_round_and_schedule(
XMMRegister xmm4, // ymm4
XMMRegister xmm5, // ymm5
XMMRegister xmm6, // ymm6
XMMRegister xmm7, // ymm7
Register a, //rax
Register b, //rbx
Register c, //rdi
Register d, //rsi
Register e, //r8
Register f, //r9
Register g, //r10
Register h, //r11
int iteration)
{
const Register& y0 = r13;
const Register& y1 = r14;
const Register& y2 = r15;
#ifdef _WIN64
const Register& y3 = rcx;
#else
const Register& y3 = rdi;
#endif
const Register& T1 = r12;
if (iteration % 4 == 0) {
// Extract w[t - 7]
// xmm0 = W[-7]
vperm2f128(xmm0, xmm7, xmm6, 3);
vpalignr(xmm0, xmm0, xmm6, 8, AVX_256bit);
// Calculate w[t - 16] + w[t - 7]
vpaddq(xmm0, xmm0, xmm4, AVX_256bit); //xmm0 = W[-7] + W[-16]
// Extract w[t - 15]
//xmm1 = W[-15]
vperm2f128(xmm1, xmm5, xmm4, 3);
vpalignr(xmm1, xmm1, xmm4, 8, AVX_256bit);
// Calculate sigma0
// Calculate w[t - 15] ror 1
vpsrlq(xmm2, xmm1, 1, AVX_256bit);
vpsllq(xmm3, xmm1, (64 - 1), AVX_256bit);
vpor(xmm3, xmm3, xmm2, AVX_256bit); //xmm3 = W[-15] ror 1
// Calculate w[t - 15] shr 7
vpsrlq(xmm8, xmm1, 7, AVX_256bit); //xmm8 = W[-15] >> 7
} else if (iteration % 4 == 1) {
//Calculate w[t - 15] ror 8
vpsrlq(xmm2, xmm1, 8, AVX_256bit);
vpsllq(xmm1, xmm1, (64 - 8), AVX_256bit);
vpor(xmm1, xmm1, xmm2, AVX_256bit); //xmm1 = W[-15] ror 8
//XOR the three components
vpxor(xmm3, xmm3, xmm8, AVX_256bit); //xmm3 = W[-15] ror 1 ^ W[-15] >> 7
vpxor(xmm1, xmm3, xmm1, AVX_256bit); //xmm1 = s0
//Add three components, w[t - 16], w[t - 7] and sigma0
vpaddq(xmm0, xmm0, xmm1, AVX_256bit); //xmm0 = W[-16] + W[-7] + s0
// Move to appropriate lanes for calculating w[16] and w[17]
vperm2f128(xmm4, xmm0, xmm0, 0); //xmm4 = W[-16] + W[-7] + s0{ BABA }
address MASK_YMM_LO = StubRoutines::x86::pshuffle_byte_flip_mask_addr_sha512();
//Move to appropriate lanes for calculating w[18] and w[19]
vpand(xmm0, xmm0, ExternalAddress(MASK_YMM_LO + 32), AVX_256bit); //xmm0 = W[-16] + W[-7] + s0{ DC00 }
//Calculate w[16] and w[17] in both 128 bit lanes
//Calculate sigma1 for w[16] and w[17] on both 128 bit lanes
vperm2f128(xmm2, xmm7, xmm7, 17); //xmm2 = W[-2] {BABA}
vpsrlq(xmm8, xmm2, 6, AVX_256bit); //xmm8 = W[-2] >> 6 {BABA}
} else if (iteration % 4 == 2) {
vpsrlq(xmm3, xmm2, 19, AVX_256bit); //xmm3 = W[-2] >> 19 {BABA}
vpsllq(xmm1, xmm2, (64 - 19), AVX_256bit); //xmm1 = W[-2] << 19 {BABA}
vpor(xmm3, xmm3, xmm1, AVX_256bit); //xmm3 = W[-2] ror 19 {BABA}
vpxor(xmm8, xmm8, xmm3, AVX_256bit);// xmm8 = W[-2] ror 19 ^ W[-2] >> 6 {BABA}
vpsrlq(xmm3, xmm2, 61, AVX_256bit); //xmm3 = W[-2] >> 61 {BABA}
vpsllq(xmm1, xmm2, (64 - 61), AVX_256bit); //xmm1 = W[-2] << 61 {BABA}
vpor(xmm3, xmm3, xmm1, AVX_256bit); //xmm3 = W[-2] ror 61 {BABA}
vpxor(xmm8, xmm8, xmm3, AVX_256bit); //xmm8 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) { BABA }
//Add sigma1 to the other components to get w[16] and w[17]
vpaddq(xmm4, xmm4, xmm8, AVX_256bit); //xmm4 = { W[1], W[0], W[1], W[0] }
//Calculate sigma1 for w[18] and w[19] for upper 128 bit lane
vpsrlq(xmm8, xmm4, 6, AVX_256bit); //xmm8 = W[-2] >> 6 {DC--}
} else if (iteration % 4 == 3){
vpsrlq(xmm3, xmm4, 19, AVX_256bit); //xmm3 = W[-2] >> 19 {DC--}
vpsllq(xmm1, xmm4, (64 - 19), AVX_256bit); //xmm1 = W[-2] << 19 {DC--}
vpor(xmm3, xmm3, xmm1, AVX_256bit); //xmm3 = W[-2] ror 19 {DC--}
vpxor(xmm8, xmm8, xmm3, AVX_256bit); //xmm8 = W[-2] ror 19 ^ W[-2] >> 6 {DC--}
vpsrlq(xmm3, xmm4, 61, AVX_256bit); //xmm3 = W[-2] >> 61 {DC--}
vpsllq(xmm1, xmm4, (64 - 61), AVX_256bit); //xmm1 = W[-2] << 61 {DC--}
vpor(xmm3, xmm3, xmm1, AVX_256bit); //xmm3 = W[-2] ror 61 {DC--}
vpxor(xmm8, xmm8, xmm3, AVX_256bit); //xmm8 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) { DC-- }
//Add the sigma0 + w[t - 7] + w[t - 16] for w[18] and w[19] to newly calculated sigma1 to get w[18] and w[19]
vpaddq(xmm2, xmm0, xmm8, AVX_256bit); //xmm2 = { W[3], W[2], --, -- }
//Form w[19], w[18], w[17], w[16]
vpblendd(xmm4, xmm4, xmm2, 0xF0, AVX_256bit); //xmm4 = { W[3], W[2], W[1], W[0] }
}
movq(y3, a); //y3 = a; MAJA
rorxq(y0, e, 41); // y0 = e >> 41; S1A
rorxq(y1, e, 18); //y1 = e >> 18; S1B
addq(h, Address(rsp, (iteration * 8))); //h = k + w + h; --
orq(y3, c); //y3 = a | c; MAJA
movq(y2, f); //y2 = f; CH
xorq(y2, g); //y2 = f^g; CH
rorxq(T1, a, 34); //T1 = a >> 34; S0B
xorq(y0, y1); //y0 = (e >> 41) ^ (e >> 18); S1
rorxq(y1, e, 14); //y1 = (e >> 14); S1
andq(y2, e); //y2 = (f^g) & e; CH
addq(d, h); //d = k + w + h + d; --
andq(y3, b); //y3 = (a | c)&b; MAJA
xorq(y0, y1); //y0 = (e >> 41) ^ (e >> 18) ^ (e >> 14); S1
rorxq(y1, a, 39); //y1 = a >> 39; S0A
xorq(y1, T1); //y1 = (a >> 39) ^ (a >> 34); S0
rorxq(T1, a, 28); //T1 = (a >> 28); S0
xorq(y2, g); //y2 = CH = ((f^g)&e) ^ g; CH
xorq(y1, T1); //y1 = (a >> 39) ^ (a >> 34) ^ (a >> 28); S0
movq(T1, a); //T1 = a; MAJB
andq(T1, c); //T1 = a&c; MAJB
addq(y2, y0); //y2 = S1 + CH; --
orq(y3, T1); //y3 = MAJ = (a | c)&b) | (a&c); MAJ
addq(h, y1); //h = k + w + h + S0; --
addq(d, y2); //d = k + w + h + d + S1 + CH = d + t1; --
addq(h, y2); //h = k + w + h + S0 + S1 + CH = t1 + S0; --
addq(h, y3); //h = t1 + S0 + MAJ; --
}
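// For reference (not emitted code): the message-schedule recurrence evaluated
// above, four new qwords per group of four iterations (FIPS 180-4):
//   s0   = ror64(w[t-15], 1) ^ ror64(w[t-15], 8) ^ (w[t-15] >> 7);
//   s1   = ror64(w[t-2], 19) ^ ror64(w[t-2], 61) ^ (w[t-2] >> 6);
//   w[t] = w[t-16] + s0 + w[t-7] + s1;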
void MacroAssembler::sha512_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
Register buf, Register state, Register ofs, Register limit, Register rsp,
bool multi_block, XMMRegister shuf_mask)
{
Label loop0, loop1, loop2, done_hash,
compute_block_size, compute_size,
compute_block_size_end, compute_size_end;
address K512_W = StubRoutines::x86::k512_W_addr();
address pshuffle_byte_flip_mask_sha512 = StubRoutines::x86::pshuffle_byte_flip_mask_addr_sha512();
address pshuffle_byte_flip_mask_addr = 0;
const XMMRegister& XFER = xmm0; // YTMP0
const XMMRegister& BYTE_FLIP_MASK = xmm9; // ymm9
#ifdef _WIN64
const Register& INP = rcx; //1st arg
const Register& CTX = rdx; //2nd arg
const Register& NUM_BLKS = r8; //3rd arg
const Register& c = rdi;
const Register& d = rsi;
const Register& e = r8;
const Register& y3 = rcx;
const Register& offset = r8;
const Register& input_limit = r9;
#else
const Register& INP = rdi; //1st arg
const Register& CTX = rsi; //2nd arg
const Register& NUM_BLKS = rdx; //3rd arg
const Register& c = rcx;
const Register& d = r8;
const Register& e = rdx;
const Register& y3 = rdi;
const Register& offset = rdx;
const Register& input_limit = rcx;
#endif
const Register& TBL = rbp;
const Register& a = rax;
const Register& b = rbx;
const Register& f = r9;
const Register& g = r10;
const Register& h = r11;
//Local variables as defined in the original assembly file.
enum
{
_XFER_SIZE = 4 * 8, // resq 4 => reserve 4 quadwords. Hence 4 * 8
_SRND_SIZE = 8, // resq 1
_INP_SIZE = 8,
_INP_END_SIZE = 8,
_RSP_SAVE_SIZE = 8, // defined as resq 1
#ifdef _WIN64
_GPR_SAVE_SIZE = 8 * 8, // defined as resq 8
#else
_GPR_SAVE_SIZE = 6 * 8 // resq 6
#endif
};
enum
{
_XFER = 0,
_SRND = _XFER + _XFER_SIZE, // 32
_INP = _SRND + _SRND_SIZE, // 40
_INP_END = _INP + _INP_SIZE, // 48
_RSP = _INP_END + _INP_END_SIZE, // 56
_GPR = _RSP + _RSP_SAVE_SIZE, // 64
_STACK_SIZE = _GPR + _GPR_SAVE_SIZE // 128 for windows and 112 for linux.
};
//Save offset and limit: they are needed for the block-size calculation in multi-block SHA512.
#ifdef _WIN64
push(r8); // win64: this is ofs
push(r9); // win64: this is limit, we need them again at the very end.
#else
push(rdx); // linux: this is ofs; needed at the end for the multi-block calculation
push(rcx); // linux: this is the limit.
#endif
//Allocate Stack Space
movq(rax, rsp);
subq(rsp, _STACK_SIZE);
andq(rsp, -32);
movq(Address(rsp, _RSP), rax);
//Save GPRs
movq(Address(rsp, _GPR), rbp);
movq(Address(rsp, (_GPR + 8)), rbx);
movq(Address(rsp, (_GPR + 16)), r12);
movq(Address(rsp, (_GPR + 24)), r13);
movq(Address(rsp, (_GPR + 32)), r14);
movq(Address(rsp, (_GPR + 40)), r15);
#ifdef _WIN64
movq(Address(rsp, (_GPR + 48)), rsi);
movq(Address(rsp, (_GPR + 56)), rdi);
#endif
vpblendd(xmm0, xmm0, xmm1, 0xF0, AVX_128bit);
vpblendd(xmm0, xmm0, xmm1, 0xF0, AVX_256bit);
if (multi_block) {
xorq(rax, rax);
bind(compute_block_size);
cmpptr(offset, input_limit); // Assuming that offset is less than limit.
jccb(Assembler::aboveEqual, compute_block_size_end);
addq(offset, 128);
addq(rax, 128);
jmpb(compute_block_size);
bind(compute_block_size_end);
movq(NUM_BLKS, rax);
cmpq(NUM_BLKS, 0);
jcc(Assembler::equal, done_hash);
} else {
xorq(NUM_BLKS, NUM_BLKS); //If single block.
addq(NUM_BLKS, 128);
}
addq(NUM_BLKS, INP); //pointer to end of data
movq(Address(rsp, _INP_END), NUM_BLKS);
//load initial digest
movq(a, Address(CTX, 8 * 0));
movq(b, Address(CTX, 8 * 1));
movq(c, Address(CTX, 8 * 2));
movq(d, Address(CTX, 8 * 3));
movq(e, Address(CTX, 8 * 4));
movq(f, Address(CTX, 8 * 5));
movq(g, Address(CTX, 8 * 6));
movq(h, Address(CTX, 8 * 7));
pshuffle_byte_flip_mask_addr = pshuffle_byte_flip_mask_sha512;
vmovdqu(BYTE_FLIP_MASK, ExternalAddress(pshuffle_byte_flip_mask_addr + 0)); //PSHUFFLE_BYTE_FLIP_MASK wrt rip
bind(loop0);
lea(TBL, ExternalAddress(K512_W));
//byte swap first 16 qwords
vmovdqu(xmm4, Address(INP, 32 * 0));
vpshufb(xmm4, xmm4, BYTE_FLIP_MASK, AVX_256bit);
vmovdqu(xmm5, Address(INP, 32 * 1));
vpshufb(xmm5, xmm5, BYTE_FLIP_MASK, AVX_256bit);
vmovdqu(xmm6, Address(INP, 32 * 2));
vpshufb(xmm6, xmm6, BYTE_FLIP_MASK, AVX_256bit);
vmovdqu(xmm7, Address(INP, 32 * 3));
vpshufb(xmm7, xmm7, BYTE_FLIP_MASK, AVX_256bit);
movq(Address(rsp, _INP), INP);
movslq(Address(rsp, _SRND), 4);
align(16);
//Schedule 64 input qwords, by calling sha512_AVX2_one_round_and_schedule
bind(loop1);
vpaddq(xmm0, xmm4, Address(TBL, 0 * 32), AVX_256bit);
vmovdqu(Address(rsp, _XFER), xmm0);
//four rounds and schedule
sha512_AVX2_one_round_and_schedule(xmm4, xmm5, xmm6, xmm7, a, b, c, d, e, f, g, h, 0);
sha512_AVX2_one_round_and_schedule(xmm4, xmm5, xmm6, xmm7, h, a, b, c, d, e, f, g, 1);
sha512_AVX2_one_round_and_schedule(xmm4, xmm5, xmm6, xmm7, g, h, a, b, c, d, e, f, 2);
sha512_AVX2_one_round_and_schedule(xmm4, xmm5, xmm6, xmm7, f, g, h, a, b, c, d, e, 3);
vpaddq(xmm0, xmm5, Address(TBL, 1 * 32), AVX_256bit);
vmovdqu(Address(rsp, _XFER), xmm0);
//four rounds and schedule
sha512_AVX2_one_round_and_schedule(xmm5, xmm6, xmm7, xmm4, e, f, g, h, a, b, c, d, 0);
sha512_AVX2_one_round_and_schedule(xmm5, xmm6, xmm7, xmm4, d, e, f, g, h, a, b, c, 1);
sha512_AVX2_one_round_and_schedule(xmm5, xmm6, xmm7, xmm4, c, d, e, f, g, h, a, b, 2);
sha512_AVX2_one_round_and_schedule(xmm5, xmm6, xmm7, xmm4, b, c, d, e, f, g, h, a, 3);
vpaddq(xmm0, xmm6, Address(TBL, 2 * 32), AVX_256bit);
vmovdqu(Address(rsp, _XFER), xmm0);
//four rounds and schedule
sha512_AVX2_one_round_and_schedule(xmm6, xmm7, xmm4, xmm5, a, b, c, d, e, f, g, h, 0);
sha512_AVX2_one_round_and_schedule(xmm6, xmm7, xmm4, xmm5, h, a, b, c, d, e, f, g, 1);
sha512_AVX2_one_round_and_schedule(xmm6, xmm7, xmm4, xmm5, g, h, a, b, c, d, e, f, 2);
sha512_AVX2_one_round_and_schedule(xmm6, xmm7, xmm4, xmm5, f, g, h, a, b, c, d, e, 3);
vpaddq(xmm0, xmm7, Address(TBL, 3 * 32), AVX_256bit);
vmovdqu(Address(rsp, _XFER), xmm0);
addq(TBL, 4 * 32);
//four rounds and schedule
sha512_AVX2_one_round_and_schedule(xmm7, xmm4, xmm5, xmm6, e, f, g, h, a, b, c, d, 0);
sha512_AVX2_one_round_and_schedule(xmm7, xmm4, xmm5, xmm6, d, e, f, g, h, a, b, c, 1);
sha512_AVX2_one_round_and_schedule(xmm7, xmm4, xmm5, xmm6, c, d, e, f, g, h, a, b, 2);
sha512_AVX2_one_round_and_schedule(xmm7, xmm4, xmm5, xmm6, b, c, d, e, f, g, h, a, 3);
subq(Address(rsp, _SRND), 1);
jcc(Assembler::notEqual, loop1);
movslq(Address(rsp, _SRND), 2);
bind(loop2);
vpaddq(xmm0, xmm4, Address(TBL, 0 * 32), AVX_256bit);
vmovdqu(Address(rsp, _XFER), xmm0);
//four rounds and compute.
sha512_AVX2_one_round_compute(a, a, b, c, d, e, f, g, h, 0);
sha512_AVX2_one_round_compute(h, h, a, b, c, d, e, f, g, 1);
sha512_AVX2_one_round_compute(g, g, h, a, b, c, d, e, f, 2);
sha512_AVX2_one_round_compute(f, f, g, h, a, b, c, d, e, 3);
vpaddq(xmm0, xmm5, Address(TBL, 1 * 32), AVX_256bit);
vmovdqu(Address(rsp, _XFER), xmm0);
addq(TBL, 2 * 32);
// four rounds and compute.
sha512_AVX2_one_round_compute(e, e, f, g, h, a, b, c, d, 0);
sha512_AVX2_one_round_compute(d, d, e, f, g, h, a, b, c, 1);
sha512_AVX2_one_round_compute(c, c, d, e, f, g, h, a, b, 2);
sha512_AVX2_one_round_compute(b, b, c, d, e, f, g, h, a, 3);
vmovdqu(xmm4, xmm6);
vmovdqu(xmm5, xmm7);
subq(Address(rsp, _SRND), 1);
jcc(Assembler::notEqual, loop2);
addmq(8 * 0, CTX, a);
addmq(8 * 1, CTX, b);
addmq(8 * 2, CTX, c);
addmq(8 * 3, CTX, d);
addmq(8 * 4, CTX, e);
addmq(8 * 5, CTX, f);
addmq(8 * 6, CTX, g);
addmq(8 * 7, CTX, h);
movq(INP, Address(rsp, _INP));
addq(INP, 128);
cmpq(INP, Address(rsp, _INP_END));
jcc(Assembler::notEqual, loop0);
bind(done_hash);
//Restore GPRs
movq(rbp, Address(rsp, (_GPR + 0)));
movq(rbx, Address(rsp, (_GPR + 8)));
movq(r12, Address(rsp, (_GPR + 16)));
movq(r13, Address(rsp, (_GPR + 24)));
movq(r14, Address(rsp, (_GPR + 32)));
movq(r15, Address(rsp, (_GPR + 40)));
#ifdef _WIN64
movq(rsi, Address(rsp, (_GPR + 48)));
movq(rdi, Address(rsp, (_GPR + 56)));
#endif
//Restore Stack Pointer
movq(rsp, Address(rsp, _RSP));
#ifdef _WIN64
pop(r9);
pop(r8);
#else
pop(rcx);
pop(rdx);
#endif
if (multi_block) {
#ifdef _WIN64
const Register& limit_end = r9;
const Register& ofs_end = r8;
#else
const Register& limit_end = rcx;
const Register& ofs_end = rdx;
#endif
movq(rax, ofs_end);
bind(compute_size);
cmpptr(rax, limit_end);
jccb(Assembler::aboveEqual, compute_size_end);
addq(rax, 128);
jmpb(compute_size);
bind(compute_size_end);
}
}
#endif //#ifdef _LP64

src/cpu/x86/vm/stubGenerator_x86_64.cpp

@@ -3718,6 +3718,25 @@ class StubGenerator: public StubCodeGenerator {
return start;
}
//Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
address generate_pshuffle_byte_flip_mask_sha512() {
__ align(32);
StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask_sha512");
address start = __ pc();
if (VM_Version::supports_avx2()) {
__ emit_data64(0x0001020304050607, relocInfo::none); // PSHUFFLE_BYTE_FLIP_MASK
__ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
__ emit_data64(0x1011121314151617, relocInfo::none);
__ emit_data64(0x18191a1b1c1d1e1f, relocInfo::none);
__ emit_data64(0x0000000000000000, relocInfo::none); //MASK_YMM_LO
__ emit_data64(0x0000000000000000, relocInfo::none);
__ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
__ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
}
return start;
}
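// For reference: (v)pshufb treats each mask byte as a source-byte index, so the
// first 32 bytes above (07 06 ... 00 within each qword, little-endian) reverse
// the byte order of every input qword, turning the little-endian message into
// the big-endian form SHA-512 requires. The second 32 bytes (MASK_YMM_LO) form a
// vpand mask that clears the low 128-bit lane and keeps the high one; see its
// use in sha512_AVX2_one_round_and_schedule.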
// ofs and limit are used for the multi-block byte array.
// int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
address generate_sha256_implCompress(bool multi_block, const char *name) {
@@ -3761,6 +3780,39 @@ class StubGenerator: public StubCodeGenerator {
return start;
}
address generate_sha512_implCompress(bool multi_block, const char *name) {
assert(VM_Version::supports_avx2(), "");
assert(VM_Version::supports_bmi2(), "");
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", name);
address start = __ pc();
Register buf = c_rarg0;
Register state = c_rarg1;
Register ofs = c_rarg2;
Register limit = c_rarg3;
const XMMRegister msg = xmm0;
const XMMRegister state0 = xmm1;
const XMMRegister state1 = xmm2;
const XMMRegister msgtmp0 = xmm3;
const XMMRegister msgtmp1 = xmm4;
const XMMRegister msgtmp2 = xmm5;
const XMMRegister msgtmp3 = xmm6;
const XMMRegister msgtmp4 = xmm7;
const XMMRegister shuf_mask = xmm8;
__ enter();
__ sha512_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
buf, state, ofs, limit, rsp, multi_block, shuf_mask);
__ leave();
__ ret(0);
return start;
}
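// For reference, by analogy with the sha256 comment above: these stubs back the
// intrinsified com.sun.security.provider.DigestBase.implCompress /
// implCompressMultiBlock calls, reached e.g. via
// MessageDigest.getInstance("SHA-512").digest(data).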
// This is a version of CTR/AES crypt which does 6 blocks in a loop at a time
// to hide instruction latency
//
@@ -5081,6 +5133,12 @@ class StubGenerator: public StubCodeGenerator {
StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress");
StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB");
}
if (UseSHA512Intrinsics) {
StubRoutines::x86::_k512_W_addr = (address)StubRoutines::x86::_k512_W;
StubRoutines::x86::_pshuffle_byte_flip_mask_addr_sha512 = generate_pshuffle_byte_flip_mask_sha512();
StubRoutines::_sha512_implCompress = generate_sha512_implCompress(false, "sha512_implCompress");
StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true, "sha512_implCompressMB");
}
// Generate GHASH intrinsics code
if (UseGHASHIntrinsics) {

src/cpu/x86/vm/stubRoutines_x86.cpp

@@ -48,6 +48,8 @@ address StubRoutines::x86::_shuffle_byte_flip_mask_addr = NULL;
address StubRoutines::x86::_k256_adr = NULL;
#ifdef _LP64
address StubRoutines::x86::_k256_W_adr = NULL;
address StubRoutines::x86::_k512_W_addr = NULL;
address StubRoutines::x86::_pshuffle_byte_flip_mask_addr_sha512 = NULL;
#endif
address StubRoutines::x86::_pshuffle_byte_flip_mask_addr = NULL;
@@ -297,4 +299,49 @@ ALIGNED_(64) juint StubRoutines::x86::_k256[] =
// used in MacroAssembler::sha256_AVX2
// dynamically built from _k256
ALIGNED_(64) juint StubRoutines::x86::_k256_W[2*sizeof(StubRoutines::x86::_k256)];
// used in MacroAssembler::sha512_AVX2
ALIGNED_(64) julong StubRoutines::x86::_k512_W[] =
{
0x428a2f98d728ae22LL, 0x7137449123ef65cdLL,
0xb5c0fbcfec4d3b2fLL, 0xe9b5dba58189dbbcLL,
0x3956c25bf348b538LL, 0x59f111f1b605d019LL,
0x923f82a4af194f9bLL, 0xab1c5ed5da6d8118LL,
0xd807aa98a3030242LL, 0x12835b0145706fbeLL,
0x243185be4ee4b28cLL, 0x550c7dc3d5ffb4e2LL,
0x72be5d74f27b896fLL, 0x80deb1fe3b1696b1LL,
0x9bdc06a725c71235LL, 0xc19bf174cf692694LL,
0xe49b69c19ef14ad2LL, 0xefbe4786384f25e3LL,
0x0fc19dc68b8cd5b5LL, 0x240ca1cc77ac9c65LL,
0x2de92c6f592b0275LL, 0x4a7484aa6ea6e483LL,
0x5cb0a9dcbd41fbd4LL, 0x76f988da831153b5LL,
0x983e5152ee66dfabLL, 0xa831c66d2db43210LL,
0xb00327c898fb213fLL, 0xbf597fc7beef0ee4LL,
0xc6e00bf33da88fc2LL, 0xd5a79147930aa725LL,
0x06ca6351e003826fLL, 0x142929670a0e6e70LL,
0x27b70a8546d22ffcLL, 0x2e1b21385c26c926LL,
0x4d2c6dfc5ac42aedLL, 0x53380d139d95b3dfLL,
0x650a73548baf63deLL, 0x766a0abb3c77b2a8LL,
0x81c2c92e47edaee6LL, 0x92722c851482353bLL,
0xa2bfe8a14cf10364LL, 0xa81a664bbc423001LL,
0xc24b8b70d0f89791LL, 0xc76c51a30654be30LL,
0xd192e819d6ef5218LL, 0xd69906245565a910LL,
0xf40e35855771202aLL, 0x106aa07032bbd1b8LL,
0x19a4c116b8d2d0c8LL, 0x1e376c085141ab53LL,
0x2748774cdf8eeb99LL, 0x34b0bcb5e19b48a8LL,
0x391c0cb3c5c95a63LL, 0x4ed8aa4ae3418acbLL,
0x5b9cca4f7763e373LL, 0x682e6ff3d6b2b8a3LL,
0x748f82ee5defb2fcLL, 0x78a5636f43172f60LL,
0x84c87814a1f0ab72LL, 0x8cc702081a6439ecLL,
0x90befffa23631e28LL, 0xa4506cebde82bde9LL,
0xbef9a3f7b2c67915LL, 0xc67178f2e372532bLL,
0xca273eceea26619cLL, 0xd186b8c721c0c207LL,
0xeada7dd6cde0eb1eLL, 0xf57d4f7fee6ed178LL,
0x06f067aa72176fbaLL, 0x0a637dc5a2c898a6LL,
0x113f9804bef90daeLL, 0x1b710b35131c471bLL,
0x28db77f523047d84LL, 0x32caab7b40c72493LL,
0x3c9ebe0a15c9bebcLL, 0x431d67c49c100d4cLL,
0x4cc5d4becb3e42b6LL, 0x597f299cfc657e2aLL,
0x5fcb6fab3ad6faecLL, 0x6c44198c4a475817LL,
};
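// These are the standard SHA-512 round constants K[0..79]: the first 64 bits of
// the fractional parts of the cube roots of the first eighty primes (FIPS 180-4).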
#endif

src/cpu/x86/vm/stubRoutines_x86.hpp

@@ -33,7 +33,7 @@ static bool returns_to_call_stub(address return_pc) { return return_pc == _call_stub_return_address; }
enum platform_dependent_constants {
code_size1 = 20000 LP64_ONLY(+10000), // simply increase if too small (assembler will crash if too small)
-code_size2 = 33800 LP64_ONLY(+1200) // simply increase if too small (assembler will crash if too small)
+code_size2 = 33800 LP64_ONLY(+10000) // simply increase if too small (assembler will crash if too small)
};
class x86 {
@@ -134,6 +134,10 @@ class x86 {
#ifdef _LP64
static juint _k256_W[];
static address _k256_W_adr;
static julong _k512_W[];
static address _k512_W_addr;
// byte flip mask for sha512
static address _pshuffle_byte_flip_mask_addr_sha512;
#endif
// byte flip mask for sha256
static address _pshuffle_byte_flip_mask_addr;
@@ -192,6 +196,8 @@ class x86 {
static address k256_addr() { return _k256_adr; }
#ifdef _LP64
static address k256_W_addr() { return _k256_W_adr; }
static address k512_W_addr() { return _k512_W_addr; }
static address pshuffle_byte_flip_mask_addr_sha512() { return _pshuffle_byte_flip_mask_addr_sha512; }
#endif
static address pshuffle_byte_flip_mask_addr() { return _pshuffle_byte_flip_mask_addr; }
static void generate_CRC32C_table(bool is_pclmulqdq_supported);

src/cpu/x86/vm/vm_version_x86.cpp

@@ -769,7 +769,11 @@ void VM_Version::get_processor_features() {
FLAG_SET_DEFAULT(UseSHA256Intrinsics, false);
}
-if (UseSHA512Intrinsics) {
+if (UseSHA) {
+if (FLAG_IS_DEFAULT(UseSHA512Intrinsics)) {
+FLAG_SET_DEFAULT(UseSHA512Intrinsics, true);
+}
+} else if (UseSHA512Intrinsics) {
warning("Intrinsics for SHA-384 and SHA-512 crypto hash functions not available on this CPU.");
FLAG_SET_DEFAULT(UseSHA512Intrinsics, false);
}
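// Net effect of the hunk above: when UseSHA is enabled, UseSHA512Intrinsics now
// defaults to true; an explicit -XX:+UseSHA512Intrinsics without the required
// CPU support still warns and is reset to false.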

src/cpu/x86/vm/x86.ad

@@ -8173,13 +8173,13 @@ instruct vcmov4D_reg(vecY dst, vecY src1, vecY src2, immI8 cop, cmpOp_vcmppd copnd) %{
match(Set dst (CMoveVD (Binary copnd cop) (Binary src1 src2)));
effect(TEMP dst, USE src1, USE src2);
format %{ "cmppd.$copnd $dst, $src1, $src2 ! vcmovevd, cond=$cop\n\t"
"vpblendd $dst,$src1,$src2,$dst ! vcmovevd\n\t"
"blendvpd $dst,$src1,$src2,$dst ! vcmovevd\n\t"
%}
ins_encode %{
int vector_len = 1;
int cond = (Assembler::Condition)($copnd$$cmpcode);
__ cmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vector_len);
-__ vpblendd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vector_len);
+__ blendvpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vector_len);
%}
ins_pipe( pipe_slow );
%}

test/compiler/testlibrary/sha/predicate/IntrinsicPredicates.java

@@ -78,9 +78,14 @@ public class IntrinsicPredicates {
new CPUSpecificPredicate("aarch64.*", new String[] { "sha256" },null)))))));
public static final BooleanSupplier SHA512_INSTRUCTION_AVAILABLE
-= new OrPredicate(
-new CPUSpecificPredicate("sparc.*", new String[] { "sha512" },null),
-new CPUSpecificPredicate("aarch64.*", new String[] { "sha512" },null));
+= new OrPredicate(new CPUSpecificPredicate("x86.*", new String[] { "sha" },null),
+new OrPredicate(new CPUSpecificPredicate("amd64.*", new String[] { "sha" },null),
+new OrPredicate(new CPUSpecificPredicate("i386.*", new String[] { "sha" },null),
+new OrPredicate(new CPUSpecificPredicate("x86_64", new String[] { "avx2", "bmi2" }, null),
+new OrPredicate(new CPUSpecificPredicate("amd64.*", new String[] { "avx2", "bmi2" }, null),
+new OrPredicate(
+new CPUSpecificPredicate("sparc.*", new String[] { "sha512" },null),
+new CPUSpecificPredicate("aarch64.*", new String[] { "sha512" },null)))))));
public static final BooleanSupplier ANY_SHA_INSTRUCTION_AVAILABLE
= new OrPredicate(IntrinsicPredicates.SHA1_INSTRUCTION_AVAILABLE,