8319716: RISC-V: Add SHA-2

Co-authored-by: Robbin Ehn <rehn@openjdk.org>
Reviewed-by: fyang, mli, luhenry
This commit is contained in:
Ludovic Henry 2024-01-09 07:26:35 +00:00 committed by Robbin Ehn
parent 7286f5291d
commit 4cf131a101
5 changed files with 516 additions and 19 deletions

View File

@ -1334,6 +1334,7 @@ enum VectorMask {
INSN(vsll_vi, 0b1010111, 0b011, 0b100101);
// Vector Slide Instructions
INSN(vslideup_vi, 0b1010111, 0b011, 0b001110);
INSN(vslidedown_vi, 0b1010111, 0b011, 0b001111);
#undef INSN
@ -1689,7 +1690,6 @@ enum VectorMask {
INSN(vmv_v_x, 0b1010111, 0b100, v0, 0b1, 0b010111);
#undef INSN
#undef patch_VArith
#define INSN(NAME, op, funct13, funct6) \
void NAME(VectorRegister Vd, VectorMask vm = unmasked) { \
@ -1731,14 +1731,29 @@ enum Nf {
patch_reg((address)&insn, 15, Rs1); \
emit(insn)
#define INSN(NAME, op, lumop, vm, mop, nf) \
void NAME(VectorRegister Vd, Register Rs1, uint32_t width = 0, bool mew = false) { \
#define INSN(NAME, op, width, lumop, vm, mop, mew, nf) \
void NAME(VectorRegister Vd, Register Rs1) { \
guarantee(is_uimm3(width), "width is invalid"); \
patch_VLdSt(op, Vd, width, Rs1, lumop, vm, mop, mew, nf); \
}
// Vector Load/Store Instructions
INSN(vl1re8_v, 0b0000111, 0b01000, 0b1, 0b00, g1);
INSN(vl1re8_v, 0b0000111, 0b000, 0b01000, 0b1, 0b00, 0b0, g1);
INSN(vl1re16_v, 0b0000111, 0b101, 0b01000, 0b1, 0b00, 0b0, g1);
INSN(vl1re32_v, 0b0000111, 0b110, 0b01000, 0b1, 0b00, 0b0, g1);
INSN(vl1re64_v, 0b0000111, 0b111, 0b01000, 0b1, 0b00, 0b0, g1);
INSN(vl2re8_v, 0b0000111, 0b000, 0b01000, 0b1, 0b00, 0b0, g2);
INSN(vl2re16_v, 0b0000111, 0b101, 0b01000, 0b1, 0b00, 0b0, g2);
INSN(vl2re32_v, 0b0000111, 0b110, 0b01000, 0b1, 0b00, 0b0, g2);
INSN(vl2re64_v, 0b0000111, 0b111, 0b01000, 0b1, 0b00, 0b0, g2);
INSN(vl4re8_v, 0b0000111, 0b000, 0b01000, 0b1, 0b00, 0b0, g4);
INSN(vl4re16_v, 0b0000111, 0b101, 0b01000, 0b1, 0b00, 0b0, g4);
INSN(vl4re32_v, 0b0000111, 0b110, 0b01000, 0b1, 0b00, 0b0, g4);
INSN(vl4re64_v, 0b0000111, 0b111, 0b01000, 0b1, 0b00, 0b0, g4);
INSN(vl8re8_v, 0b0000111, 0b000, 0b01000, 0b1, 0b00, 0b0, g8);
INSN(vl8re16_v, 0b0000111, 0b101, 0b01000, 0b1, 0b00, 0b0, g8);
INSN(vl8re32_v, 0b0000111, 0b110, 0b01000, 0b1, 0b00, 0b0, g8);
INSN(vl8re64_v, 0b0000111, 0b111, 0b01000, 0b1, 0b00, 0b0, g8);
#undef INSN
@ -1749,6 +1764,9 @@ enum Nf {
// Vector Load/Store Instructions
INSN(vs1r_v, 0b0100111, 0b000, 0b01000, 0b1, 0b00, 0b0, g1);
INSN(vs2r_v, 0b0100111, 0b000, 0b01000, 0b1, 0b00, 0b0, g2);
INSN(vs4r_v, 0b0100111, 0b000, 0b01000, 0b1, 0b00, 0b0, g4);
INSN(vs8r_v, 0b0100111, 0b000, 0b01000, 0b1, 0b00, 0b0, g8);
#undef INSN
@ -1794,9 +1812,11 @@ enum Nf {
}
// Vector unordered indexed load instructions
INSN( vluxei8_v, 0b0000111, 0b000, 0b01, 0b0);
INSN(vluxei32_v, 0b0000111, 0b110, 0b01, 0b0);
// Vector unordered indexed store instructions
INSN( vsuxei8_v, 0b0100111, 0b000, 0b01, 0b0);
INSN(vsuxei32_v, 0b0100111, 0b110, 0b01, 0b0);
#undef INSN
@ -1820,6 +1840,55 @@ enum Nf {
#undef INSN
#undef patch_VLdSt
// ====================================
// RISC-V Vector Crypto Extension
// ====================================
// Emitters for three-operand vector crypto instructions.
// Every INSN(...) entry below expands to a member function
//   NAME(Vd, Vs2, Vs1, vm = unmasked)
// that forwards the entry's op/funct3/funct6 bit patterns, the raw encoding of
// Vs1, Vs2 and the mask to patch_VArith.
// NOTE(review): the *_vx and *_vi entries are generated with this same
// signature, so their third operand is typed VectorRegister rather than a
// scalar Register or an immediate -- confirm this is intended before adding callers.
#define INSN(NAME, op, funct3, funct6) \
void NAME(VectorRegister Vd, VectorRegister Vs2, VectorRegister Vs1, VectorMask vm = unmasked) { \
patch_VArith(op, Vd, funct3, Vs1->raw_encoding(), Vs2, vm, funct6); \
}
// Vector Bit-manipulation used in Cryptography (Zvkb) Extension
// NOTE(review): the vclmul* entries are presumably carry-less multiply
// instructions from a separate extension (Zvbc) -- verify against the spec.
INSN(vandn_vv, 0b1010111, 0b000, 0b000001);
INSN(vandn_vx, 0b1010111, 0b100, 0b000001);
INSN(vandn_vi, 0b1010111, 0b011, 0b000001);
INSN(vclmul_vv, 0b1010111, 0b010, 0b001100);
INSN(vclmul_vx, 0b1010111, 0b110, 0b001100);
INSN(vclmulh_vv, 0b1010111, 0b010, 0b001101);
INSN(vclmulh_vx, 0b1010111, 0b110, 0b001101);
INSN(vror_vv, 0b1010111, 0b000, 0b010100);
INSN(vror_vx, 0b1010111, 0b100, 0b010100);
INSN(vrol_vv, 0b1010111, 0b000, 0b010101);
INSN(vrol_vx, 0b1010111, 0b100, 0b010101);
#undef INSN
// Emitters for two-operand instructions whose Vs1 field is a fixed constant.
// Every INSN(...) entry below expands to a member function
//   NAME(Vd, Vs2, vm = unmasked)
// and passes the table's Vs1 value (fourth macro argument) unchanged to
// patch_VArith, together with op/funct3/funct6 from the same entry.
#define INSN(NAME, op, funct3, Vs1, funct6) \
void NAME(VectorRegister Vd, VectorRegister Vs2, VectorMask vm = unmasked) { \
patch_VArith(op, Vd, funct3, Vs1, Vs2, vm, funct6); \
}
// Vector Bit-manipulation used in Cryptography (Zvkb) Extension
// Both entries share funct6 and differ only in the constant Vs1 field.
INSN(vbrev8_v, 0b1010111, 0b010, 0b01000, 0b010010);
INSN(vrev8_v, 0b1010111, 0b010, 0b01001, 0b010010);
#undef INSN
// Emitters for three-operand instructions without a mask parameter.
// Every INSN(...) entry below expands to a member function
//   NAME(Vd, Vs2, Vs1)
// The vm value is taken from the table (fourth macro argument, 0b1 for every
// entry here) instead of from the caller, and is forwarded to patch_VArith
// along with the raw encoding of Vs1.
#define INSN(NAME, op, funct3, vm, funct6) \
void NAME(VectorRegister Vd, VectorRegister Vs2, VectorRegister Vs1) { \
patch_VArith(op, Vd, funct3, Vs1->raw_encoding(), Vs2, vm, funct6); \
}
// Vector SHA-2 Secure Hash (Zvknh[ab]) Extension
INSN(vsha2ms_vv, 0b1110111, 0b010, 0b1, 0b101101);
INSN(vsha2ch_vv, 0b1110111, 0b010, 0b1, 0b101110);
INSN(vsha2cl_vv, 0b1110111, 0b010, 0b1, 0b101111);
#undef INSN
#undef patch_VArith
// ====================================
// RISC-V Bit-Manipulation Extension
// Currently only support Zba, Zbb and Zbs bitmanip extensions.

View File

@ -113,6 +113,8 @@ define_pd_global(intx, InlineSmallCode, 1000);
product(bool, UseZtso, false, EXPERIMENTAL, "Assume Ztso memory model") \
product(bool, UseZihintpause, false, EXPERIMENTAL, \
"Use Zihintpause instructions") \
product(bool, UseZvkn, false, EXPERIMENTAL, \
"Use Zvkn group extension, Zvkned, Zvknhb, Zvkb, Zvkt") \
product(bool, UseRVVForBigIntegerShiftIntrinsics, true, \
"Use RVV instructions for left/right shift of BigInteger")

View File

@ -1361,6 +1361,16 @@ public:
vmfle_vv(vd, vs1, vs2, vm);
}
// Unsigned "less than immediate" mask compare, expressed through vmsleu_vi:
// (x < imm) is emitted as (x <= imm - 1). imm must lie in [1, 16].
inline void vmsltu_vi(VectorRegister Vd, VectorRegister Vs2, uint32_t imm, VectorMask vm = unmasked) {
  guarantee(imm >= 1 && imm <= 16, "imm is invalid");
  // Immediate actually handed to the underlying "less than or equal" form.
  const uint32_t imm_minus_one = imm - 1;
  vmsleu_vi(Vd, Vs2, imm_minus_one, vm);
}
// Unsigned "greater than or equal to immediate" mask compare, expressed through
// vmsgtu_vi: (x >= imm) is emitted as (x > imm - 1). imm must lie in [1, 16].
inline void vmsgeu_vi(VectorRegister Vd, VectorRegister Vs2, uint32_t imm, VectorMask vm = unmasked) {
  guarantee(imm >= 1 && imm <= 16, "imm is invalid");
  // Immediate actually handed to the underlying "greater than" form.
  const uint32_t imm_minus_one = imm - 1;
  vmsgtu_vi(Vd, Vs2, imm_minus_one, vm);
}
// Copy mask register
inline void vmmv_m(VectorRegister vd, VectorRegister vs) {
vmand_mm(vd, vs, vs);
@ -1376,6 +1386,10 @@ public:
vmxnor_mm(vd, vd, vd);
}
// Bitwise NOT of Vs into Vd, emitted as an XOR with the immediate -1.
inline void vnot_v(VectorRegister Vd, VectorRegister Vs, VectorMask vm = unmasked) {
  const int all_ones = -1;
  vxor_vi(Vd, Vs, all_ones, vm);
}
static const int zero_words_block_size;
void cast_primitive_type(BasicType type, Register Rt) {

View File

@ -3659,8 +3659,394 @@ class StubGenerator: public StubCodeGenerator {
return entry;
}
};
#endif // COMPILER2
#undef __
#define __ this->
// Generator for the SHA-256 and SHA-512 implCompress / implCompressMB stubs.
// The stubs are built from the vector SHA-2 instructions (vsha2ms, vsha2ch,
// vsha2cl) plus vrev8 for the byte swap of the message block.
// The generator is itself a MacroAssembler constructed from masm->code(); the
// StubCodeGenerator is only used when creating the StubCodeMark.
class Sha2Generator : public MacroAssembler {
StubCodeGenerator* _cgen;
public:
Sha2Generator(MacroAssembler* masm, StubCodeGenerator* cgen) : MacroAssembler(masm->code()), _cgen(cgen) {}
// SHA-256 stub: the shared generator run with 32-bit elements.
address generate_sha256_implCompress(bool multi_block) {
return generate_sha2_implCompress(Assembler::e32, multi_block);
}
// SHA-512 stub: the shared generator run with 64-bit elements.
address generate_sha512_implCompress(bool multi_block) {
return generate_sha2_implCompress(Assembler::e64, multi_block);
}
private:
// Element-width dispatching load: vle32_v for e32, vle64_v for anything else.
void vleXX_v(Assembler::SEW vset_sew, VectorRegister vr, Register sr) {
if (vset_sew == Assembler::e32) __ vle32_v(vr, sr);
else __ vle64_v(vr, sr);
}
// Element-width dispatching store: vse32_v for e32, vse64_v for anything else.
void vseXX_v(Assembler::SEW vset_sew, VectorRegister vr, Register sr) {
if (vset_sew == Assembler::e32) __ vse32_v(vr, sr);
else __ vse64_v(vr, sr);
}
// Overview of the logic in each "quad round".
//
// The code below repeats 16/20 times the logic implementing four rounds
// of the SHA-256/512 core loop as documented by NIST. 16/20 "quad rounds"
// implement the 64/80 single rounds.
//
// // Load four word (u32/64) constants (K[t+3], K[t+2], K[t+1], K[t+0])
// // Output:
// // vTmp1 = {K[t+3], K[t+2], K[t+1], K[t+0]}
// vl1reXX.v vTmp1, ofs
//
// // Increment word constant address by stride (16/32 bytes, 4*4B/8B, 128b/256b)
// addi ofs, ofs, 16/32
//
// // Add constants to message schedule words:
// // Input
// // vTmp1 = {K[t+3], K[t+2], K[t+1], K[t+0]}
// // vW0 = {W[t+3], W[t+2], W[t+1], W[t+0]}; // Vt0 = W[3:0];
// // Output
// // vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]}
// vadd.vv vTmp0, vTmp1, vW0
//
// // 2 rounds of working variables updates.
// // vState1[t+4] <- vState1[t], vState0[t], vTmp0[t]
// // Input:
// // vState1 = {c[t],d[t],g[t],h[t]} " = vState1[t] "
// // vState0 = {a[t],b[t],e[t],f[t]}
// // vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]}
// // Output:
// // vState1 = {f[t+2],e[t+2],b[t+2],a[t+2]} " = vState0[t+2] "
// // = {h[t+4],g[t+4],d[t+4],c[t+4]} " = vState1[t+4] "
// vsha2cl.vv vState1, vState0, vTmp0
//
// // 2 rounds of working variables updates.
// // vState0[t+4] <- vState0[t], vState0[t+2], vTmp0[t]
// // Input
// // vState0 = {a[t],b[t],e[t],f[t]} " = vState0[t] "
// // = {h[t+2],g[t+2],d[t+2],c[t+2]} " = vState1[t+2] "
// // vState1 = {f[t+2],e[t+2],b[t+2],a[t+2]} " = vState0[t+2] "
// // vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]}
// // Output:
// // vState0 = {f[t+4],e[t+4],b[t+4],a[t+4]} " = vState0[t+4] "
// vsha2ch.vv vState0, vState1, vTmp0
//
// // Combine 2QW into 1QW
// //
// // To generate the next 4 words, "new_vW0"/"vTmp0" from vW0-vW3, vsha2ms needs
// // vW0[0..3], vW1[0], vW2[1..3], vW3[0, 2..3]
// // and it can only take 3 vectors as inputs. Hence we need to combine
// // vW1[0] and vW2[1..3] in a single vector.
// //
// // vmerge Vt4, Vt1, Vt2, V0
// // Input
// // V0 = mask // first word from vW2, 1..3 words from vW1
// // vW2 = {Wt-8, Wt-7, Wt-6, Wt-5}
// // vW1 = {Wt-12, Wt-11, Wt-10, Wt-9}
// // Output
// // Vt4 = {Wt-12, Wt-7, Wt-6, Wt-5}
// vmerge.vvm vTmp0, vW2, vW1, v0
//
// // Generate next Four Message Schedule Words (hence allowing for 4 more rounds)
// // Input
// // vW0 = {W[t+ 3], W[t+ 2], W[t+ 1], W[t+ 0]} W[ 3: 0]
// // vW3 = {W[t+15], W[t+14], W[t+13], W[t+12]} W[15:12]
// // vTmp0 = {W[t+11], W[t+10], W[t+ 9], W[t+ 4]} W[11: 9,4]
// // Output (next four message schedule words)
// // vW0 = {W[t+19], W[t+18], W[t+17], W[t+16]} W[19:16]
// vsha2ms.vv vW0, vTmp0, vW3
//
// BEFORE
// vW0 - vW3 hold the message schedule words (initially the block words)
// vW0 = W[ 3: 0] "oldest"
// vW1 = W[ 7: 4]
// vW2 = W[11: 8]
// vW3 = W[15:12] "newest"
//
// vState0 - vState1 hold the working state variables
// vState0 = {a[t],b[t],e[t],f[t]} // initially {H5,H4,H1,H0}
// vState1 = {c[t],d[t],g[t],h[t]} // initially {H7,H6,H3,H2}
//
// AFTER
// vW0 - vW3 hold the message schedule words (initially the block words)
// vW1 = W[ 7: 4] "oldest"
// vW2 = W[11: 8]
// vW3 = W[15:12]
// vW0 = W[19:16] "newest"
//
// vState0 and vState1 hold the working state variables
// vState0 = {a[t+4],b[t+4],e[t+4],f[t+4]}
// vState1 = {c[t+4],d[t+4],g[t+4],h[t+4]}
//
// The group of vectors vW0,vW1,vW2,vW3 is "rotated" by one in each quad-round,
// hence the uses of those vectors rotate in each round, and we get back to the
// initial configuration every 4 quad-rounds. We could avoid those changes at
// the cost of moving those vectors at the end of each quad-rounds.
//
// Parameters:
//   rot1..rot4   - the four message schedule vectors in their current rotation
//   scalarconst  - address of the round constants for this quad round
//   vtemp        - receives the loaded constants; vtemp2 receives constants + rot1
//   v_abef/v_cdgh - the two working state vectors
//   gen_words    - when false, the vmerge/vsha2ms step is not emitted
//   step_const   - when false, scalarconst is not advanced
void sha2_quad_round(Assembler::SEW vset_sew, VectorRegister rot1, VectorRegister rot2, VectorRegister rot3, VectorRegister rot4,
Register scalarconst, VectorRegister vtemp, VectorRegister vtemp2, VectorRegister v_abef, VectorRegister v_cdgh,
bool gen_words = true, bool step_const = true) {
__ vleXX_v(vset_sew, vtemp, scalarconst);
if (step_const) {
__ addi(scalarconst, scalarconst, vset_sew == Assembler::e32 ? 16 : 32);
}
__ vadd_vv(vtemp2, vtemp, rot1);
__ vsha2cl_vv(v_cdgh, v_abef, vtemp2);
__ vsha2ch_vv(v_abef, v_cdgh, vtemp2);
if (gen_words) {
__ vmerge_vvm(vtemp2, rot3, rot2);
__ vsha2ms_vv(rot1, vtemp2, rot4);
}
}
// Maps (element width, multi_block) to the stub name handed to StubCodeMark.
// Any combination other than e32/e64 hits ShouldNotReachHere().
const char* stub_name(Assembler::SEW vset_sew, bool multi_block) {
if (vset_sew == Assembler::e32 && !multi_block) return "sha256_implCompress";
if (vset_sew == Assembler::e32 && multi_block) return "sha256_implCompressMB";
if (vset_sew == Assembler::e64 && !multi_block) return "sha512_implCompress";
if (vset_sew == Assembler::e64 && multi_block) return "sha512_implCompressMB";
ShouldNotReachHere();
return "bad name lookup";
}
// Emits one SHA-2 compress stub and returns its entry address.
// vset_sew selects SHA-256 (e32) or SHA-512 (e64); multi_block selects the
// looping variant that also returns the updated offset in c_rarg0.
//
// Arguments:
//
// Inputs:
// c_rarg0 - byte[] source+offset
// c_rarg1 - int[] SHA.state
// c_rarg2 - int offset
// c_rarg3 - int limit
//
address generate_sha2_implCompress(Assembler::SEW vset_sew, bool multi_block) {
// Round constants K for SHA-256 (64 x 32-bit words).
alignas(64) static const uint32_t round_consts_256[64] = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
};
// Round constants K for SHA-512 (80 x 64-bit words).
alignas(64) static const uint64_t round_consts_512[80] = {
0x428a2f98d728ae22l, 0x7137449123ef65cdl, 0xb5c0fbcfec4d3b2fl,
0xe9b5dba58189dbbcl, 0x3956c25bf348b538l, 0x59f111f1b605d019l,
0x923f82a4af194f9bl, 0xab1c5ed5da6d8118l, 0xd807aa98a3030242l,
0x12835b0145706fbel, 0x243185be4ee4b28cl, 0x550c7dc3d5ffb4e2l,
0x72be5d74f27b896fl, 0x80deb1fe3b1696b1l, 0x9bdc06a725c71235l,
0xc19bf174cf692694l, 0xe49b69c19ef14ad2l, 0xefbe4786384f25e3l,
0x0fc19dc68b8cd5b5l, 0x240ca1cc77ac9c65l, 0x2de92c6f592b0275l,
0x4a7484aa6ea6e483l, 0x5cb0a9dcbd41fbd4l, 0x76f988da831153b5l,
0x983e5152ee66dfabl, 0xa831c66d2db43210l, 0xb00327c898fb213fl,
0xbf597fc7beef0ee4l, 0xc6e00bf33da88fc2l, 0xd5a79147930aa725l,
0x06ca6351e003826fl, 0x142929670a0e6e70l, 0x27b70a8546d22ffcl,
0x2e1b21385c26c926l, 0x4d2c6dfc5ac42aedl, 0x53380d139d95b3dfl,
0x650a73548baf63del, 0x766a0abb3c77b2a8l, 0x81c2c92e47edaee6l,
0x92722c851482353bl, 0xa2bfe8a14cf10364l, 0xa81a664bbc423001l,
0xc24b8b70d0f89791l, 0xc76c51a30654be30l, 0xd192e819d6ef5218l,
0xd69906245565a910l, 0xf40e35855771202al, 0x106aa07032bbd1b8l,
0x19a4c116b8d2d0c8l, 0x1e376c085141ab53l, 0x2748774cdf8eeb99l,
0x34b0bcb5e19b48a8l, 0x391c0cb3c5c95a63l, 0x4ed8aa4ae3418acbl,
0x5b9cca4f7763e373l, 0x682e6ff3d6b2b8a3l, 0x748f82ee5defb2fcl,
0x78a5636f43172f60l, 0x84c87814a1f0ab72l, 0x8cc702081a6439ecl,
0x90befffa23631e28l, 0xa4506cebde82bde9l, 0xbef9a3f7b2c67915l,
0xc67178f2e372532bl, 0xca273eceea26619cl, 0xd186b8c721c0c207l,
0xeada7dd6cde0eb1el, 0xf57d4f7fee6ed178l, 0x06f067aa72176fbal,
0x0a637dc5a2c898a6l, 0x113f9804bef90dael, 0x1b710b35131c471bl,
0x28db77f523047d84l, 0x32caab7b40c72493l, 0x3c9ebe0a15c9bebcl,
0x431d67c49c100d4cl, 0x4cc5d4becb3e42b6l, 0x597f299cfc657e2al,
0x5fcb6fab3ad6faecl, 0x6c44198c4a475817l
};
// Size in bytes of four elements (4*4 for e32, 4*8 for e64); used as the
// stride through the message block and, halved, as the offset of state word c.
const int const_add = vset_sew == Assembler::e32 ? 16 : 32;
__ align(CodeEntryAlignment);
StubCodeMark mark(_cgen, "StubRoutines", stub_name(vset_sew, multi_block));
address start = __ pc();
Register buf = c_rarg0;
Register state = c_rarg1;
Register ofs = c_rarg2;
Register limit = c_rarg3;
Register consts = t2; // caller saved
Register state_c = x28; // caller saved
VectorRegister vindex = v2;
VectorRegister vW0 = v4;
VectorRegister vW1 = v6;
VectorRegister vW2 = v8;
VectorRegister vW3 = v10;
VectorRegister vState0 = v12;
VectorRegister vState1 = v14;
VectorRegister vHash0 = v16;
VectorRegister vHash1 = v18;
VectorRegister vTmp0 = v20;
VectorRegister vTmp1 = v22;
Label multi_block_loop;
__ enter();
address constant_table = vset_sew == Assembler::e32 ? (address)round_consts_256 : (address)round_consts_512;
la(consts, ExternalAddress(constant_table));
// Register use in this function:
//
// VECTORS
// vW0 - vW3 (512/1024-bits / 4*128/256 bits / 4*4*32/64 bits), hold the message
// schedule words (Wt). They start with the message block
// content (W0 to W15), then further words in the message
// schedule generated via vsha2ms from previous Wt.
// Initially:
// vW0 = W[ 3:0] = { W3, W2, W1, W0}
// vW1 = W[ 7:4] = { W7, W6, W5, W4}
// vW2 = W[ 11:8] = {W11, W10, W9, W8}
// vW3 = W[15:12] = {W15, W14, W13, W12}
//
// vState0 - vState1 hold the working state variables (a, b, ..., h)
// vState0 = {f[t],e[t],b[t],a[t]}
// vState1 = {h[t],g[t],d[t],c[t]}
// Initially:
// vState0 = {H5i-1, H4i-1, H1i-1 , H0i-1}
// vState1 = {H7i-1, H6i-1, H3i-1 , H2i-1}
//
// v0 = masks for vrgather/vmerge. Single value during the 16 rounds.
//
// vTmp0 = temporary, Wt+Kt
// vTmp1 = temporary, Kt
//
// vHash0/vHash1 = hold the initial values of the hash, byte-swapped.
//
// During most of the function the vector state is configured so that each
// vector is interpreted as containing four 32/64 bits (e32/e64) elements (128/256 bits).
// vsha2ch/vsha2cl uses EGW of 4*SEW.
// SHA256 SEW = e32, EGW = 128-bits
// SHA512 SEW = e64, EGW = 256-bits
//
// VLEN is required to be at least 128.
// For the case of VLEN=128 and SHA512 we need LMUL=2 to work with 4*e64 (EGW = 256)
//
// m1: LMUL=1/2
// ta: tail agnostic (don't care about those lanes)
// ma: mask agnostic (don't care about those lanes)
// x0 is not written, we know the number of vector elements.
if (vset_sew == Assembler::e64 && MaxVectorSize == 16) { // SHA512 and VLEN = 128
__ vsetivli(x0, 4, vset_sew, Assembler::m2, Assembler::ma, Assembler::ta);
} else {
__ vsetivli(x0, 4, vset_sew, Assembler::m1, Assembler::ma, Assembler::ta);
}
// Packed byte offsets for the indexed load/store of the state below:
// 0x14,0x10,0x04,0x00 (e32) or 0x28,0x20,0x08,0x00 (e64), i.e. the offsets of
// state words 5,4,1,0 relative to the base address used.
int64_t indexes = vset_sew == Assembler::e32 ? 0x00041014ul : 0x00082028ul;
__ li(t0, indexes);
__ vmv_v_x(vindex, t0);
// Step-over a,b, so we are pointing to c.
// const_add is equal to 4x state variable, div by 2 is thus 2, a,b
__ addi(state_c, state, const_add/2);
// Use index-load to get {f,e,b,a},{h,g,d,c}
__ vluxei8_v(vState0, state, vindex);
__ vluxei8_v(vState1, state_c, vindex);
__ bind(multi_block_loop);
// Capture the initial H values in vHash0 and vHash1 to allow for computing
// the resulting H', since H' = H+{a',b',c',...,h'}.
__ vmv_v_v(vHash0, vState0);
__ vmv_v_v(vHash1, vState1);
// Load the 512/1024-bits of the message block in vW0-vW3 and perform
// an endian swap on each 4/8 bytes element.
//
// If Zvkb is not implemented one can use vrgather
// with an index sequence to byte-swap.
// sequence = [3 2 1 0 7 6 5 4 11 10 9 8 15 14 13 12]
// <https://oeis.org/A004444> gives us "N ^ 3" as a nice formula to generate
// this sequence. 'vid' gives us the N.
__ vleXX_v(vset_sew, vW0, buf);
__ vrev8_v(vW0, vW0);
__ addi(buf, buf, const_add);
__ vleXX_v(vset_sew, vW1, buf);
__ vrev8_v(vW1, vW1);
__ addi(buf, buf, const_add);
__ vleXX_v(vset_sew, vW2, buf);
__ vrev8_v(vW2, vW2);
__ addi(buf, buf, const_add);
__ vleXX_v(vset_sew, vW3, buf);
__ vrev8_v(vW3, vW3);
__ addi(buf, buf, const_add);
// Set v0 up for the vmerge that replaces the first word (idx==0)
__ vid_v(v0);
__ vmseq_vi(v0, v0, 0x0); // v0.mask[i] = (i == 0 ? 1 : 0)
VectorRegister rotation_regs[] = {vW0, vW1, vW2, vW3};
int rot_pos = 0;
// Quad-rounds that also generate new message schedule words:
// SHA-256: #0 (+0, vW0->vW1->vW2->vW3) ... #11 (+3, vW3->vW0->vW1->vW2)
// SHA-512: #0 ... #15
const int qr_end = vset_sew == Assembler::e32 ? 12 : 16;
for (int i = 0; i < qr_end; i++) {
sha2_quad_round(vset_sew,
rotation_regs[(rot_pos + 0) & 0x3],
rotation_regs[(rot_pos + 1) & 0x3],
rotation_regs[(rot_pos + 2) & 0x3],
rotation_regs[(rot_pos + 3) & 0x3],
consts,
vTmp1, vTmp0, vState0, vState1);
++rot_pos;
}
// Final four quad-rounds:
// SHA-256: #12 (+0, vW0->vW1->vW2->vW3) ... #15 (+3, vW3->vW0->vW1->vW2)
// SHA-512: #16 ... #19
// Note that we stop generating new message schedule words (Wt, vW0-13)
// as we already generated all the words we end up consuming (i.e., W[63:60]
// for SHA-256). The very last quad-round also does not advance consts.
const int qr_c_end = qr_end + 4;
for (int i = qr_end; i < qr_c_end; i++) {
sha2_quad_round(vset_sew,
rotation_regs[(rot_pos + 0) & 0x3],
rotation_regs[(rot_pos + 1) & 0x3],
rotation_regs[(rot_pos + 2) & 0x3],
rotation_regs[(rot_pos + 3) & 0x3],
consts,
vTmp1, vTmp0, vState0, vState1, false, i < (qr_c_end-1));
++rot_pos;
}
//--------------------------------------------------------------------------------
// Compute the updated hash value H'
// H' = H + {h',g',...,b',a'}
// = {h,g,...,b,a} + {h',g',...,b',a'}
// = {h+h',g+g',...,b+b',a+a'}
// H' = H+{a',b',c',...,h'}
__ vadd_vv(vState0, vHash0, vState0);
__ vadd_vv(vState1, vHash1, vState1);
if (multi_block) {
// Rewind consts to the start of the table: it was advanced in every
// quad-round except the last, i.e. 15*16 = 240 bytes (SHA-256) or
// 19*32 = 608 bytes (SHA-512).
int total_adds = vset_sew == Assembler::e32 ? 240 : 608;
__ addi(consts, consts, -total_adds);
// Advance the offset by one message block (64 or 128 bytes) and loop
// while it has not passed limit.
__ add(ofs, ofs, vset_sew == Assembler::e32 ? 64 : 128);
__ ble(ofs, limit, multi_block_loop);
__ mv(c_rarg0, ofs); // return ofs
}
// Store H[0..7] = {a,b,c,d,e,f,g,h} from
// vState0 = {f,e,b,a}
// vState1 = {h,g,d,c}
__ vsuxei8_v(vState0, state, vindex);
__ vsuxei8_v(vState1, state_c, vindex);
__ leave();
__ ret();
return start;
}
};
#undef __
#define __ masm->
// Continuation point for throwing of implicit exceptions that are
// not handled in the current activation. Fabricates an exception
// oop and initiates normal exception dispatching in this
@ -4862,6 +5248,18 @@ static const int64_t right_3_bits = right_n_bits(3);
}
#endif // COMPILER2
if (UseSHA256Intrinsics) {
Sha2Generator sha2(_masm, this);
StubRoutines::_sha256_implCompress = sha2.generate_sha256_implCompress(false);
StubRoutines::_sha256_implCompressMB = sha2.generate_sha256_implCompress(true);
}
if (UseSHA512Intrinsics) {
Sha2Generator sha2(_masm, this);
StubRoutines::_sha512_implCompress = sha2.generate_sha512_implCompress(false);
StubRoutines::_sha512_implCompressMB = sha2.generate_sha512_implCompress(true);
}
generate_compare_long_strings();
generate_string_indexof_stubs();

View File

@ -146,26 +146,11 @@ void VM_Version::initialize() {
FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false);
}
if (UseSHA) {
warning("SHA instructions are not available on this CPU");
FLAG_SET_DEFAULT(UseSHA, false);
}
if (UseSHA1Intrinsics) {
warning("Intrinsics for SHA-1 crypto hash functions not available on this CPU.");
FLAG_SET_DEFAULT(UseSHA1Intrinsics, false);
}
if (UseSHA256Intrinsics) {
warning("Intrinsics for SHA-224 and SHA-256 crypto hash functions not available on this CPU.");
FLAG_SET_DEFAULT(UseSHA256Intrinsics, false);
}
if (UseSHA512Intrinsics) {
warning("Intrinsics for SHA-384 and SHA-512 crypto hash functions not available on this CPU.");
FLAG_SET_DEFAULT(UseSHA512Intrinsics, false);
}
if (UseSHA3Intrinsics) {
warning("Intrinsics for SHA3-224, SHA3-256, SHA3-384 and SHA3-512 crypto hash functions not available on this CPU.");
FLAG_SET_DEFAULT(UseSHA3Intrinsics, false);
@ -272,6 +257,10 @@ void VM_Version::initialize() {
// NOTE: Make sure codes dependent on UseRVV are put after c2_initialize(),
// as there are extra checks inside it which could disable UseRVV
// in some situations.
if (UseZvkn && !UseRVV) {
FLAG_SET_DEFAULT(UseZvkn, false);
warning("Cannot enable Zvkn on cpu without RVV support.");
}
if (UseRVV) {
if (FLAG_IS_DEFAULT(UseChaCha20Intrinsics)) {
@ -283,6 +272,31 @@ void VM_Version::initialize() {
}
FLAG_SET_DEFAULT(UseChaCha20Intrinsics, false);
}
if (!UseZvkn && UseSHA) {
warning("SHA instructions are not available on this CPU");
FLAG_SET_DEFAULT(UseSHA, false);
} else if (UseZvkn && FLAG_IS_DEFAULT(UseSHA)) {
FLAG_SET_DEFAULT(UseSHA, true);
}
if (!UseSHA) {
if (UseSHA256Intrinsics) {
warning("Intrinsics for SHA-224 and SHA-256 crypto hash functions not available on this CPU, UseZvkn needed.");
FLAG_SET_DEFAULT(UseSHA256Intrinsics, false);
}
if (UseSHA512Intrinsics) {
warning("Intrinsics for SHA-384 and SHA-512 crypto hash functions not available on this CPU, UseZvkn needed.");
FLAG_SET_DEFAULT(UseSHA512Intrinsics, false);
}
} else {
if (FLAG_IS_DEFAULT(UseSHA256Intrinsics)) {
FLAG_SET_DEFAULT(UseSHA256Intrinsics, true);
}
if (FLAG_IS_DEFAULT(UseSHA512Intrinsics)) {
FLAG_SET_DEFAULT(UseSHA512Intrinsics, true);
}
}
}
#ifdef COMPILER2