diff --git a/src/hotspot/cpu/riscv/assembler_riscv.hpp b/src/hotspot/cpu/riscv/assembler_riscv.hpp index 22554972583..ffe6dcf07ec 100644 --- a/src/hotspot/cpu/riscv/assembler_riscv.hpp +++ b/src/hotspot/cpu/riscv/assembler_riscv.hpp @@ -1334,6 +1334,7 @@ enum VectorMask { INSN(vsll_vi, 0b1010111, 0b011, 0b100101); // Vector Slide Instructions + INSN(vslideup_vi, 0b1010111, 0b011, 0b001110); INSN(vslidedown_vi, 0b1010111, 0b011, 0b001111); #undef INSN @@ -1689,7 +1690,6 @@ enum VectorMask { INSN(vmv_v_x, 0b1010111, 0b100, v0, 0b1, 0b010111); #undef INSN -#undef patch_VArith #define INSN(NAME, op, funct13, funct6) \ void NAME(VectorRegister Vd, VectorMask vm = unmasked) { \ @@ -1731,14 +1731,29 @@ enum Nf { patch_reg((address)&insn, 15, Rs1); \ emit(insn) -#define INSN(NAME, op, lumop, vm, mop, nf) \ - void NAME(VectorRegister Vd, Register Rs1, uint32_t width = 0, bool mew = false) { \ +#define INSN(NAME, op, width, lumop, vm, mop, mew, nf) \ + void NAME(VectorRegister Vd, Register Rs1) { \ guarantee(is_uimm3(width), "width is invalid"); \ patch_VLdSt(op, Vd, width, Rs1, lumop, vm, mop, mew, nf); \ } // Vector Load/Store Instructions - INSN(vl1re8_v, 0b0000111, 0b01000, 0b1, 0b00, g1); + INSN(vl1re8_v, 0b0000111, 0b000, 0b01000, 0b1, 0b00, 0b0, g1); + INSN(vl1re16_v, 0b0000111, 0b101, 0b01000, 0b1, 0b00, 0b0, g1); + INSN(vl1re32_v, 0b0000111, 0b110, 0b01000, 0b1, 0b00, 0b0, g1); + INSN(vl1re64_v, 0b0000111, 0b111, 0b01000, 0b1, 0b00, 0b0, g1); + INSN(vl2re8_v, 0b0000111, 0b000, 0b01000, 0b1, 0b00, 0b0, g2); + INSN(vl2re16_v, 0b0000111, 0b101, 0b01000, 0b1, 0b00, 0b0, g2); + INSN(vl2re32_v, 0b0000111, 0b110, 0b01000, 0b1, 0b00, 0b0, g2); + INSN(vl2re64_v, 0b0000111, 0b111, 0b01000, 0b1, 0b00, 0b0, g2); + INSN(vl4re8_v, 0b0000111, 0b000, 0b01000, 0b1, 0b00, 0b0, g4); + INSN(vl4re16_v, 0b0000111, 0b101, 0b01000, 0b1, 0b00, 0b0, g4); + INSN(vl4re32_v, 0b0000111, 0b110, 0b01000, 0b1, 0b00, 0b0, g4); + INSN(vl4re64_v, 0b0000111, 0b111, 0b01000, 0b1, 0b00, 0b0, g4); +
INSN(vl8re8_v, 0b0000111, 0b000, 0b01000, 0b1, 0b00, 0b0, g8); + INSN(vl8re16_v, 0b0000111, 0b101, 0b01000, 0b1, 0b00, 0b0, g8); + INSN(vl8re32_v, 0b0000111, 0b110, 0b01000, 0b1, 0b00, 0b0, g8); + INSN(vl8re64_v, 0b0000111, 0b111, 0b01000, 0b1, 0b00, 0b0, g8); #undef INSN @@ -1749,6 +1764,9 @@ enum Nf { // Vector Load/Store Instructions INSN(vs1r_v, 0b0100111, 0b000, 0b01000, 0b1, 0b00, 0b0, g1); + INSN(vs2r_v, 0b0100111, 0b000, 0b01000, 0b1, 0b00, 0b0, g2); + INSN(vs4r_v, 0b0100111, 0b000, 0b01000, 0b1, 0b00, 0b0, g4); + INSN(vs8r_v, 0b0100111, 0b000, 0b01000, 0b1, 0b00, 0b0, g8); #undef INSN @@ -1794,9 +1812,11 @@ enum Nf { } // Vector unordered indexed load instructions + INSN( vluxei8_v, 0b0000111, 0b000, 0b01, 0b0); INSN(vluxei32_v, 0b0000111, 0b110, 0b01, 0b0); // Vector unordered indexed store instructions + INSN( vsuxei8_v, 0b0100111, 0b000, 0b01, 0b0); INSN(vsuxei32_v, 0b0100111, 0b110, 0b01, 0b0); #undef INSN @@ -1820,6 +1840,55 @@ enum Nf { #undef INSN #undef patch_VLdSt +// ==================================== +// RISC-V Vector Crypto Extension +// ==================================== + +#define INSN(NAME, op, funct3, funct6) \ + void NAME(VectorRegister Vd, VectorRegister Vs2, VectorRegister Vs1, VectorMask vm = unmasked) { \ + patch_VArith(op, Vd, funct3, Vs1->raw_encoding(), Vs2, vm, funct6); \ + } + + // Vector Bit-manipulation used in Cryptography (Zvkb) Extension // NOTE(review): every name below, including the _vx and _vi ones, is generated by the INSN macro above and therefore takes a VectorRegister Vs1 operand - confirm that the scalar/immediate forms are not meant to take a Register/immediate instead; vclmul/vclmulh also look like Zvbc rather than Zvkb instructions - confirm against the vector crypto spec + INSN(vandn_vv, 0b1010111, 0b000, 0b000001); + INSN(vandn_vx, 0b1010111, 0b100, 0b000001); + INSN(vandn_vi, 0b1010111, 0b011, 0b000001); + INSN(vclmul_vv, 0b1010111, 0b010, 0b001100); + INSN(vclmul_vx, 0b1010111, 0b110, 0b001100); + INSN(vclmulh_vv, 0b1010111, 0b010, 0b001101); + INSN(vclmulh_vx, 0b1010111, 0b110, 0b001101); + INSN(vror_vv, 0b1010111, 0b000, 0b010100); + INSN(vror_vx, 0b1010111, 0b100, 0b010100); + INSN(vrol_vv, 0b1010111, 0b000, 0b010101); + INSN(vrol_vx, 0b1010111, 0b100, 0b010101); + +#undef INSN + +#define INSN(NAME, op, funct3, Vs1, funct6) \ + void
NAME(VectorRegister Vd, VectorRegister Vs2, VectorMask vm = unmasked) { \ + patch_VArith(op, Vd, funct3, Vs1, Vs2, vm, funct6); \ + } + + // Vector Bit-manipulation used in Cryptography (Zvkb) Extension // the fourth INSN argument is a fixed value placed in the Vs1 slot of patch_VArith, not a register operand + INSN(vbrev8_v, 0b1010111, 0b010, 0b01000, 0b010010); + INSN(vrev8_v, 0b1010111, 0b010, 0b01001, 0b010010); + +#undef INSN + +#define INSN(NAME, op, funct3, vm, funct6) \ + void NAME(VectorRegister Vd, VectorRegister Vs2, VectorRegister Vs1) { \ + patch_VArith(op, Vd, funct3, Vs1->raw_encoding(), Vs2, vm, funct6); \ + } + + // Vector SHA-2 Secure Hash (Zvknh[ab]) Extension // vm is a fixed macro argument (0b1) here, so these methods expose no VectorMask parameter + INSN(vsha2ms_vv, 0b1110111, 0b010, 0b1, 0b101101); + INSN(vsha2ch_vv, 0b1110111, 0b010, 0b1, 0b101110); + INSN(vsha2cl_vv, 0b1110111, 0b010, 0b1, 0b101111); + +#undef INSN + +#undef patch_VArith + // ==================================== // RISC-V Bit-Manipulation Extension // Currently only support Zba, Zbb and Zbs bitmanip extensions. diff --git a/src/hotspot/cpu/riscv/globals_riscv.hpp b/src/hotspot/cpu/riscv/globals_riscv.hpp index 60456c37ffe..aa95cebec14 100644 --- a/src/hotspot/cpu/riscv/globals_riscv.hpp +++ b/src/hotspot/cpu/riscv/globals_riscv.hpp @@ -113,6 +113,8 @@ define_pd_global(intx, InlineSmallCode, 1000); product(bool, UseZtso, false, EXPERIMENTAL, "Assume Ztso memory model") \ product(bool, UseZihintpause, false, EXPERIMENTAL, \ "Use Zihintpause instructions") \ + product(bool, UseZvkn, false, EXPERIMENTAL, \ + "Use Zvkn group extension, Zvkned, Zvknhb, Zvkb, Zvkt") \ product(bool, UseRVVForBigIntegerShiftIntrinsics, true, \ "Use RVV instructions for left/right shift of BigInteger") diff --git a/src/hotspot/cpu/riscv/macroAssembler_riscv.hpp b/src/hotspot/cpu/riscv/macroAssembler_riscv.hpp index 3e020674624..4724dd85e78 100644 --- a/src/hotspot/cpu/riscv/macroAssembler_riscv.hpp +++ b/src/hotspot/cpu/riscv/macroAssembler_riscv.hpp @@ -1361,6 +1361,16 @@ public: vmfle_vv(vd, vs1, vs2, vm); } + inline void vmsltu_vi(VectorRegister Vd, VectorRegister Vs2, uint32_t imm, VectorMask
vm = unmasked) { + guarantee(imm >= 1 && imm <= 16, "imm is invalid"); + vmsleu_vi(Vd, Vs2, imm-1, vm); // emitted as vmsleu_vi with imm-1; the guarantee above keeps imm-1 within 0..15 + } + + inline void vmsgeu_vi(VectorRegister Vd, VectorRegister Vs2, uint32_t imm, VectorMask vm = unmasked) { + guarantee(imm >= 1 && imm <= 16, "imm is invalid"); + vmsgtu_vi(Vd, Vs2, imm-1, vm); // emitted as vmsgtu_vi with imm-1; the guarantee above keeps imm-1 within 0..15 + } + // Copy mask register inline void vmmv_m(VectorRegister vd, VectorRegister vs) { vmand_mm(vd, vs, vs); @@ -1376,6 +1386,10 @@ public: vmxnor_mm(vd, vd, vd); } + inline void vnot_v(VectorRegister Vd, VectorRegister Vs, VectorMask vm = unmasked) { + vxor_vi(Vd, Vs, -1, vm); // bitwise NOT expressed as XOR with the immediate -1 (all ones) + } + static const int zero_words_block_size; void cast_primitive_type(BasicType type, Register Rt) { diff --git a/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp b/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp index be5fd983353..4bd33d08f89 100644 --- a/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp +++ b/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp @@ -3659,8 +3659,394 @@ class StubGenerator: public StubCodeGenerator { return entry; } }; + #endif // COMPILER2 +#undef __ +#define __ this-> + class Sha2Generator : public MacroAssembler { // emits through itself: __ is defined as this-> for the duration of this class + StubCodeGenerator* _cgen; + public: + Sha2Generator(MacroAssembler* masm, StubCodeGenerator* cgen) : MacroAssembler(masm->code()), _cgen(cgen) {} + address generate_sha256_implCompress(bool multi_block) { + return generate_sha2_implCompress(Assembler::e32, multi_block); + } + address generate_sha512_implCompress(bool multi_block) { + return generate_sha2_implCompress(Assembler::e64, multi_block); + } + private: + + void vleXX_v(Assembler::SEW vset_sew, VectorRegister vr, Register sr) { // element load whose width follows vset_sew: vle32_v for e32, vle64_v otherwise + if (vset_sew == Assembler::e32) __ vle32_v(vr, sr); + else __ vle64_v(vr, sr); + } + + void vseXX_v(Assembler::SEW vset_sew, VectorRegister vr, Register sr) { // NOTE(review): no call to vseXX_v is visible in this patch - confirm it is needed + if (vset_sew == Assembler::e32) __ vse32_v(vr, sr); + else __ vse64_v(vr, sr); + } + + // Overview of the logic in each "quad round".
+ // // The code below repeats 16/20 times the logic implementing four rounds + // of the SHA-256/512 core loop as documented by NIST. 16/20 "quad rounds" + // are needed to implement the 64/80 single rounds. + // + // // Load four word (u32/64) constants (K[t+3], K[t+2], K[t+1], K[t+0]) + // // Output: + // // vTmp1 = {K[t+3], K[t+2], K[t+1], K[t+0]} + // vleXX.v vTmp1, (consts) + // + // // Increment word constant address by stride (16/32 bytes, 4*4B/8B, 128b/256b) + // addi consts, consts, 16/32 + // + // // Add constants to message schedule words: + // // Input + // // vTmp1 = {K[t+3], K[t+2], K[t+1], K[t+0]} + // // vW0 = {W[t+3], W[t+2], W[t+1], W[t+0]}; // Vt0 = W[3:0]; + // // Output + // // vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]} + // vadd.vv vTmp0, vTmp1, vW0 + // + // // 2 rounds of working variables updates. + // // vState1[t+4] <- vState1[t], vState0[t], vTmp0[t] + // // Input: + // // vState1 = {c[t],d[t],g[t],h[t]} " = vState1[t] " + // // vState0 = {a[t],b[t],e[t],f[t]} + // // vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]} + // // Output: + // // vState1 = {f[t+2],e[t+2],b[t+2],a[t+2]} " = vState0[t+2] " + // // = {h[t+4],g[t+4],d[t+4],c[t+4]} " = vState1[t+4] " + // vsha2cl.vv vState1, vState0, vTmp0 + // + // // 2 rounds of working variables updates.
+ // // vState0[t+4] <- vState0[t], vState0[t+2], vTmp0[t] + // // Input + // // vState0 = {a[t],b[t],e[t],f[t]} " = vState0[t] " + // // = {h[t+2],g[t+2],d[t+2],c[t+2]} " = vState1[t+2] " + // // vState1 = {f[t+2],e[t+2],b[t+2],a[t+2]} " = vState0[t+2] " + // // vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]} + // // Output: + // // vState0 = {f[t+4],e[t+4],b[t+4],a[t+4]} " = vState0[t+4] " + // vsha2ch.vv vState0, vState1, vTmp0 + // + // // Combine 2QW into 1QW + // // + // // To generate the next 4 words, "new_vW0"/"vTmp0" from vW0-vW3, vsha2ms needs + // // vW0[0..3], vW1[0], vW2[1..3], vW3[0, 2..3] + // // and it can only take 3 vectors as inputs. Hence we need to combine + // // vW1[0] and vW2[1..3] in a single vector. + // // + // // vmerge vTmp0, vW2, vW1, v0 + // // Input + // // v0 = mask // first word from vW1, words 1..3 from vW2 + // // vW2 = {Wt-8, Wt-7, Wt-6, Wt-5} + // // vW1 = {Wt-12, Wt-11, Wt-10, Wt-9} + // // Output + // // vTmp0 = {Wt-12, Wt-7, Wt-6, Wt-5} + // vmerge.vvm vTmp0, vW2, vW1, v0 + // + // // Generate next Four Message Schedule Words (hence allowing for 4 more rounds) + // // Input + // // vW0 = {W[t+ 3], W[t+ 2], W[t+ 1], W[t+ 0]} W[ 3: 0] + // // vW3 = {W[t+15], W[t+14], W[t+13], W[t+12]} W[15:12] + // // vTmp0 = {W[t+11], W[t+10], W[t+ 9], W[t+ 4]} W[11: 9,4] + // // Output (next four message schedule words) + // // vW0 = {W[t+19], W[t+18], W[t+17], W[t+16]} W[19:16] + // vsha2ms.vv vW0, vTmp0, vW3 + // + // BEFORE + // vW0 - vW3 hold the message schedule words (initially the block words) + // vW0 = W[ 3: 0] "oldest" + // vW1 = W[ 7: 4] + // vW2 = W[11: 8] + // vW3 = W[15:12] "newest" + // + // vState0 - vState1 hold the working state variables + // vState0 = {a[t],b[t],e[t],f[t]} // initially {H5,H4,H1,H0} + // vState1 = {c[t],d[t],g[t],h[t]} // initially {H7,H6,H3,H2} + // + // AFTER + // vW0 - vW3 hold the message schedule words (initially the block words) + // vW1 = W[ 7: 4] "oldest" + // vW2 = W[11: 8] + // vW3 =
W[15:12] + // vW0 = W[19:16] "newest" + // + // vState0 and vState1 hold the working state variables + // vState0 = {a[t+4],b[t+4],e[t+4],f[t+4]} + // vState1 = {c[t+4],d[t+4],g[t+4],h[t+4]} + // + // The group of vectors vW0,vW1,vW2,vW3 is "rotated" by one in each quad-round, + // hence the uses of those vectors rotate in each round, and we get back to the + // initial configuration every 4 quad-rounds. We could avoid those changes at + // the cost of moving those vectors at the end of each quad-round. + void sha2_quad_round(Assembler::SEW vset_sew, VectorRegister rot1, VectorRegister rot2, VectorRegister rot3, VectorRegister rot4, + Register scalarconst, VectorRegister vtemp, VectorRegister vtemp2, VectorRegister v_abef, VectorRegister v_cdgh, + bool gen_words = true, bool step_const = true) { + __ vleXX_v(vset_sew, vtemp, scalarconst); // vtemp = the four round constants scalarconst currently points at + if (step_const) { + __ addi(scalarconst, scalarconst, vset_sew == Assembler::e32 ? 16 : 32); + } + __ vadd_vv(vtemp2, vtemp, rot1); + __ vsha2cl_vv(v_cdgh, v_abef, vtemp2); + __ vsha2ch_vv(v_abef, v_cdgh, vtemp2); + if (gen_words) { + __ vmerge_vvm(vtemp2, rot3, rot2); // relies on the v0 mask prepared by the caller + __ vsha2ms_vv(rot1, vtemp2, rot4); + } + } + + const char* stub_name(Assembler::SEW vset_sew, bool multi_block) { + if (vset_sew == Assembler::e32 && !multi_block) return "sha256_implCompress"; + if (vset_sew == Assembler::e32 && multi_block) return "sha256_implCompressMB"; + if (vset_sew == Assembler::e64 && !multi_block) return "sha512_implCompress"; + if (vset_sew == Assembler::e64 && multi_block) return "sha512_implCompressMB"; + ShouldNotReachHere(); + return "bad name lookup"; + } + + // Arguments: + // + // Inputs: + // c_rarg0 - byte[] source+offset + // c_rarg1 - int[] SHA.state + // c_rarg2 - int offset + // c_rarg3 - int limit (offset and limit are only read when multi_block is true) + // + address generate_sha2_implCompress(Assembler::SEW vset_sew, bool multi_block) { + alignas(64) static const uint32_t round_consts_256[64] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, + 0x3956c25b, 0x59f111f1,
0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, + 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, + 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, + 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, + 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, + 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, + 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, + 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, + }; + alignas(64) static const uint64_t round_consts_512[80] = { + 0x428a2f98d728ae22ull, 0x7137449123ef65cdull, 0xb5c0fbcfec4d3b2full, + 0xe9b5dba58189dbbcull, 0x3956c25bf348b538ull, 0x59f111f1b605d019ull, + 0x923f82a4af194f9bull, 0xab1c5ed5da6d8118ull, 0xd807aa98a3030242ull, + 0x12835b0145706fbeull, 0x243185be4ee4b28cull, 0x550c7dc3d5ffb4e2ull, + 0x72be5d74f27b896full, 0x80deb1fe3b1696b1ull, 0x9bdc06a725c71235ull, + 0xc19bf174cf692694ull, 0xe49b69c19ef14ad2ull, 0xefbe4786384f25e3ull, + 0x0fc19dc68b8cd5b5ull, 0x240ca1cc77ac9c65ull, 0x2de92c6f592b0275ull, + 0x4a7484aa6ea6e483ull, 0x5cb0a9dcbd41fbd4ull, 0x76f988da831153b5ull, + 0x983e5152ee66dfabull, 0xa831c66d2db43210ull, 0xb00327c898fb213full, + 0xbf597fc7beef0ee4ull, 0xc6e00bf33da88fc2ull, 0xd5a79147930aa725ull, + 0x06ca6351e003826full, 0x142929670a0e6e70ull, 0x27b70a8546d22ffcull, + 0x2e1b21385c26c926ull, 0x4d2c6dfc5ac42aedull, 0x53380d139d95b3dfull, + 0x650a73548baf63deull, 0x766a0abb3c77b2a8ull, 0x81c2c92e47edaee6ull, + 0x92722c851482353bull, 0xa2bfe8a14cf10364ull, 0xa81a664bbc423001ull, + 0xc24b8b70d0f89791ull, 0xc76c51a30654be30ull, 0xd192e819d6ef5218ull, + 0xd69906245565a910ull, 0xf40e35855771202aull, 0x106aa07032bbd1b8ull, + 0x19a4c116b8d2d0c8ull, 0x1e376c085141ab53ull, 0x2748774cdf8eeb99ull, + 0x34b0bcb5e19b48a8ull, 0x391c0cb3c5c95a63ull, 0x4ed8aa4ae3418acbull, + 0x5b9cca4f7763e373ull,
0x682e6ff3d6b2b8a3ull, 0x748f82ee5defb2fcull, + 0x78a5636f43172f60ull, 0x84c87814a1f0ab72ull, 0x8cc702081a6439ecull, + 0x90befffa23631e28ull, 0xa4506cebde82bde9ull, 0xbef9a3f7b2c67915ull, + 0xc67178f2e372532bull, 0xca273eceea26619cull, 0xd186b8c721c0c207ull, + 0xeada7dd6cde0eb1eull, 0xf57d4f7fee6ed178ull, 0x06f067aa72176fbaull, + 0x0a637dc5a2c898a6ull, 0x113f9804bef90daeull, 0x1b710b35131c471bull, + 0x28db77f523047d84ull, 0x32caab7b40c72493ull, 0x3c9ebe0a15c9bebcull, + 0x431d67c49c100d4cull, 0x4cc5d4becb3e42b6ull, 0x597f299cfc657e2aull, + 0x5fcb6fab3ad6faecull, 0x6c44198c4a475817ull + }; + const int const_add = vset_sew == Assembler::e32 ? 16 : 32; // bytes in one group of four SEW-wide elements + + __ align(CodeEntryAlignment); + StubCodeMark mark(_cgen, "StubRoutines", stub_name(vset_sew, multi_block)); + address start = __ pc(); + + Register buf = c_rarg0; + Register state = c_rarg1; + Register ofs = c_rarg2; + Register limit = c_rarg3; + Register consts = t2; // caller saved + Register state_c = x28; // caller saved + VectorRegister vindex = v2; + VectorRegister vW0 = v4; + VectorRegister vW1 = v6; + VectorRegister vW2 = v8; + VectorRegister vW3 = v10; + VectorRegister vState0 = v12; + VectorRegister vState1 = v14; + VectorRegister vHash0 = v16; + VectorRegister vHash1 = v18; + VectorRegister vTmp0 = v20; + VectorRegister vTmp1 = v22; + + Label multi_block_loop; + + __ enter(); + + address constant_table = vset_sew == Assembler::e32 ? (address)round_consts_256 : (address)round_consts_512; + __ la(consts, ExternalAddress(constant_table)); + + // Register use in this function: + // + // VECTORS + // vW0 - vW3 (512/1024-bits / 4*128/256 bits / 4*4*32/64 bits), hold the message + // schedule words (Wt). They start with the message block + // content (W0 to W15), then further words in the message + // schedule generated via vsha2ms from previous Wt.
+ // Initially: + // vW0 = W[ 3:0] = { W3, W2, W1, W0} + // vW1 = W[ 7:4] = { W7, W6, W5, W4} + // vW2 = W[ 11:8] = {W11, W10, W9, W8} + // vW3 = W[15:12] = {W15, W14, W13, W12} + // + // vState0 - vState1 hold the working state variables (a, b, ..., h) + // vState0 = {f[t],e[t],b[t],a[t]} + // vState1 = {h[t],g[t],d[t],c[t]} + // Initially: + // vState0 = {H5i-1, H4i-1, H1i-1 , H0i-1} + // vState1 = {H7i-1, H6i-1, H3i-1 , H2i-1} + // + // v0 = mask for vmerge. Single value during the 16/20 quad-rounds. + // + // vTmp0 = temporary, Wt+Kt + // vTmp1 = temporary, Kt + // + // vHash0/vHash1 = hold the initial values of the hash (a copy of vState0/vState1). + // + // During most of the function the vector state is configured so that each + // vector is interpreted as containing four 32/64 bits (e32/e64) elements (128/256 bits). + + // vsha2ch/vsha2cl uses EGW of 4*SEW. + // SHA256 SEW = e32, EGW = 128-bits + // SHA512 SEW = e64, EGW = 256-bits + // + // VLEN is required to be at least 128. + // For the case of VLEN=128 and SHA512 we need LMUL=2 to work with 4*e64 (EGW = 256) + // + // m1/m2: LMUL=1/2 + // ta: tail agnostic (don't care about those lanes) + // ma: mask agnostic (don't care about those lanes) + // x0 is not written, we know the number of vector elements. + + if (vset_sew == Assembler::e64 && MaxVectorSize == 16) { // SHA512 and VLEN = 128 + __ vsetivli(x0, 4, vset_sew, Assembler::m2, Assembler::ma, Assembler::ta); + } else { + __ vsetivli(x0, 4, vset_sew, Assembler::m1, Assembler::ma, Assembler::ta); + } + + int64_t indexes = vset_sew == Assembler::e32 ? 0x00041014ul : 0x00082028ul; // one byte offset per element, lowest byte first: 20,16,4,0 (e32) or 40,32,8,0 (e64) + __ li(t0, indexes); + __ vmv_v_x(vindex, t0); + + // Step-over a,b, so we are pointing to c.
+ // const_add is the size of four state variables, so const_add/2 skips two of them (a and b) + __ addi(state_c, state, const_add/2); + + // Use index-load to get {f,e,b,a},{h,g,d,c} + __ vluxei8_v(vState0, state, vindex); + __ vluxei8_v(vState1, state_c, vindex); + + __ bind(multi_block_loop); + + // Capture the initial H values in vHash0 and vHash1 to allow for computing + // the resulting H', since H' = H+{a',b',c',...,h'}. + __ vmv_v_v(vHash0, vState0); + __ vmv_v_v(vHash1, vState1); + + // Load the 512/1024-bits of the message block in vW0-vW3 and perform + // an endian swap on each 4/8 bytes element. + // + // If Zvkb is not implemented one can use vrgather + // with an index sequence to byte-swap. + // sequence = [3 2 1 0 7 6 5 4 11 10 9 8 15 14 13 12] + // gives us "N ^ 3" as a nice formula to generate + // this sequence. 'vid' gives us the N. + __ vleXX_v(vset_sew, vW0, buf); + __ vrev8_v(vW0, vW0); + __ addi(buf, buf, const_add); + __ vleXX_v(vset_sew, vW1, buf); + __ vrev8_v(vW1, vW1); + __ addi(buf, buf, const_add); + __ vleXX_v(vset_sew, vW2, buf); + __ vrev8_v(vW2, vW2); + __ addi(buf, buf, const_add); + __ vleXX_v(vset_sew, vW3, buf); + __ vrev8_v(vW3, vW3); + __ addi(buf, buf, const_add); + + // Set v0 up for the vmerge that replaces the first word (idx==0) + __ vid_v(v0); + __ vmseq_vi(v0, v0, 0x0); // v0.mask[i] = (i == 0 ? 1 : 0) + + VectorRegister rotation_regs[] = {vW0, vW1, vW2, vW3}; + int rot_pos = 0; + // Quad-round #0 (+0, vW0->vW1->vW2->vW3) ... #11/#15 (+3, vW3->vW0->vW1->vW2) + const int qr_end = vset_sew == Assembler::e32 ? 12 : 16; + for (int i = 0; i < qr_end; i++) { + sha2_quad_round(vset_sew, + rotation_regs[(rot_pos + 0) & 0x3], + rotation_regs[(rot_pos + 1) & 0x3], + rotation_regs[(rot_pos + 2) & 0x3], + rotation_regs[(rot_pos + 3) & 0x3], + consts, + vTmp1, vTmp0, vState0, vState1); + ++rot_pos; + } + // Quad-round #12/#16 (+0, vW0->vW1->vW2->vW3) ...
#15/#19 (+3, vW3->vW0->vW1->vW2) + // Note that we stop generating new message schedule words (Wt, vW0-vW3) + // as we already generated all the words we end up consuming (i.e., W[63:60] / W[79:76]). + const int qr_c_end = qr_end + 4; + for (int i = qr_end; i < qr_c_end; i++) { + sha2_quad_round(vset_sew, + rotation_regs[(rot_pos + 0) & 0x3], + rotation_regs[(rot_pos + 1) & 0x3], + rotation_regs[(rot_pos + 2) & 0x3], + rotation_regs[(rot_pos + 3) & 0x3], + consts, + vTmp1, vTmp0, vState0, vState1, false, i < (qr_c_end-1)); + ++rot_pos; + } + + //-------------------------------------------------------------------------------- + // Compute the updated hash value H' + // H' = H + {h',g',...,b',a'} + // = {h,g,...,b,a} + {h',g',...,b',a'} + // = {h+h',g+g',...,b+b',a+a'} + + // H' = H+{a',b',c',...,h'} + __ vadd_vv(vState0, vHash0, vState0); + __ vadd_vv(vState1, vHash1, vState1); + + if (multi_block) { + int total_adds = const_add * (qr_c_end - 1); // every quad-round except the last one advanced consts by const_add: 240 (e32) / 608 (e64) bytes + __ addi(consts, consts, -total_adds); + __ add(ofs, ofs, 4 * const_add); // the four block loads above consumed 64 (e32) / 128 (e64) bytes + __ ble(ofs, limit, multi_block_loop); + __ mv(c_rarg0, ofs); // return ofs + } + + // Store H[0..7] = {a,b,c,d,e,f,g,h} from + // vState0 = {f,e,b,a} + // vState1 = {h,g,d,c} + __ vsuxei8_v(vState0, state, vindex); + __ vsuxei8_v(vState1, state_c, vindex); + + __ leave(); + __ ret(); + + return start; + } + }; +#undef __ +#define __ masm-> + // Continuation point for throwing of implicit exceptions that are // not handled in the current activation.
Fabricates an exception // oop and initiates normal exception dispatching in this @@ -4862,6 +5248,18 @@ static const int64_t right_3_bits = right_n_bits(3); } #endif // COMPILER2 + if (UseSHA256Intrinsics) { // one stub for the single-block entry point and one for the multi-block (MB) entry point + Sha2Generator sha2(_masm, this); + StubRoutines::_sha256_implCompress = sha2.generate_sha256_implCompress(false); + StubRoutines::_sha256_implCompressMB = sha2.generate_sha256_implCompress(true); + } + + if (UseSHA512Intrinsics) { // same pair of stubs, generated by Sha2Generator with e64 elements + Sha2Generator sha2(_masm, this); + StubRoutines::_sha512_implCompress = sha2.generate_sha512_implCompress(false); + StubRoutines::_sha512_implCompressMB = sha2.generate_sha512_implCompress(true); + } + generate_compare_long_strings(); generate_string_indexof_stubs(); diff --git a/src/hotspot/cpu/riscv/vm_version_riscv.cpp b/src/hotspot/cpu/riscv/vm_version_riscv.cpp index 41ec15a8634..9a72b8d75a1 100644 --- a/src/hotspot/cpu/riscv/vm_version_riscv.cpp +++ b/src/hotspot/cpu/riscv/vm_version_riscv.cpp @@ -146,26 +146,11 @@ void VM_Version::initialize() { FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false); } - if (UseSHA) { - warning("SHA instructions are not available on this CPU"); - FLAG_SET_DEFAULT(UseSHA, false); - } - if (UseSHA1Intrinsics) { warning("Intrinsics for SHA-1 crypto hash functions not available on this CPU."); FLAG_SET_DEFAULT(UseSHA1Intrinsics, false); } - if (UseSHA256Intrinsics) { - warning("Intrinsics for SHA-224 and SHA-256 crypto hash functions not available on this CPU."); - FLAG_SET_DEFAULT(UseSHA256Intrinsics, false); - } - - if (UseSHA512Intrinsics) { - warning("Intrinsics for SHA-384 and SHA-512 crypto hash functions not available on this CPU."); - FLAG_SET_DEFAULT(UseSHA512Intrinsics, false); - } - if (UseSHA3Intrinsics) { warning("Intrinsics for SHA3-224, SHA3-256, SHA3-384 and SHA3-512 crypto hash functions not available on this CPU."); FLAG_SET_DEFAULT(UseSHA3Intrinsics, false); @@ -272,6 +257,10 @@ void VM_Version::initialize() { // NOTE: Make sure codes dependent on UseRVV are put after c2_initialize(),
// as there are extra checks inside it which could disable UseRVV // in some situations. + if (UseZvkn && !UseRVV) { // Zvkn is switched back off when UseRVV is not set at this point + FLAG_SET_DEFAULT(UseZvkn, false); + warning("Cannot enable Zvkn on cpu without RVV support."); + } if (UseRVV) { if (FLAG_IS_DEFAULT(UseChaCha20Intrinsics)) { @@ -283,6 +272,31 @@ void VM_Version::initialize() { } FLAG_SET_DEFAULT(UseChaCha20Intrinsics, false); } + + if (!UseZvkn && UseSHA) { // UseSHA is only kept when UseZvkn is set; with Zvkn and an untouched UseSHA it is turned on below + warning("SHA instructions are not available on this CPU"); + FLAG_SET_DEFAULT(UseSHA, false); + } else if (UseZvkn && FLAG_IS_DEFAULT(UseSHA)) { + FLAG_SET_DEFAULT(UseSHA, true); + } + + if (!UseSHA) { // NOTE(review): this branch is also taken when UseZvkn is set but UseSHA was switched off, in which case the "UseZvkn needed" wording below looks misleading - confirm + if (UseSHA256Intrinsics) { + warning("Intrinsics for SHA-224 and SHA-256 crypto hash functions not available on this CPU, UseZvkn needed."); + FLAG_SET_DEFAULT(UseSHA256Intrinsics, false); + } + if (UseSHA512Intrinsics) { + warning("Intrinsics for SHA-384 and SHA-512 crypto hash functions not available on this CPU, UseZvkn needed."); + FLAG_SET_DEFAULT(UseSHA512Intrinsics, false); + } + } else { // UseSHA is on: switch on each SHA-2 intrinsic flag that is still at its default value + if (FLAG_IS_DEFAULT(UseSHA256Intrinsics)) { + FLAG_SET_DEFAULT(UseSHA256Intrinsics, true); + } + if (FLAG_IS_DEFAULT(UseSHA512Intrinsics)) { + FLAG_SET_DEFAULT(UseSHA512Intrinsics, true); + } + } } #ifdef COMPILER2