8319716: RISC-V: Add SHA-2

Co-authored-by: Robbin Ehn <rehn@openjdk.org> Reviewed-by: fyang, mli, luhenry
2026-07-04 16:20:32 +00:00 · 2024-01-09 07:26:35 +00:00 · 2024-01-09 07:26:35 +00:00 · 4cf131a101
commit 4cf131a101
parent 7286f5291d
5 changed files with 516 additions and 19 deletions
--- a/src/hotspot/cpu/riscv/assembler_riscv.hpp
+++ b/src/hotspot/cpu/riscv/assembler_riscv.hpp
@ -1334,6 +1334,7 @@ enum VectorMask {
  INSN(vsll_vi,    0b1010111, 0b011, 0b100101);

  // Vector Slide Instructions
+  INSN(vslideup_vi,   0b1010111, 0b011, 0b001110);
  INSN(vslidedown_vi, 0b1010111, 0b011, 0b001111);

 #undef INSN
@ -1689,7 +1690,6 @@ enum VectorMask {
  INSN(vmv_v_x, 0b1010111, 0b100, v0, 0b1, 0b010111);

 #undef INSN
-#undef patch_VArith

 #define INSN(NAME, op, funct13, funct6)                    \
  void NAME(VectorRegister Vd, VectorMask vm = unmasked) { \
@ -1731,14 +1731,29 @@ enum Nf {
    patch_reg((address)&insn, 15, Rs1);                                  \
    emit(insn)

-#define INSN(NAME, op, lumop, vm, mop, nf)                                           \
-  void NAME(VectorRegister Vd, Register Rs1, uint32_t width = 0, bool mew = false) { \
+#define INSN(NAME, op, width, lumop, vm, mop, mew, nf)                               \
+  void NAME(VectorRegister Vd, Register Rs1) {                                       \
    guarantee(is_uimm3(width), "width is invalid");                                  \
    patch_VLdSt(op, Vd, width, Rs1, lumop, vm, mop, mew, nf);                        \
  }

  // Vector Load/Store Instructions
-  INSN(vl1re8_v, 0b0000111, 0b01000, 0b1, 0b00, g1);
+  INSN(vl1re8_v,  0b0000111, 0b000, 0b01000, 0b1, 0b00, 0b0, g1);
+  INSN(vl1re16_v, 0b0000111, 0b101, 0b01000, 0b1, 0b00, 0b0, g1);
+  INSN(vl1re32_v, 0b0000111, 0b110, 0b01000, 0b1, 0b00, 0b0, g1);
+  INSN(vl1re64_v, 0b0000111, 0b111, 0b01000, 0b1, 0b00, 0b0, g1);
+  INSN(vl2re8_v,  0b0000111, 0b000, 0b01000, 0b1, 0b00, 0b0, g2);
+  INSN(vl2re16_v, 0b0000111, 0b101, 0b01000, 0b1, 0b00, 0b0, g2);
+  INSN(vl2re32_v, 0b0000111, 0b110, 0b01000, 0b1, 0b00, 0b0, g2);
+  INSN(vl2re64_v, 0b0000111, 0b111, 0b01000, 0b1, 0b00, 0b0, g2);
+  INSN(vl4re8_v,  0b0000111, 0b000, 0b01000, 0b1, 0b00, 0b0, g4);
+  INSN(vl4re16_v, 0b0000111, 0b101, 0b01000, 0b1, 0b00, 0b0, g4);
+  INSN(vl4re32_v, 0b0000111, 0b110, 0b01000, 0b1, 0b00, 0b0, g4);
+  INSN(vl4re64_v, 0b0000111, 0b111, 0b01000, 0b1, 0b00, 0b0, g4);
+  INSN(vl8re8_v,  0b0000111, 0b000, 0b01000, 0b1, 0b00, 0b0, g8);
+  INSN(vl8re16_v, 0b0000111, 0b101, 0b01000, 0b1, 0b00, 0b0, g8);
+  INSN(vl8re32_v, 0b0000111, 0b110, 0b01000, 0b1, 0b00, 0b0, g8);
+  INSN(vl8re64_v, 0b0000111, 0b111, 0b01000, 0b1, 0b00, 0b0, g8);

 #undef INSN

@ -1749,6 +1764,9 @@ enum Nf {

  // Vector Load/Store Instructions
  INSN(vs1r_v, 0b0100111, 0b000, 0b01000, 0b1, 0b00, 0b0, g1);
+  INSN(vs2r_v, 0b0100111, 0b000, 0b01000, 0b1, 0b00, 0b0, g2);
+  INSN(vs4r_v, 0b0100111, 0b000, 0b01000, 0b1, 0b00, 0b0, g4);
+  INSN(vs8r_v, 0b0100111, 0b000, 0b01000, 0b1, 0b00, 0b0, g8);

 #undef INSN

@ -1794,9 +1812,11 @@ enum Nf {
  }

  // Vector unordered indexed load instructions
+  INSN( vluxei8_v, 0b0000111, 0b000, 0b01, 0b0);
  INSN(vluxei32_v, 0b0000111, 0b110, 0b01, 0b0);

  // Vector unordered indexed store instructions
+  INSN( vsuxei8_v, 0b0100111, 0b000, 0b01, 0b0);
  INSN(vsuxei32_v, 0b0100111, 0b110, 0b01, 0b0);

 #undef INSN
@ -1820,6 +1840,55 @@ enum Nf {
 #undef INSN
 #undef patch_VLdSt

+// ====================================
+// RISC-V Vector Crypto Extension
+// ====================================
+
+#define INSN(NAME, op, funct3, funct6)                                                             \
+  void NAME(VectorRegister Vd, VectorRegister Vs2, VectorRegister Vs1, VectorMask vm = unmasked) { \
+    patch_VArith(op, Vd, funct3, Vs1->raw_encoding(), Vs2, vm, funct6);                            \
+  }
+
+  // Zvkb bit-manipulation (vandn/vror/vrol) plus carry-less multiply vclmul[h] (Zvbc in the ISA spec). NOTE(review): the _vx/_vi forms are also generated with a VectorRegister Vs1 parameter here - confirm before using them.
+  INSN(vandn_vv,   0b1010111, 0b000, 0b000001);
+  INSN(vandn_vx,   0b1010111, 0b100, 0b000001);
+  INSN(vandn_vi,   0b1010111, 0b011, 0b000001);
+  INSN(vclmul_vv,  0b1010111, 0b010, 0b001100);
+  INSN(vclmul_vx,  0b1010111, 0b110, 0b001100);
+  INSN(vclmulh_vv, 0b1010111, 0b010, 0b001101);
+  INSN(vclmulh_vx, 0b1010111, 0b110, 0b001101);
+  INSN(vror_vv,    0b1010111, 0b000, 0b010100);
+  INSN(vror_vx,    0b1010111, 0b100, 0b010100);
+  INSN(vrol_vv,    0b1010111, 0b000, 0b010101);
+  INSN(vrol_vx,    0b1010111, 0b100, 0b010101);
+
+#undef INSN
+
+#define INSN(NAME, op, funct3, Vs1, funct6)                                    \
+  void NAME(VectorRegister Vd, VectorRegister Vs2, VectorMask vm = unmasked) { \
+    patch_VArith(op, Vd, funct3, Vs1, Vs2, vm, funct6);                        \
+  }
+
+  // Vector Bit-manipulation used in Cryptography (Zvkb) Extension, single-source forms: the Vs1 macro argument is a constant selector passed to patch_VArith in place of a source register encoding
+  INSN(vbrev8_v, 0b1010111, 0b010, 0b01000, 0b010010);
+  INSN(vrev8_v,  0b1010111, 0b010, 0b01001, 0b010010);
+
+#undef INSN
+
+#define INSN(NAME, op, funct3, vm, funct6)                                   \
+  void NAME(VectorRegister Vd, VectorRegister Vs2, VectorRegister Vs1) {     \
+    patch_VArith(op, Vd, funct3, Vs1->raw_encoding(), Vs2, vm, funct6);      \
+  }
+
+  // Vector SHA-2 Secure Hash (Zvknh[ab]) Extension; these forms take no VectorMask parameter, vm is fixed to 0b1 by the macro arguments below
+  INSN(vsha2ms_vv,  0b1110111, 0b010, 0b1, 0b101101);
+  INSN(vsha2ch_vv,  0b1110111, 0b010, 0b1, 0b101110);
+  INSN(vsha2cl_vv,  0b1110111, 0b010, 0b1, 0b101111);
+
+#undef INSN
+
+#undef patch_VArith
+
 // ====================================
 // RISC-V Bit-Manipulation Extension
 // Currently only support Zba, Zbb and Zbs bitmanip extensions.
--- a/src/hotspot/cpu/riscv/globals_riscv.hpp
+++ b/src/hotspot/cpu/riscv/globals_riscv.hpp
@ -113,6 +113,8 @@ define_pd_global(intx, InlineSmallCode,          1000);
  product(bool, UseZtso, false, EXPERIMENTAL, "Assume Ztso memory model")        \
  product(bool, UseZihintpause, false, EXPERIMENTAL,                             \
          "Use Zihintpause instructions")                                        \
+  product(bool, UseZvkn, false, EXPERIMENTAL,                                    \
+          "Use Zvkn group extension, Zvkned, Zvknhb, Zvkb, Zvkt")                \
  product(bool, UseRVVForBigIntegerShiftIntrinsics, true,                        \
          "Use RVV instructions for left/right shift of BigInteger")

--- a/src/hotspot/cpu/riscv/macroAssembler_riscv.hpp
+++ b/src/hotspot/cpu/riscv/macroAssembler_riscv.hpp
@ -1361,6 +1361,16 @@ public:
    vmfle_vv(vd, vs1, vs2, vm);
  }

+  inline void vmsltu_vi(VectorRegister Vd, VectorRegister Vs2, uint32_t imm, VectorMask vm = unmasked) {
+    guarantee(imm >= 1 && imm <= 16, "imm is invalid");
+    vmsleu_vi(Vd, Vs2, imm-1, vm);
+  }
+
+  inline void vmsgeu_vi(VectorRegister Vd, VectorRegister Vs2, uint32_t imm, VectorMask vm = unmasked) {
+    guarantee(imm >= 1 && imm <= 16, "imm is invalid");
+    vmsgtu_vi(Vd, Vs2, imm-1, vm);
+  }
+
  // Copy mask register
  inline void vmmv_m(VectorRegister vd, VectorRegister vs) {
    vmand_mm(vd, vs, vs);
@ -1376,6 +1386,10 @@ public:
    vmxnor_mm(vd, vd, vd);
  }

+  inline void vnot_v(VectorRegister Vd, VectorRegister Vs, VectorMask vm = unmasked) {
+    vxor_vi(Vd, Vs, -1, vm);
+  }
+
  static const int zero_words_block_size;

  void cast_primitive_type(BasicType type, Register Rt) {
--- a/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp
+++ b/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp
@ -3659,8 +3659,394 @@ class StubGenerator: public StubCodeGenerator {
      return entry;
    }
  };
+
 #endif // COMPILER2

+#undef __
+#define __ this->
+  class Sha2Generator : public MacroAssembler {
+    StubCodeGenerator* _cgen;
+   public:
+      Sha2Generator(MacroAssembler* masm, StubCodeGenerator* cgen) : MacroAssembler(masm->code()), _cgen(cgen) {}
+      address generate_sha256_implCompress(bool multi_block) {
+        return generate_sha2_implCompress(Assembler::e32, multi_block);
+      }
+      address generate_sha512_implCompress(bool multi_block) {
+        return generate_sha2_implCompress(Assembler::e64, multi_block);
+      }
+   private:
+
+    void vleXX_v(Assembler::SEW vset_sew, VectorRegister vr, Register sr) {
+      if (vset_sew == Assembler::e32) __ vle32_v(vr, sr);
+      else                            __ vle64_v(vr, sr);
+    }
+
+    void vseXX_v(Assembler::SEW vset_sew, VectorRegister vr, Register sr) {
+      if (vset_sew == Assembler::e32) __ vse32_v(vr, sr);
+      else                            __ vse64_v(vr, sr);
+    }
+
+    // Overview of the logic in each "quad round".
+    //
+    // The code below repeats 16/20 times the logic implementing four rounds
+    // of the SHA-256/512 core loop as documented by NIST: 16/20 "quad rounds"
+    // implement the 64/80 single rounds.
+    //
+    //    // Load four word (u32/64) constants (K[t+3], K[t+2], K[t+1], K[t+0])
+    //    // Output:
+    //    //   vTmp1 = {K[t+3], K[t+2], K[t+1], K[t+0]}
+    //    vleXX.v vTmp1, consts
+    //
+    //    // Increment word constant address by stride (16/32 bytes, 4*4B/8B, 128b/256b)
+    //    addi consts, consts, 16/32
+    //
+    //    // Add constants to message schedule words:
+    //    //  Input
+    //    //    vTmp1 = {K[t+3], K[t+2], K[t+1], K[t+0]}
+    //    //    vW0 = {W[t+3], W[t+2], W[t+1], W[t+0]}; // Vt0 = W[3:0];
+    //    //  Output
+    //    //    vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]}
+    //    vadd.vv vTmp0, vTmp1, vW0
+    //
+    //    //  2 rounds of working variables updates.
+    //    //     vState1[t+4] <- vState1[t], vState0[t], vTmp0[t]
+    //    //  Input:
+    //    //    vState1 = {c[t],d[t],g[t],h[t]}   " = vState1[t] "
+    //    //    vState0 = {a[t],b[t],e[t],f[t]}
+    //    //    vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]}
+    //    //  Output:
+    //    //    vState1 = {f[t+2],e[t+2],b[t+2],a[t+2]}  " = vState0[t+2] "
+    //    //        = {h[t+4],g[t+4],d[t+4],c[t+4]}  " = vState1[t+4] "
+    //    vsha2cl.vv vState1, vState0, vTmp0
+    //
+    //    //  2 rounds of working variables updates.
+    //    //     vState0[t+4] <- vState0[t], vState0[t+2], vTmp0[t]
+    //    //  Input
+    //    //   vState0 = {a[t],b[t],e[t],f[t]}       " = vState0[t] "
+    //    //       = {h[t+2],g[t+2],d[t+2],c[t+2]}   " = vState1[t+2] "
+    //    //   vState1 = {f[t+2],e[t+2],b[t+2],a[t+2]}   " = vState0[t+2] "
+    //    //   vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]}
+    //    //  Output:
+    //    //   vState0 = {f[t+4],e[t+4],b[t+4],a[t+4]}   " = vState0[t+4] "
+    //    vsha2ch.vv vState0, vState1, vTmp0
+    //
+    //    // Combine 2QW into 1QW
+    //    //
+    //    // To generate the next 4 words, "new_vW0"/"vTmp0" from vW0-vW3, vsha2ms needs
+    //    //     vW0[0..3], vW1[0], vW2[1..3], vW3[0, 2..3]
+    //    // and it can only take 3 vectors as inputs. Hence we need to combine
+    //    // vW1[0] and vW2[1..3] in a single vector.
+    //    //
+    //    // vmerge vTmp0, vW2, vW1, V0
+    //    // Input
+    //    //  V0 = mask // first word from vW1, 1..3 words from vW2
+    //    //  vW2 = {Wt-8, Wt-7, Wt-6, Wt-5}
+    //    //  vW1 = {Wt-12, Wt-11, Wt-10, Wt-9}
+    //    // Output
+    //    //  vTmp0 = {Wt-12, Wt-7, Wt-6, Wt-5}
+    //    vmerge.vvm vTmp0, vW2, vW1, v0
+    //
+    //    // Generate next Four Message Schedule Words (hence allowing for 4 more rounds)
+    //    // Input
+    //    //  vW0 = {W[t+ 3], W[t+ 2], W[t+ 1], W[t+ 0]}     W[ 3: 0]
+    //    //  vW3 = {W[t+15], W[t+14], W[t+13], W[t+12]}     W[15:12]
+    //    //  vTmp0 = {W[t+11], W[t+10], W[t+ 9], W[t+ 4]}     W[11: 9,4]
+    //    // Output (next four message schedule words)
+    //    //  vW0 = {W[t+19],  W[t+18],  W[t+17],  W[t+16]}  W[19:16]
+    //    vsha2ms.vv vW0, vTmp0, vW3
+    //
+    // BEFORE
+    //  vW0 - vW3 hold the message schedule words (initially the block words)
+    //    vW0 = W[ 3: 0]   "oldest"
+    //    vW1 = W[ 7: 4]
+    //    vW2 = W[11: 8]
+    //    vW3 = W[15:12]   "newest"
+    //
+    //  vState0 - vState1 hold the working state variables
+    //    vState0 = {a[t],b[t],e[t],f[t]}   // initially {H5,H4,H1,H0}
+    //    vState1 = {c[t],d[t],g[t],h[t]}   // initially {H7,H6,H3,H2}
+    //
+    // AFTER
+    //  vW0 - vW3 hold the message schedule words (initially the block words)
+    //    vW1 = W[ 7: 4]   "oldest"
+    //    vW2 = W[11: 8]
+    //    vW3 = W[15:12]
+    //    vW0 = W[19:16]   "newest"
+    //
+    //  vState0 and vState1 hold the working state variables
+    //    vState0 = {a[t+4],b[t+4],e[t+4],f[t+4]}
+    //    vState1 = {c[t+4],d[t+4],g[t+4],h[t+4]}
+    //
+    //  The group of vectors vW0,vW1,vW2,vW3 is "rotated" by one in each quad-round,
+    //  hence the uses of those vectors rotate in each round, and we get back to the
+    //  initial configuration every 4 quad-rounds. We could avoid those changes at
+    //  the cost of moving those vectors at the end of each quad-rounds.
+    void sha2_quad_round(Assembler::SEW vset_sew, VectorRegister rot1, VectorRegister rot2, VectorRegister rot3, VectorRegister rot4,
+                         Register scalarconst, VectorRegister vtemp, VectorRegister vtemp2, VectorRegister v_abef, VectorRegister v_cdgh,
+                         bool gen_words = true, bool step_const = true) {
+      __ vleXX_v(vset_sew, vtemp, scalarconst);
+      if (step_const) {
+        __ addi(scalarconst, scalarconst, vset_sew == Assembler::e32 ? 16 : 32);
+      }
+      __ vadd_vv(vtemp2, vtemp, rot1);
+      __ vsha2cl_vv(v_cdgh, v_abef, vtemp2);
+      __ vsha2ch_vv(v_abef, v_cdgh, vtemp2);
+      if (gen_words) {
+        __ vmerge_vvm(vtemp2, rot3, rot2);
+        __ vsha2ms_vv(rot1, vtemp2, rot4);
+      }
+    }
+
+    const char* stub_name(Assembler::SEW vset_sew, bool multi_block) {
+      if (vset_sew == Assembler::e32 && !multi_block) return "sha256_implCompress";
+      if (vset_sew == Assembler::e32 &&  multi_block) return "sha256_implCompressMB";
+      if (vset_sew == Assembler::e64 && !multi_block) return "sha512_implCompress";
+      if (vset_sew == Assembler::e64 &&  multi_block) return "sha512_implCompressMB";
+      ShouldNotReachHere();
+      return "bad name lookup";
+    }
+
+    // Arguments:
+    //
+    // Inputs:
+    //   c_rarg0   - byte[]  source+offset
+    //   c_rarg1   - int[]/long[]  SHA.state (read as 32-bit words for SHA-256, 64-bit words for SHA-512)
+    //   c_rarg2   - int     offset
+    //   c_rarg3   - int     limit
+    //
+    address generate_sha2_implCompress(Assembler::SEW vset_sew, bool multi_block) {
+      alignas(64) static const uint32_t round_consts_256[64] = {
+        0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
+        0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
+        0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
+        0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
+        0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
+        0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+        0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
+        0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
+        0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
+        0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
+        0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
+        0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+        0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
+        0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
+        0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
+        0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
+      };
+      alignas(64) static const uint64_t round_consts_512[80] = {
+        0x428a2f98d728ae22l, 0x7137449123ef65cdl, 0xb5c0fbcfec4d3b2fl,
+        0xe9b5dba58189dbbcl, 0x3956c25bf348b538l, 0x59f111f1b605d019l,
+        0x923f82a4af194f9bl, 0xab1c5ed5da6d8118l, 0xd807aa98a3030242l,
+        0x12835b0145706fbel, 0x243185be4ee4b28cl, 0x550c7dc3d5ffb4e2l,
+        0x72be5d74f27b896fl, 0x80deb1fe3b1696b1l, 0x9bdc06a725c71235l,
+        0xc19bf174cf692694l, 0xe49b69c19ef14ad2l, 0xefbe4786384f25e3l,
+        0x0fc19dc68b8cd5b5l, 0x240ca1cc77ac9c65l, 0x2de92c6f592b0275l,
+        0x4a7484aa6ea6e483l, 0x5cb0a9dcbd41fbd4l, 0x76f988da831153b5l,
+        0x983e5152ee66dfabl, 0xa831c66d2db43210l, 0xb00327c898fb213fl,
+        0xbf597fc7beef0ee4l, 0xc6e00bf33da88fc2l, 0xd5a79147930aa725l,
+        0x06ca6351e003826fl, 0x142929670a0e6e70l, 0x27b70a8546d22ffcl,
+        0x2e1b21385c26c926l, 0x4d2c6dfc5ac42aedl, 0x53380d139d95b3dfl,
+        0x650a73548baf63del, 0x766a0abb3c77b2a8l, 0x81c2c92e47edaee6l,
+        0x92722c851482353bl, 0xa2bfe8a14cf10364l, 0xa81a664bbc423001l,
+        0xc24b8b70d0f89791l, 0xc76c51a30654be30l, 0xd192e819d6ef5218l,
+        0xd69906245565a910l, 0xf40e35855771202al, 0x106aa07032bbd1b8l,
+        0x19a4c116b8d2d0c8l, 0x1e376c085141ab53l, 0x2748774cdf8eeb99l,
+        0x34b0bcb5e19b48a8l, 0x391c0cb3c5c95a63l, 0x4ed8aa4ae3418acbl,
+        0x5b9cca4f7763e373l, 0x682e6ff3d6b2b8a3l, 0x748f82ee5defb2fcl,
+        0x78a5636f43172f60l, 0x84c87814a1f0ab72l, 0x8cc702081a6439ecl,
+        0x90befffa23631e28l, 0xa4506cebde82bde9l, 0xbef9a3f7b2c67915l,
+        0xc67178f2e372532bl, 0xca273eceea26619cl, 0xd186b8c721c0c207l,
+        0xeada7dd6cde0eb1el, 0xf57d4f7fee6ed178l, 0x06f067aa72176fbal,
+        0x0a637dc5a2c898a6l, 0x113f9804bef90dael, 0x1b710b35131c471bl,
+        0x28db77f523047d84l, 0x32caab7b40c72493l, 0x3c9ebe0a15c9bebcl,
+        0x431d67c49c100d4cl, 0x4cc5d4becb3e42b6l, 0x597f299cfc657e2al,
+        0x5fcb6fab3ad6faecl, 0x6c44198c4a475817l
+      };
+      const int const_add = vset_sew == Assembler::e32 ? 16 : 32;
+
+      __ align(CodeEntryAlignment);
+      StubCodeMark mark(_cgen, "StubRoutines", stub_name(vset_sew, multi_block));
+      address start = __ pc();
+
+      Register buf   = c_rarg0;
+      Register state = c_rarg1;
+      Register ofs   = c_rarg2;
+      Register limit = c_rarg3;
+      Register consts =  t2; // caller saved
+      Register state_c = x28; // caller saved
+      VectorRegister vindex = v2;
+      VectorRegister vW0 = v4;
+      VectorRegister vW1 = v6;
+      VectorRegister vW2 = v8;
+      VectorRegister vW3 = v10;
+      VectorRegister vState0 = v12;
+      VectorRegister vState1 = v14;
+      VectorRegister vHash0  = v16;
+      VectorRegister vHash1  = v18;
+      VectorRegister vTmp0   = v20;
+      VectorRegister vTmp1   = v22;
+
+      Label multi_block_loop;
+
+      __ enter();
+
+      address constant_table = vset_sew == Assembler::e32 ? (address)round_consts_256 : (address)round_consts_512;
+      la(consts, ExternalAddress(constant_table));
+
+      // Register use in this function:
+      //
+      // VECTORS
+      //  vW0 - vW3 (512/1024-bits / 4*128/256 bits / 4*4*32/64 bits), hold the message
+      //             schedule words (Wt). They start with the message block
+      //             content (W0 to W15), then further words in the message
+      //             schedule generated via vsha2ms from previous Wt.
+      //   Initially:
+      //     vW0 = W[  3:0] = { W3,  W2,  W1,  W0}
+      //     vW1 = W[  7:4] = { W7,  W6,  W5,  W4}
+      //     vW2 = W[ 11:8] = {W11, W10,  W9,  W8}
+      //     vW3 = W[15:12] = {W15, W14, W13, W12}
+      //
+      //  vState0 - vState1 hold the working state variables (a, b, ..., h)
+      //    vState0 = {f[t],e[t],b[t],a[t]}
+      //    vState1 = {h[t],g[t],d[t],c[t]}
+      //   Initially:
+      //    vState0 = {H5i-1, H4i-1, H1i-1 , H0i-1}
+      //    vState1 = {H7i-1, H6i-1, H3i-1 , H2i-1}
+      //
+      //  v0 = mask for vmerge. Single value during the 16/20 quad rounds.
+      //
+      //  vTmp0 = temporary, Wt+Kt
+      //  vTmp1 = temporary, Kt
+      //
+      //  vHash0/vHash1 = hold the hash values (H) as they were at the start of the current block.
+      //
+      // During most of the function the vector state is configured so that each
+      // vector is interpreted as containing four 32/64 bits (e32/e64) elements (128/256 bits).
+
+      // vsha2ch/vsha2cl uses EGW of 4*SEW.
+      // SHA256 SEW = e32, EGW = 128-bits
+      // SHA512 SEW = e64, EGW = 256-bits
+      //
+      // VLEN is required to be at least 128.
+      // For the case of VLEN=128 and SHA512 we need LMUL=2 to work with 4*e64 (EGW = 256)
+      //
+      // m1/m2: LMUL=1 or 2 (m2 only for SHA512 with VLEN=128)
+      // ta: tail agnostic (don't care about those lanes)
+      // ma: mask agnostic (don't care about those lanes)
+      // x0 is not written, we know the number of vector elements.
+
+      if (vset_sew == Assembler::e64 && MaxVectorSize == 16) { // SHA512 and VLEN = 128
+        __ vsetivli(x0, 4, vset_sew, Assembler::m2, Assembler::ma, Assembler::ta);
+      } else {
+        __ vsetivli(x0, 4, vset_sew, Assembler::m1, Assembler::ma, Assembler::ta);
+      }
+
+      int64_t indexes = vset_sew == Assembler::e32 ? 0x00041014ul : 0x00082028ul;
+      __ li(t0, indexes);
+      __ vmv_v_x(vindex, t0);
+
+      // Step-over a,b, so we are pointing to c.
+      // const_add is the size of 4 state variables, so half of it is the size of 2 (a and b).
+      __ addi(state_c, state, const_add/2);
+
+      // Use index-load to get {f,e,b,a},{h,g,d,c}
+      __ vluxei8_v(vState0, state, vindex);
+      __ vluxei8_v(vState1, state_c, vindex);
+
+      __ bind(multi_block_loop);
+
+      // Capture the initial H values in vHash0 and vHash1 to allow for computing
+      // the resulting H', since H' = H+{a',b',c',...,h'}.
+      __ vmv_v_v(vHash0, vState0);
+      __ vmv_v_v(vHash1, vState1);
+
+      // Load the 512/1024-bits of the message block in vW0-vW3 and perform
+      // an endian swap on each 4/8 bytes element.
+      //
+      // If Zvkb is not implemented one can use vrgather
+      // with an index sequence to byte-swap.
+      //  sequence = [3 2 1 0   7 6 5 4  11 10 9 8   15 14 13 12]
+      //   <https://oeis.org/A004444> gives us "N ^ 3" as a nice formula to generate
+      //  this sequence. 'vid' gives us the N.
+      __ vleXX_v(vset_sew, vW0, buf);
+      __ vrev8_v(vW0, vW0);
+      __ addi(buf, buf, const_add);
+      __ vleXX_v(vset_sew, vW1, buf);
+      __ vrev8_v(vW1, vW1);
+      __ addi(buf, buf, const_add);
+      __ vleXX_v(vset_sew, vW2, buf);
+      __ vrev8_v(vW2, vW2);
+      __ addi(buf, buf, const_add);
+      __ vleXX_v(vset_sew, vW3, buf);
+      __ vrev8_v(vW3, vW3);
+      __ addi(buf, buf, const_add);
+
+      // Set v0 up for the vmerge that replaces the first word (idx==0)
+      __ vid_v(v0);
+      __ vmseq_vi(v0, v0, 0x0);  // v0.mask[i] = (i == 0 ? 1 : 0)
+
+      VectorRegister rotation_regs[] = {vW0, vW1, vW2, vW3};
+      int rot_pos = 0;
+      // Quad-round #0 (+0, vW0->vW1->vW2->vW3) ... #11/#15 (+3, vW3->vW0->vW1->vW2)
+      const int qr_end = vset_sew == Assembler::e32 ? 12 : 16;
+      for (int i = 0; i < qr_end; i++) {
+        sha2_quad_round(vset_sew,
+                   rotation_regs[(rot_pos + 0) & 0x3],
+                   rotation_regs[(rot_pos + 1) & 0x3],
+                   rotation_regs[(rot_pos + 2) & 0x3],
+                   rotation_regs[(rot_pos + 3) & 0x3],
+                   consts,
+                   vTmp1, vTmp0, vState0, vState1);
+        ++rot_pos;
+      }
+      // Quad-round #12/#16 (+0, vW0->vW1->vW2->vW3) ... #15/#19 (+3, vW3->vW0->vW1->vW2)
+      // Note that we stop generating new message schedule words (Wt, vW0-vW3)
+      // as we already generated all the words we end up consuming (i.e., W[63:60]/W[79:76]).
+      const int qr_c_end = qr_end + 4;
+      for (int i = qr_end; i < qr_c_end; i++) {
+        sha2_quad_round(vset_sew,
+                   rotation_regs[(rot_pos + 0) & 0x3],
+                   rotation_regs[(rot_pos + 1) & 0x3],
+                   rotation_regs[(rot_pos + 2) & 0x3],
+                   rotation_regs[(rot_pos + 3) & 0x3],
+                   consts,
+                   vTmp1, vTmp0, vState0, vState1, false, i < (qr_c_end-1));
+        ++rot_pos;
+      }
+
+      //--------------------------------------------------------------------------------
+      // Compute the updated hash value H'
+      //   H' = H + {h',g',...,b',a'}
+      //      = {h,g,...,b,a} + {h',g',...,b',a'}
+      //      = {h+h',g+g',...,b+b',a+a'}
+
+      // H' = H+{a',b',c',...,h'}
+      __ vadd_vv(vState0, vHash0, vState0);
+      __ vadd_vv(vState1, vHash1, vState1);
+
+      if (multi_block) {
+        int total_adds = vset_sew == Assembler::e32 ? 240 : 608;
+        __ addi(consts, consts, -total_adds);
+        __ add(ofs, ofs, vset_sew == Assembler::e32 ? 64 : 128);
+        __ ble(ofs, limit, multi_block_loop);
+        __ mv(c_rarg0, ofs); // return ofs
+      }
+
+      // Store H[0..7] = {a,b,c,d,e,f,g,h} from
+      //  vState0 = {f,e,b,a}
+      //  vState1 = {h,g,d,c}
+      __ vsuxei8_v(vState0, state,   vindex);
+      __ vsuxei8_v(vState1, state_c, vindex);
+
+      __ leave();
+      __ ret();
+
+      return start;
+    }
+  };
+#undef __
+#define __ masm->
+
  // Continuation point for throwing of implicit exceptions that are
  // not handled in the current activation. Fabricates an exception
  // oop and initiates normal exception dispatching in this
@ -4862,6 +5248,18 @@ static const int64_t right_3_bits = right_n_bits(3);
    }
 #endif // COMPILER2

+    if (UseSHA256Intrinsics) {
+      Sha2Generator sha2(_masm, this);
+      StubRoutines::_sha256_implCompress   = sha2.generate_sha256_implCompress(false);
+      StubRoutines::_sha256_implCompressMB = sha2.generate_sha256_implCompress(true);
+    }
+
+    if (UseSHA512Intrinsics) {
+      Sha2Generator sha2(_masm, this);
+      StubRoutines::_sha512_implCompress   = sha2.generate_sha512_implCompress(false);
+      StubRoutines::_sha512_implCompressMB = sha2.generate_sha512_implCompress(true);
+    }
+
    generate_compare_long_strings();

    generate_string_indexof_stubs();
--- a/src/hotspot/cpu/riscv/vm_version_riscv.cpp
+++ b/src/hotspot/cpu/riscv/vm_version_riscv.cpp
@ -146,26 +146,11 @@ void VM_Version::initialize() {
    FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false);
  }

-  if (UseSHA) {
-    warning("SHA instructions are not available on this CPU");
-    FLAG_SET_DEFAULT(UseSHA, false);
-  }
-
  if (UseSHA1Intrinsics) {
    warning("Intrinsics for SHA-1 crypto hash functions not available on this CPU.");
    FLAG_SET_DEFAULT(UseSHA1Intrinsics, false);
  }

-  if (UseSHA256Intrinsics) {
-    warning("Intrinsics for SHA-224 and SHA-256 crypto hash functions not available on this CPU.");
-    FLAG_SET_DEFAULT(UseSHA256Intrinsics, false);
-  }
-
-  if (UseSHA512Intrinsics) {
-    warning("Intrinsics for SHA-384 and SHA-512 crypto hash functions not available on this CPU.");
-    FLAG_SET_DEFAULT(UseSHA512Intrinsics, false);
-  }
-
  if (UseSHA3Intrinsics) {
    warning("Intrinsics for SHA3-224, SHA3-256, SHA3-384 and SHA3-512 crypto hash functions not available on this CPU.");
    FLAG_SET_DEFAULT(UseSHA3Intrinsics, false);
@ -272,6 +257,10 @@ void VM_Version::initialize() {
  // NOTE: Make sure codes dependent on UseRVV are put after c2_initialize(),
  //       as there are extra checks inside it which could disable UseRVV
  //       in some situations.
+  if (UseZvkn && !UseRVV) {
+    FLAG_SET_DEFAULT(UseZvkn, false);
+    warning("Cannot enable Zvkn on cpu without RVV support.");
+  }

  if (UseRVV) {
    if (FLAG_IS_DEFAULT(UseChaCha20Intrinsics)) {
@ -283,6 +272,31 @@ void VM_Version::initialize() {
    }
    FLAG_SET_DEFAULT(UseChaCha20Intrinsics, false);
  }
+
+  if (!UseZvkn && UseSHA) {
+    warning("SHA instructions are not available on this CPU");
+    FLAG_SET_DEFAULT(UseSHA, false);
+  } else if (UseZvkn && FLAG_IS_DEFAULT(UseSHA)) {
+    FLAG_SET_DEFAULT(UseSHA, true);
+  }
+
+  if (!UseSHA) {
+    if (UseSHA256Intrinsics) {
+      warning("Intrinsics for SHA-224 and SHA-256 crypto hash functions not available on this CPU, UseZvkn needed.");
+      FLAG_SET_DEFAULT(UseSHA256Intrinsics, false);
+    }
+    if (UseSHA512Intrinsics) {
+      warning("Intrinsics for SHA-384 and SHA-512 crypto hash functions not available on this CPU, UseZvkn needed.");
+      FLAG_SET_DEFAULT(UseSHA512Intrinsics, false);
+    }
+  } else {
+    if (FLAG_IS_DEFAULT(UseSHA256Intrinsics)) {
+       FLAG_SET_DEFAULT(UseSHA256Intrinsics, true);
+    }
+    if (FLAG_IS_DEFAULT(UseSHA512Intrinsics)) {
+      FLAG_SET_DEFAULT(UseSHA512Intrinsics, true);
+    }
+  }
 }

 #ifdef COMPILER2