mirror of
https://github.com/openjdk/jdk.git
synced 2026-01-28 12:09:14 +00:00
8341052: SHA-512 implementation using SHA-NI
Reviewed-by: jbhateja, ascarpino, sviswanathan, sparasa
This commit is contained in:
parent
54a744b023
commit
18bcbf7941
@ -6751,6 +6751,27 @@ void Assembler::sha256msg2(XMMRegister dst, XMMRegister src) {
|
||||
emit_int16((unsigned char)0xCD, (0xC0 | encode));
|
||||
}
|
||||
|
||||
// VSHA512MSG1: SHA-512 message-schedule update, part 1.
// Encoding per this sequence: VEX.256 F2 0F38 (W0), opcode 0xCC, register-direct.
void Assembler::sha512msg1(XMMRegister dst, XMMRegister src) {
  assert(VM_Version::supports_sha512() && VM_Version::supports_avx(), "");
  // 256-bit legacy (non-EVEX) VEX form: no masking, no vector-length encoding.
  InstructionAttr attr(AVX_256bit, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
  const int enc = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F_38, &attr);
  // Opcode byte then ModRM with mod=11 (register operands).
  emit_int16((unsigned char)0xCC, (0xC0 | enc));
}
|
||||
|
||||
// VSHA512MSG2: SHA-512 message-schedule update, part 2.
// Encoding per this sequence: VEX.256 F2 0F38 (W0), opcode 0xCD, register-direct.
void Assembler::sha512msg2(XMMRegister dst, XMMRegister src) {
  assert(VM_Version::supports_sha512() && VM_Version::supports_avx(), "");
  // 256-bit legacy (non-EVEX) VEX form: no masking, no vector-length encoding.
  InstructionAttr attr(AVX_256bit, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
  const int enc = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F_38, &attr);
  // Opcode byte then ModRM with mod=11 (register operands).
  emit_int16((unsigned char)0xCD, (0xC0 | enc));
}
|
||||
|
||||
// VSHA512RNDS2: perform two SHA-512 rounds.
// Encoding per this sequence: VEX.256 F2 0F38 (W0), opcode 0xCB, register-direct,
// with nds carried in the VEX.vvvv field.
void Assembler::sha512rnds2(XMMRegister dst, XMMRegister nds, XMMRegister src) {
  assert(VM_Version::supports_sha512() && VM_Version::supports_avx(), "");
  // 256-bit legacy (non-EVEX) VEX form: no masking, no vector-length encoding.
  InstructionAttr attr(AVX_256bit, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
  const int enc = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F_38, &attr);
  // Opcode byte then ModRM with mod=11 (register operands).
  emit_int16((unsigned char)0xCB, (0xC0 | enc));
}
|
||||
|
||||
void Assembler::shll(Register dst, int imm8) {
|
||||
assert(isShiftCount(imm8), "illegal shift count");
|
||||
int encode = prefix_and_encode(dst->encoding());
|
||||
@ -11670,6 +11691,19 @@ void Assembler::evbroadcasti64x2(XMMRegister dst, Address src, int vector_len) {
|
||||
emit_operand(dst, src, 0);
|
||||
}
|
||||
|
||||
// VBROADCASTI128: broadcast a 128-bit memory operand into both lanes of a
// 256-bit register. AVX2-only; the destination must be a real register and
// the requested vector length must be 256 bits.
void Assembler::vbroadcasti128(XMMRegister dst, Address src, int vector_len) {
  assert(VM_Version::supports_avx2(), "");
  assert(vector_len == AVX_256bit, "");
  assert(dst != xnoreg, "sanity");
  InstructionMark im(this);
  InstructionAttr attr(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
  attr.set_address_attributes(/* tuple_type */ EVEX_T4, /* input_size_in_bits */ EVEX_32bit);
  // Operands are swapped for the encoding: dst goes in the reg field,
  // src is the memory operand.
  vex_prefix(src, 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attr);
  emit_int8(0x5A);
  emit_operand(dst, src, 0);
}
|
||||
|
||||
// scalar single/double precision replicate
|
||||
|
||||
// duplicate single precision data from src into programmed locations in dest : requires AVX512VL
|
||||
|
||||
@ -2345,6 +2345,9 @@ private:
|
||||
void sha256rnds2(XMMRegister dst, XMMRegister src);
|
||||
void sha256msg1(XMMRegister dst, XMMRegister src);
|
||||
void sha256msg2(XMMRegister dst, XMMRegister src);
|
||||
void sha512rnds2(XMMRegister dst, XMMRegister nds, XMMRegister src);
|
||||
void sha512msg1(XMMRegister dst, XMMRegister src);
|
||||
void sha512msg2(XMMRegister dst, XMMRegister src);
|
||||
|
||||
void shldl(Register dst, Register src);
|
||||
void eshldl(Register dst, Register src1, Register src2, bool no_flags);
|
||||
@ -3035,6 +3038,7 @@ private:
|
||||
void evbroadcasti32x4(XMMRegister dst, Address src, int vector_len);
|
||||
void evbroadcasti64x2(XMMRegister dst, XMMRegister src, int vector_len);
|
||||
void evbroadcasti64x2(XMMRegister dst, Address src, int vector_len);
|
||||
void vbroadcasti128(XMMRegister dst, Address src, int vector_len);
|
||||
|
||||
// scalar single/double/128bit precision replicate
|
||||
void vbroadcastss(XMMRegister dst, XMMRegister src, int vector_len);
|
||||
|
||||
@ -3482,6 +3482,17 @@ void MacroAssembler::vpbroadcastd(XMMRegister dst, AddressLiteral src, int vecto
|
||||
}
|
||||
}
|
||||
|
||||
// AddressLiteral wrapper around Assembler::vbroadcasti128. If the literal is
// not RIP-reachable from the current code position, materialize its address
// in rscratch first; a scratch register is mandatory in that case.
void MacroAssembler::vbroadcasti128(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (!reachable(src)) {
    // Out of RIP-relative range: go through the scratch register.
    lea(rscratch, src);
    Assembler::vbroadcasti128(dst, Address(rscratch, 0), vector_len);
  } else {
    Assembler::vbroadcasti128(dst, as_Address(src), vector_len);
  }
}
|
||||
|
||||
void MacroAssembler::vpbroadcastq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
|
||||
assert(rscratch != noreg || always_reachable(src), "missing");
|
||||
|
||||
|
||||
@ -1118,6 +1118,7 @@ public:
|
||||
XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
|
||||
Register buf, Register state, Register ofs, Register limit, Register rsp, bool multi_block,
|
||||
XMMRegister shuf_mask);
|
||||
void sha512_update_ni_x1(Register arg_hash, Register arg_msg, Register ofs, Register limit, bool multi_block);
|
||||
#endif // _LP64
|
||||
|
||||
void fast_md5(Register buf, Address state, Address ofs, Address limit,
|
||||
@ -1216,6 +1217,9 @@ public:
|
||||
void addpd(XMMRegister dst, Address src) { Assembler::addpd(dst, src); }
|
||||
void addpd(XMMRegister dst, AddressLiteral src, Register rscratch = noreg);
|
||||
|
||||
using Assembler::vbroadcasti128;
|
||||
void vbroadcasti128(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = noreg);
|
||||
|
||||
using Assembler::vbroadcastsd;
|
||||
void vbroadcastsd(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = noreg);
|
||||
|
||||
|
||||
@ -1519,5 +1519,184 @@ void MacroAssembler::sha512_AVX2(XMMRegister msg, XMMRegister state0, XMMRegiste
|
||||
}
|
||||
}
|
||||
|
||||
// SHA-512 block compression using the SHA-NI instructions
// (sha512rnds2 / sha512msg1 / sha512msg2). One 128-byte message block is
// consumed per iteration of block_loop; when multi_block is true the loop
// continues until ofs (advanced by 128 per block) passes limit, and rax is
// loaded with the updated ofs on exit. Ported from the Intel IpSec
// implementation (intel-ipsec-mb on github).
//
// arg_hash    - pointer to the 8 x 64-bit SHA-512 state words (2 x 32 bytes)
// arg_msg     - pointer to the message block(s)
// ofs/limit   - byte offset and end bound driving the multi-block loop
// multi_block - if true, loop over consecutive blocks
//
// Register roles: ymm13/ymm14 hold the working state in the ABEF/CDGH order
// expected by sha512rnds2; ymm3..ymm6 hold the rolling 16-word message
// schedule window; ymm15 holds the shuffle mask fed to vpshufb; rax points
// at the K512 round-constant table.
void MacroAssembler::sha512_update_ni_x1(Register arg_hash, Register arg_msg, Register ofs, Register limit, bool multi_block) {
  Label done_hash, block_loop;
  address K512_W = StubRoutines::x86::k512_W_addr();

  // ymm15 = byte-flip shuffle mask used by every vpshufb below.
  vbroadcasti128(xmm15, ExternalAddress(StubRoutines::x86::pshuffle_byte_flip_mask_addr_sha512()), Assembler::AVX_256bit, r10);

  //load current hash value and transform into the ABEF/CDGH register layout
  vmovdqu(xmm0, Address(arg_hash));
  vmovdqu(xmm1, Address(arg_hash, 32));
  //ymm0 = D C B A, ymm1 = H G F E
  vperm2i128(xmm2, xmm0, xmm1, 0x20);
  vperm2i128(xmm3, xmm0, xmm1, 0x31);
  //ymm2 = F E B A, ymm3 = H G D C
  vpermq(xmm13, xmm2, 0x1b, Assembler::AVX_256bit);
  vpermq(xmm14, xmm3, 0x1b, Assembler::AVX_256bit);
  //ymm13 = A B E F, ymm14 = C D G H

  // rax = base of the round-constant table for the whole loop.
  lea(rax, ExternalAddress(K512_W));
  align(32);
  bind(block_loop);
  // Snapshot the state: xmm11/xmm12 are the per-block working copies,
  // xmm13/xmm14 keep the running hash for the feed-forward add at the end.
  vmovdqu(xmm11, xmm13);//ABEF
  vmovdqu(xmm12, xmm14);//CDGH

  // Each 4-round group below does: add round constants, run two
  // sha512rnds2 (the second on the upper constant pair, exposed by the
  // lane swap vperm2i128 imm 0x01), and advance the message schedule.

  //R0 - R3
  vmovdqu(xmm0, Address(arg_msg, 0 * 32));
  vpshufb(xmm3, xmm0, xmm15, Assembler::AVX_256bit);//ymm0 / ymm3 = W[0..3]
  vpaddq(xmm0, xmm3, Address(rax, 0 * 32), Assembler::AVX_256bit);
  sha512rnds2(xmm12, xmm11, xmm0);
  vperm2i128(xmm0, xmm0, xmm0, 0x01);
  sha512rnds2(xmm11, xmm12, xmm0);

  //R4 - R7
  vmovdqu(xmm0, Address(arg_msg, 1 * 32));
  vpshufb(xmm4, xmm0, xmm15, Assembler::AVX_256bit);//ymm0 / ymm4 = W[4..7]
  vpaddq(xmm0, xmm4, Address(rax, 1 * 32), Assembler::AVX_256bit);
  sha512rnds2(xmm12, xmm11, xmm0);
  vperm2i128(xmm0, xmm0, xmm0, 0x01);
  sha512rnds2(xmm11, xmm12, xmm0);
  sha512msg1(xmm3, xmm4); //ymm3 = W[0..3] + S0(W[1..4])

  //R8 - R11
  vmovdqu(xmm0, Address(arg_msg, 2 * 32));
  vpshufb(xmm5, xmm0, xmm15, Assembler::AVX_256bit);//ymm0 / ymm5 = W[8..11]
  vpaddq(xmm0, xmm5, Address(rax, 2 * 32), Assembler::AVX_256bit);
  sha512rnds2(xmm12, xmm11, xmm0);
  vperm2i128(xmm0, xmm0, xmm0, 0x01);
  sha512rnds2(xmm11, xmm12, xmm0);
  sha512msg1(xmm4, xmm5);//ymm4 = W[4..7] + S0(W[5..8])

  //R12 - R15
  vmovdqu(xmm0, Address(arg_msg, 3 * 32));
  vpshufb(xmm6, xmm0, xmm15, Assembler::AVX_256bit); //ymm0 / ymm6 = W[12..15]
  vpaddq(xmm0, xmm6, Address(rax, 3 * 32), Assembler::AVX_256bit);
  vpermq(xmm8, xmm6, 0x1b, Assembler::AVX_256bit); //ymm8 = W[12] W[13] W[14] W[15]
  vpermq(xmm9, xmm5, 0x39, Assembler::AVX_256bit); //ymm9 = W[8] W[11] W[10] W[9]
  vpblendd(xmm8, xmm8, xmm9, 0x3f, Assembler::AVX_256bit); //ymm8 = W[12] W[11] W[10] W[9]
  vpaddq(xmm3, xmm3, xmm8, Assembler::AVX_256bit);
  sha512msg2(xmm3, xmm6);//W[16..19] = xmm3 + W[9..12] + S1(W[14..17])
  sha512rnds2(xmm12, xmm11, xmm0);
  vperm2i128(xmm0, xmm0, xmm0, 0x01);
  sha512rnds2(xmm11, xmm12, xmm0);
  sha512msg1(xmm5, xmm6); //ymm5 = W[8..11] + S0(W[9..12])

  //R16 - R19, R32 - R35, R48 - R51
  // Three unrolled 16-round passes; i indexes the constant table in 32-byte
  // steps. The W-index comments describe the first (j == 3) pass.
  for (int i = 4, j = 3; j > 0; j--) {
    vpaddq(xmm0, xmm3, Address(rax, i * 32), Assembler::AVX_256bit);
    vpermq(xmm8, xmm3, 0x1b, Assembler::AVX_256bit);//ymm8 = W[16] W[17] W[18] W[19]
    vpermq(xmm9, xmm6, 0x39, Assembler::AVX_256bit);//ymm9 = W[12] W[15] W[14] W[13]
    vpblendd(xmm7, xmm8, xmm9, 0x3f, Assembler::AVX_256bit);//xmm7 = W[16] W[15] W[14] W[13]
    vpaddq(xmm4, xmm4, xmm7, Assembler::AVX_256bit);//ymm4 = W[4..7] + S0(W[5..8]) + W[13..16]
    sha512msg2(xmm4, xmm3);//ymm4 += S1(W[14..17])
    sha512rnds2(xmm12, xmm11, xmm0);
    vperm2i128(xmm0, xmm0, xmm0, 0x01);
    sha512rnds2(xmm11, xmm12, xmm0);
    sha512msg1(xmm6, xmm3); //ymm6 = W[12..15] + S0(W[13..16])
    i += 1;
    //R20 - R23, R36 - R39, R52 - R55
    vpaddq(xmm0, xmm4, Address(rax, i * 32), Assembler::AVX_256bit);
    vpermq(xmm8, xmm4, 0x1b, Assembler::AVX_256bit);//ymm8 = W[20] W[21] W[22] W[23]
    vpermq(xmm9, xmm3, 0x39, Assembler::AVX_256bit);//ymm9 = W[16] W[19] W[18] W[17]
    vpblendd(xmm7, xmm8, xmm9, 0x3f, Assembler::AVX_256bit);//ymm7 = W[20] W[19] W[18] W[17]
    vpaddq(xmm5, xmm5, xmm7, Assembler::AVX_256bit);//ymm5 = W[8..11] + S0(W[9..12]) + W[17..20]
    sha512msg2(xmm5, xmm4);//ymm5 += S1(W[18..21])
    sha512rnds2(xmm12, xmm11, xmm0);
    vperm2i128(xmm0, xmm0, xmm0, 0x01);
    sha512rnds2(xmm11, xmm12, xmm0);
    sha512msg1(xmm3, xmm4); //ymm3 = W[16..19] + S0(W[17..20])
    i += 1;
    //R24 - R27, R40 - R43, R56 - R59
    vpaddq(xmm0, xmm5, Address(rax, i * 32), Assembler::AVX_256bit);
    vpermq(xmm8, xmm5, 0x1b, Assembler::AVX_256bit);//ymm8 = W[24] W[25] W[26] W[27]
    vpermq(xmm9, xmm4, 0x39, Assembler::AVX_256bit);//ymm9 = W[20] W[23] W[22] W[21]
    vpblendd(xmm7, xmm8, xmm9, 0x3f, Assembler::AVX_256bit);//ymm7 = W[24] W[23] W[22] W[21]
    vpaddq(xmm6, xmm6, xmm7, Assembler::AVX_256bit);//ymm6 = W[12..15] + S0(W[13..16]) + W[21..24]
    sha512msg2(xmm6, xmm5);//ymm6 += S1(W[22..25])
    sha512rnds2(xmm12, xmm11, xmm0);
    vperm2i128(xmm0, xmm0, xmm0, 0x01);
    sha512rnds2(xmm11, xmm12, xmm0);
    sha512msg1(xmm4, xmm5);//ymm4 = W[20..23] + S0(W[21..24])
    i += 1;
    //R28 - R31, R44 - R47, R60 - R63
    vpaddq(xmm0, xmm6, Address(rax, i * 32), Assembler::AVX_256bit);
    vpermq(xmm8, xmm6, 0x1b, Assembler::AVX_256bit);//ymm8 = W[28] W[29] W[30] W[31]
    vpermq(xmm9, xmm5, 0x39, Assembler::AVX_256bit);//ymm9 = W[24] W[27] W[26] W[25]
    vpblendd(xmm7, xmm8, xmm9, 0x3f, Assembler::AVX_256bit);//ymm7 = W[28] W[27] W[26] W[25]
    vpaddq(xmm3, xmm3, xmm7, Assembler::AVX_256bit);//ymm3 = W[16..19] + S0(W[17..20]) + W[25..28]
    sha512msg2(xmm3, xmm6); //ymm3 += S1(W[26..29])
    sha512rnds2(xmm12, xmm11, xmm0);
    vperm2i128(xmm0, xmm0, xmm0, 0x01);
    sha512rnds2(xmm11, xmm12, xmm0);
    sha512msg1(xmm5, xmm6);//ymm5 = W[24..27] + S0(W[25..28])
    i += 1;
  }
  // Tail rounds: the schedule no longer needs sha512msg1 feeding.
  //R64 - R67
  vpaddq(xmm0, xmm3, Address(rax, 16 * 32), Assembler::AVX_256bit);
  vpermq(xmm8, xmm3, 0x1b, Assembler::AVX_256bit);//ymm8 = W[64] W[65] W[66] W[67]
  vpermq(xmm9, xmm6, 0x39, Assembler::AVX_256bit);//ymm9 = W[60] W[63] W[62] W[61]
  vpblendd(xmm7, xmm8, xmm9, 0x3f, Assembler::AVX_256bit);//ymm7 = W[64] W[63] W[62] W[61]
  vpaddq(xmm4, xmm4, xmm7, Assembler::AVX_256bit);//ymm4 = W[52..55] + S0(W[53..56]) + W[61..64]
  sha512msg2(xmm4, xmm3);//ymm4 += S1(W[62..65])
  sha512rnds2(xmm12, xmm11, xmm0);
  vperm2i128(xmm0, xmm0, xmm0, 0x01);
  sha512rnds2(xmm11, xmm12, xmm0);
  sha512msg1(xmm6, xmm3);//ymm6 = W[60..63] + S0(W[61..64])

  //R68 - R71
  vpaddq(xmm0, xmm4, Address(rax, 17 * 32), Assembler::AVX_256bit);
  vpermq(xmm8, xmm4, 0x1b, Assembler::AVX_256bit);//ymm8 = W[68] W[69] W[70] W[71]
  vpermq(xmm9, xmm3, 0x39, Assembler::AVX_256bit);//ymm9 = W[64] W[67] W[66] W[65]
  vpblendd(xmm7, xmm8, xmm9, 0x3f, Assembler::AVX_256bit);//ymm7 = W[68] W[67] W[66] W[65]
  vpaddq(xmm5, xmm5, xmm7, Assembler::AVX_256bit);//ymm5 = W[56..59] + S0(W[57..60]) + W[65..68]
  sha512msg2(xmm5, xmm4);//ymm5 += S1(W[66..69])
  sha512rnds2(xmm12, xmm11, xmm0);
  vperm2i128(xmm0, xmm0, xmm0, 0x01);
  sha512rnds2(xmm11, xmm12, xmm0);

  //R72 - R75
  vpaddq(xmm0, xmm5, Address(rax, 18 * 32), Assembler::AVX_256bit);
  vpermq(xmm8, xmm5, 0x1b, Assembler::AVX_256bit);//ymm8 = W[72] W[73] W[74] W[75]
  vpermq(xmm9, xmm4, 0x39, Assembler::AVX_256bit);//ymm9 = W[68] W[71] W[70] W[69]
  vpblendd(xmm7, xmm8, xmm9, 0x3f, Assembler::AVX_256bit);//ymm7 = W[72] W[71] W[70] W[69]
  vpaddq(xmm6, xmm6, xmm7, Assembler::AVX_256bit);//ymm6 = W[60..63] + S0(W[61..64]) + W[69..72]
  sha512msg2(xmm6, xmm5);//ymm6 += S1(W[70..73])
  sha512rnds2(xmm12, xmm11, xmm0);
  vperm2i128(xmm0, xmm0, xmm0, 0x01);
  sha512rnds2(xmm11, xmm12, xmm0);

  //R76 - R79
  vpaddq(xmm0, xmm6, Address(rax, 19 * 32), Assembler::AVX_256bit);
  sha512rnds2(xmm12, xmm11, xmm0);
  vperm2i128(xmm0, xmm0, xmm0, 0x01);
  sha512rnds2(xmm11, xmm12, xmm0);

  //update hash value (feed-forward add of this block's result)
  vpaddq(xmm14, xmm14, xmm12, Assembler::AVX_256bit);
  vpaddq(xmm13, xmm13, xmm11, Assembler::AVX_256bit);

  if (multi_block) {
    addptr(arg_msg, 4 * 32);
    addptr(ofs, 128);
    cmpptr(ofs, limit);
    jcc(Assembler::belowEqual, block_loop);
    movptr(rax, ofs); //return ofs
  }

  //store the hash value back in memory, undoing the ABEF/CDGH layout
  //xmm13 = ABEF
  //xmm14 = CDGH
  vperm2i128(xmm1, xmm13, xmm14, 0x31);
  vperm2i128(xmm2, xmm13, xmm14, 0x20);
  vpermq(xmm1, xmm1, 0xb1, Assembler::AVX_256bit);//ymm1 = D C B A
  vpermq(xmm2, xmm2, 0xb1, Assembler::AVX_256bit);//ymm2 = H G F E
  vmovdqu(Address(arg_hash, 0 * 32), xmm1);
  vmovdqu(Address(arg_hash, 1 * 32), xmm2);

  // NOTE(review): done_hash is bound but never branched to in this function.
  bind(done_hash);
}
|
||||
|
||||
#endif //#ifdef _LP64
|
||||
|
||||
|
||||
@ -1558,7 +1558,7 @@ address StubGenerator::generate_sha256_implCompress(bool multi_block, const char
|
||||
|
||||
address StubGenerator::generate_sha512_implCompress(bool multi_block, const char *name) {
|
||||
assert(VM_Version::supports_avx2(), "");
|
||||
assert(VM_Version::supports_bmi2(), "");
|
||||
assert(VM_Version::supports_bmi2() || VM_Version::supports_sha512(), "");
|
||||
__ align(CodeEntryAlignment);
|
||||
StubCodeMark mark(this, "StubRoutines", name);
|
||||
address start = __ pc();
|
||||
@ -1568,22 +1568,24 @@ address StubGenerator::generate_sha512_implCompress(bool multi_block, const char
|
||||
Register ofs = c_rarg2;
|
||||
Register limit = c_rarg3;
|
||||
|
||||
const XMMRegister msg = xmm0;
|
||||
const XMMRegister state0 = xmm1;
|
||||
const XMMRegister state1 = xmm2;
|
||||
const XMMRegister msgtmp0 = xmm3;
|
||||
const XMMRegister msgtmp1 = xmm4;
|
||||
const XMMRegister msgtmp2 = xmm5;
|
||||
const XMMRegister msgtmp3 = xmm6;
|
||||
const XMMRegister msgtmp4 = xmm7;
|
||||
|
||||
const XMMRegister shuf_mask = xmm8;
|
||||
|
||||
__ enter();
|
||||
|
||||
__ sha512_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
|
||||
buf, state, ofs, limit, rsp, multi_block, shuf_mask);
|
||||
if (VM_Version::supports_sha512()) {
|
||||
__ sha512_update_ni_x1(state, buf, ofs, limit, multi_block);
|
||||
} else {
|
||||
const XMMRegister msg = xmm0;
|
||||
const XMMRegister state0 = xmm1;
|
||||
const XMMRegister state1 = xmm2;
|
||||
const XMMRegister msgtmp0 = xmm3;
|
||||
const XMMRegister msgtmp1 = xmm4;
|
||||
const XMMRegister msgtmp2 = xmm5;
|
||||
const XMMRegister msgtmp3 = xmm6;
|
||||
const XMMRegister msgtmp4 = xmm7;
|
||||
|
||||
const XMMRegister shuf_mask = xmm8;
|
||||
__ sha512_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
|
||||
buf, state, ofs, limit, rsp, multi_block, shuf_mask);
|
||||
}
|
||||
__ vzeroupper();
|
||||
__ leave();
|
||||
__ ret(0);
|
||||
|
||||
@ -1060,6 +1060,7 @@ void VM_Version::get_processor_features() {
|
||||
_features &= ~CPU_AVX;
|
||||
_features &= ~CPU_VZEROUPPER;
|
||||
_features &= ~CPU_F16C;
|
||||
_features &= ~CPU_SHA512;
|
||||
}
|
||||
|
||||
if (logical_processors_per_package() == 1) {
|
||||
@ -1304,7 +1305,7 @@ void VM_Version::get_processor_features() {
|
||||
|
||||
#ifdef _LP64
|
||||
// These are only supported on 64-bit
|
||||
if (UseSHA && supports_avx2() && supports_bmi2()) {
|
||||
if (UseSHA && supports_avx2() && (supports_bmi2() || supports_sha512())) {
|
||||
if (FLAG_IS_DEFAULT(UseSHA512Intrinsics)) {
|
||||
FLAG_SET_DEFAULT(UseSHA512Intrinsics, true);
|
||||
}
|
||||
@ -3007,6 +3008,8 @@ uint64_t VM_Version::CpuidInfo::feature_flags() const {
|
||||
xem_xcr0_eax.bits.ymm != 0) {
|
||||
result |= CPU_AVX;
|
||||
result |= CPU_VZEROUPPER;
|
||||
if (sefsl1_cpuid7_eax.bits.sha512 != 0)
|
||||
result |= CPU_SHA512;
|
||||
if (std_cpuid1_ecx.bits.f16c != 0)
|
||||
result |= CPU_F16C;
|
||||
if (sef_cpuid7_ebx.bits.avx2 != 0) {
|
||||
|
||||
@ -283,7 +283,8 @@ class VM_Version : public Abstract_VM_Version {
|
||||
union SefCpuid7SubLeaf1Eax {
|
||||
uint32_t value;
|
||||
struct {
|
||||
uint32_t : 23,
|
||||
uint32_t sha512 : 1,
|
||||
: 22,
|
||||
avx_ifma : 1,
|
||||
: 8;
|
||||
} bits;
|
||||
@ -415,7 +416,8 @@ protected:
|
||||
decl(CET_SS, "cet_ss", 57) /* Control Flow Enforcement - Shadow Stack */ \
|
||||
decl(AVX512_IFMA, "avx512_ifma", 58) /* Integer Vector FMA instructions*/ \
|
||||
decl(AVX_IFMA, "avx_ifma", 59) /* 256-bit VEX-coded variant of AVX512-IFMA*/ \
|
||||
decl(APX_F, "apx_f", 60) /* Intel Advanced Performance Extensions*/
|
||||
decl(APX_F, "apx_f", 60) /* Intel Advanced Performance Extensions*/\
|
||||
decl(SHA512, "sha512", 61) /* SHA512 instructions*/
|
||||
|
||||
#define DECLARE_CPU_FEATURE_FLAG(id, name, bit) CPU_##id = (1ULL << bit),
|
||||
CPU_FEATURE_FLAGS(DECLARE_CPU_FEATURE_FLAG)
|
||||
@ -757,6 +759,7 @@ public:
|
||||
static bool supports_ospke() { return (_features & CPU_OSPKE) != 0; }
|
||||
static bool supports_cet_ss() { return (_features & CPU_CET_SS) != 0; }
|
||||
static bool supports_cet_ibt() { return (_features & CPU_CET_IBT) != 0; }
|
||||
static bool supports_sha512() { return (_features & CPU_SHA512) != 0; }
|
||||
|
||||
//
|
||||
// Feature identification not affected by VM flags
|
||||
|
||||
@ -256,6 +256,7 @@ public class AMD64 extends Architecture {
|
||||
AVX512_IFMA,
|
||||
AVX_IFMA,
|
||||
APX_F,
|
||||
SHA512,
|
||||
}
|
||||
|
||||
private final EnumSet<CPUFeature> features;
|
||||
|
||||
@ -103,7 +103,7 @@ public class ClhsdbLongConstant {
|
||||
String arch = System.getProperty("os.arch");
|
||||
if (arch.equals("amd64") || arch.equals("i386") || arch.equals("x86")) {
|
||||
// Expected value obtained from the CPU_SHA definition in vm_version_x86.hpp
|
||||
checkLongValue("VM_Version::CPU_SHA",
|
||||
checkLongValue("VM_Version::CPU_SHA ",
|
||||
longConstantOutput,
|
||||
17179869184L);
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user