8371305: X25519 should utilize x86 intrinsics

Reviewed-by: vpaprotski, ascarpino
2026-07-25 18:33:08 +00:00 · 2026-06-05 16:20:42 +00:00 · 2026-06-05 16:20:42 +00:00 · 0dd26b312e
commit 0dd26b312e
parent 0fcf41112c
16 changed files with 476 additions and 7 deletions
--- a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
+++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
@ -4904,6 +4904,11 @@ void StubGenerator::generate_compiler_stubs() {
    StubRoutines::_intpoly_assign = generate_intpoly_assign();
  }

+  if (UseIntPoly25519Intrinsics) {
+    StubRoutines::_intpoly_mult_25519 = generate_intpoly_mult_25519();
+    StubRoutines::_intpoly_square_25519 = generate_intpoly_square_25519();
+  }
+
  if (UseMD5Intrinsics) {
    StubRoutines::_md5_implCompress = generate_md5_implCompress(StubId::stubgen_md5_implCompress_id);
    StubRoutines::_md5_implCompressMB = generate_md5_implCompress(StubId::stubgen_md5_implCompressMB_id);
--- a/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp
+++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp
@ -496,6 +496,9 @@ class StubGenerator: public StubCodeGenerator {
  address generate_intpoly_montgomeryMult_P256();
  address generate_intpoly_assign();

+  address generate_intpoly_mult_25519();
+  address generate_intpoly_square_25519();
+
  // SHA3 stubs
  void generate_sha3_stubs();

--- a/src/hotspot/cpu/x86/stubGenerator_x86_64_poly25519.cpp
+++ b/src/hotspot/cpu/x86/stubGenerator_x86_64_poly25519.cpp
@ -0,0 +1,306 @@
+/*
+ * Copyright (c) 2026, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.  Oracle designates this
+ * particular file as subject to the "Classpath" exception as provided
+ * by Oracle in the LICENSE file that accompanied this code.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+#include "macroAssembler_x86.hpp"
+#include "stubGenerator_x86_64.hpp"
+
+#define __ _masm->
+
+// Multiplier applied to product columns that wrap around past the top limb
+// (the pseudo-Mersenne reduction constant used by the helpers below).
+const int32_t term = 19;
+// Number of 64-bit limbs per field element.
+const int32_t limbs = 5;
+// Number of value bits carried by each limb.
+const int32_t bpl = 51;
+// Bits left over in a 64-bit word above a limb; used to shift the high half
+// of a 128-bit product into position.
+const int32_t rem = 64 - bpl;
+// Low 'bpl' bits set (2^51 - 1): extracts the low limb of a product.
+const uint64_t MASK = 0x7FFFFFFFFFFFF;
+// 2^(bpl - 1): added to a limb before the arithmetic shift right by 'bpl'
+// when computing the carry into the next limb.
+const uint64_t CARRY_ADD = 0x4000000000000;
+
+// Multiplication operation for polynomial arithmetic in Curve25519.
+//
+// This is the same algorithm as used in Java, except we use pseudo-Mersenne
+// reduction to reduce register pressure instead of using the full 10 columns
+// in Java.
+//
+// Emits code that reads 'limbs' 64-bit limbs from each of aLimbs and bLimbs
+// and stores 'limbs' 64-bit result limbs to rLimbs.
+//
+//   aLimbs, bLimbs - registers holding the addresses of the two operands
+//   rLimbs         - register holding the address of the result
+//   c[]            - 'limbs' registers used as column accumulators
+//   bArg           - receives a copy of bLimbs; reused as the carry register
+//                    once all products have been accumulated
+//   d              - scratch for the low 'bpl' bits of each product
+//   b              - holds the current limb of the second operand
+//   mask           - holds MASK during accumulation, then CARRY_ADD
+//
+// rax and rdx are overwritten by the one-operand imulq. The emitted code pops
+// rdx just before storing the result, so the caller must have pushed rdx as
+// its last push before emitting this sequence.
+void multiply_25519_scalar(const Register aLimbs, const Register bLimbs, const Register rLimbs, Register c[], Register bArg, Register d, Register b, Register mask, MacroAssembler* _masm) {
+
+  // Clear the column accumulators
+  for (int i = 0; i < limbs; i++) {
+    __ xorq(c[i], c[i]);
+  }
+  __ mov64(mask, MASK);
+  // Address the second operand through bArg from here on.
+  // NOTE(review): presumably because bLimbs can be rdx (see the register map
+  // in the stub generator), which the imulq below overwrites - confirm.
+  __ movq(bArg, bLimbs);
+
+  // Perform high/low multiplication with signed 5x51 bit limbs
+  for (int i = 0; i < limbs; i++) {
+    __ movq(b, Address(bArg, i * 8));
+    for (int j = 0; j < limbs; j++) {
+      __ movq(rax, Address(aLimbs, j * 8));
+      __ imulq(b);  // rdx:rax = a * b
+      // d = low 'bpl' bits of the product
+      __ movq(d, rax);
+      __ andq(d, mask);
+      // rax = product bits from 'bpl' upwards (low word shifted down, high
+      // word shifted up by 'rem' and merged in)
+      __ shrq(rax, bpl);
+      __ shlq(rdx, rem);
+      __ orq(rax, rdx);
+      // Fold in pseudo-Mersenne reduction
+      // A part whose column index is >= limbs is scaled by 'term' and added
+      // to column (index % limbs) instead.
+      if ((i + j + 1) >= limbs) {
+        __ imulq(rax, rax, term);
+      }
+      if ((i + j) >= limbs) {
+        __ imulq(d, d, term);
+      }
+      __ addq(c[(i + j) % limbs], d);
+      __ addq(c[(i + j + 1) % limbs], rax);
+    }
+  }
+
+  // Carry-add with reduction from high limb
+  // Each step computes carry = (limb + CARRY_ADD) >> bpl (arithmetic shift),
+  // adds it to the next limb and subtracts carry << bpl from this limb.
+  // bArg is no longer needed as an address, so it serves as the carry.
+  Register carry = bArg;
+  __ mov64(mask, CARRY_ADD);
+  __ movq(carry, mask);
+
+  // Limb 3
+  __ addq(carry, c[3]);
+  __ sarq(carry, bpl);
+  __ addq(c[4], carry);
+  __ shlq(carry, bpl);
+  __ subq(c[3], carry);
+
+  // Limb 4
+  __ movq(carry, mask);
+  __ addq(carry, c[4]);
+  __ sarq(carry, bpl);
+
+  // Reduce high order limb and fold back into low order limb
+  // rax = term * carry (one-operand imulq: overwrites rdx, leaves carry intact)
+  __ mov64(rax, term);
+  __ imulq(carry);
+  __ addq(c[0], rax);
+
+  __ shlq(carry, bpl);
+  __ subq(c[4], carry);
+
+  // Limbs 0 - 3
+  for (int i = 0; i < (limbs - 1); i++) {
+    __ movq(carry, mask);
+    __ addq(carry, c[i]);
+    __ sarq(carry, bpl);
+    __ addq(c[i + 1], carry);
+    __ shlq(carry, bpl);
+    __ subq(c[i], carry);
+  }
+
+  // Matches the rdx push made by the caller. This must come after the last
+  // imulq and before the stores below, which go through rLimbs (rLimbs is
+  // rdx on one of the ABIs listed in the caller's register map).
+  __ pop_ppx(rdx);
+
+  for (int i = 0; i < limbs; i++) {
+    __ movq(Address(rLimbs, i * 8), c[i]);
+  }
+}
+
+// Squaring operation for polynomial arithmetic in Curve25519.
+//
+// This is the same algorithm as used in Java, except we use pseudo-Mersenne
+// reduction to reduce register pressure instead of using the full 10 columns
+// in Java.
+//
+// Emits code that reads 'limbs' 64-bit limbs from aLimbs and stores 'limbs'
+// 64-bit result limbs to rLimbs.
+//
+//   aLimbs - register holding the address of the operand
+//   rLimbs - register holding the address of the result
+//   c[]    - 'limbs' registers used as column accumulators
+//   aArg   - holds the current limb a[i]
+//   d      - scratch for the low 'bpl' bits of each product
+//   carry  - scratch for the carry propagation at the end
+//   mask   - holds MASK during accumulation, then CARRY_ADD
+//
+// rax and rdx are overwritten by the one-operand imulq. The emitted code pops
+// rdx just before storing the result, so the caller must have pushed rdx as
+// its last push before emitting this sequence.
+void square_25519_scalar(const Register aLimbs, const Register rLimbs, Register c[], Register aArg, Register d, Register carry, Register mask, MacroAssembler* _masm) {
+
+  // Clear the column accumulators
+  for (int i = 0; i < limbs; i++) {
+    __ xorq(c[i], c[i]);
+  }
+  __ mov64(mask, MASK);
+
+  // Perform high/low multiplication with signed 5x51 bit limbs
+  for (int i = 0; i < limbs; i++) {
+    // Diagonal term a[i] * a[i], accumulated once
+    __ movq(aArg, Address(aLimbs, i * 8));
+    __ movq(rax, aArg);
+    __ imulq(aArg);   // rdx:rax = a[i] * a[i]
+    __ movq(d, rax);
+    __ andq(d, mask);
+    __ shrq(rax, bpl);
+    __ shlq(rdx, rem);
+    __ orq(rax, rdx); // rax = dd
+    // A part whose column index is >= limbs is scaled by 'term' and added
+    // to column (index % limbs) instead.
+    if ((i * 2 + 1) >= limbs) {
+      __ imulq(rax, rax, term);
+    }
+    if ((i * 2) >= limbs) {
+      __ imulq(d, d, term);
+    }
+    __ addq(c[(i * 2) % limbs], d);
+    __ addq(c[(i * 2 + 1) % limbs], rax);
+    // Cross terms a[j] * a[i] for j > i; both halves are doubled (shift left
+    // by one) so that each pair is only multiplied once.
+    for (int j = i + 1; j < limbs; j++) {
+      __ movq(rax, Address(aLimbs, j * 8));
+      __ imulq(aArg);   // rdx:rax = a[j] * a[i]
+      __ movq(d, rax);
+      __ andq(d, mask);
+      __ shlq(d, 1);
+      __ shrq(rax, bpl);
+      __ shlq(rdx, rem);
+      __ orq(rax, rdx); // rax = dd
+      __ shlq(rax, 1);
+      if ((j + i + 1) >= limbs) {
+        __ imulq(rax, rax, term);
+      }
+      if ((j + i) >= limbs) {
+        __ imulq(d, d, term);
+      }
+      __ addq(c[(i + j) % limbs], d);
+      __ addq(c[(i + j + 1) % limbs], rax);
+    }
+  }
+
+  // Carry-add with reduction from high limb
+  // Each step computes carry = (limb + CARRY_ADD) >> bpl (arithmetic shift),
+  // adds it to the next limb and subtracts carry << bpl from this limb.
+  // Limb 3
+  __ mov64(mask, CARRY_ADD);
+  __ movq(carry, mask);
+  __ addq(carry, c[3]);
+  __ sarq(carry, bpl);
+  __ addq(c[4], carry);
+  __ shlq(carry, bpl);
+  __ subq(c[3], carry);
+
+  // Limb 4
+  __ movq(carry, mask);
+  __ addq(carry, c[4]);
+  __ sarq(carry, bpl);
+
+  // Reduce high order limb and fold back into low order limb
+  // rax = term * carry (one-operand imulq: overwrites rdx, leaves carry intact)
+  __ mov64(rax, term);
+  __ imulq(carry);
+  __ addq(c[0], rax);
+
+  __ shlq(carry, bpl);
+  __ subq(c[4], carry);
+
+  // Limbs 0 - 3
+  for (int i = 0; i < (limbs - 1); i++) {
+    __ movq(carry, mask);
+    __ addq(carry, c[i]);
+    __ sarq(carry, bpl);
+    __ addq(c[i + 1], carry);
+    __ shlq(carry, bpl);
+    __ subq(c[i], carry);
+  }
+
+  // Matches the rdx push made by the caller. This must come after the last
+  // imulq and before the stores below, which go through rLimbs (rLimbs is
+  // rdx on one of the ABIs listed in the caller's register map).
+  __ pop_ppx(rdx);
+
+  for (int i = 0; i < limbs; i++) {
+    __ movq(Address(rLimbs, i * 8), c[i]);
+  }
+}
+
+// Generates the intpoly_mult_25519 stub.
+//
+// Stub arguments: c_rarg0 = a limbs, c_rarg1 = b limbs, c_rarg2 = result limbs.
+// Returns the stub entry address. If load_archive_data() already provides an
+// address for this stub, that address is returned and no code is emitted.
+address StubGenerator::generate_intpoly_mult_25519() {
+  StubId stub_id = StubId::stubgen_intpoly_mult_25519_id;
+  int entry_count = StubInfo::entry_count(stub_id);
+  assert(entry_count == 1, "sanity check");
+  address start = load_archive_data(stub_id);
+  if (start != nullptr) {
+    return start;
+  }
+  __ align(CodeEntryAlignment);
+  StubCodeMark mark(this, stub_id);
+  start = __ pc();
+  __ enter();
+
+  // Register Map
+  const Register aLimbs  = c_rarg0; // rdi | rcx
+  const Register bLimbs  = c_rarg1; // rsi | rdx
+  const Register rLimbs  = c_rarg2; // rdx | r8
+
+  Register c[]   = {r9, r10, r11, r12, r13};
+  Register bArg  = r14;
+  Register d     = r15;
+  // rbp is used as a scratch register between enter() and leave(); it is
+  // saved and restored by the push/pop pair below.
+  Register b     = rbp;
+  Register mask  = rbx;
+
+  __ push_ppx(rbp);
+  __ push_ppx(rbx);
+  __ push_ppx(r12);
+  __ push_ppx(r13);
+  __ push_ppx(r14);
+  __ push_ppx(r15);
+  // rdx must be the last push: the helper pops it after its final imulq
+  // (which overwrites rdx) and before it stores through rLimbs.
+  __ push_ppx(rdx);
+
+  multiply_25519_scalar(aLimbs, bLimbs, rLimbs, c, bArg, d, b, mask, _masm);
+
+  // __ pop_ppx(rdx); // restored in the helper already
+  __ pop_ppx(r15);
+  __ pop_ppx(r14);
+  __ pop_ppx(r13);
+  __ pop_ppx(r12);
+  __ pop_ppx(rbx);
+  __ pop_ppx(rbp);
+
+  __ leave();
+  __ ret(0);
+
+  // Record the stub entry and end
+  store_archive_data(stub_id, start, __ pc());
+
+  return start;
+}
+
+// Generates the intpoly_square_25519 stub.
+//
+// Stub arguments: c_rarg0 = a limbs, c_rarg1 = result limbs.
+// Returns the stub entry address. If load_archive_data() already provides an
+// address for this stub, that address is returned and no code is emitted.
+address StubGenerator::generate_intpoly_square_25519() {
+  StubId stub_id = StubId::stubgen_intpoly_square_25519_id;
+  int entry_count = StubInfo::entry_count(stub_id);
+  assert(entry_count == 1, "sanity check");
+  address start = load_archive_data(stub_id);
+  if (start != nullptr) {
+    return start;
+  }
+  __ align(CodeEntryAlignment);
+  StubCodeMark mark(this, stub_id);
+  start = __ pc();
+  __ enter();
+
+  // Register Map
+  const Register aLimbs  = c_rarg0; // rdi | rcx
+  const Register rLimbs  = c_rarg1; // rsi | rdx
+  Register c[]   = {r9, r10, r11, r12, r13};
+  Register aArg  = r14;
+  Register d     = r15;
+  // rbp is used as a scratch register between enter() and leave(); it is
+  // saved and restored by the push/pop pair below.
+  Register carry = rbp;
+  Register mask  = rbx;
+
+  __ push_ppx(rbp);
+  __ push_ppx(rbx);
+  __ push_ppx(r12);
+  __ push_ppx(r13);
+  __ push_ppx(r14);
+  __ push_ppx(r15);
+  // rdx must be the last push: the helper pops it after its final imulq
+  // (which overwrites rdx) and before it stores through rLimbs.
+  __ push_ppx(rdx);
+
+  square_25519_scalar(aLimbs, rLimbs, c, aArg, d, carry, mask, _masm);
+
+  // __ pop_ppx(rdx); // restored in the helper already
+  __ pop_ppx(r15);
+  __ pop_ppx(r14);
+  __ pop_ppx(r13);
+  __ pop_ppx(r12);
+  __ pop_ppx(rbx);
+  __ pop_ppx(rbp);
+
+  __ leave();
+  __ ret(0);
+
+  // Record the stub entry and end
+  store_archive_data(stub_id, start, __ pc());
+
+  return start;
+}
+#undef __
--- a/src/hotspot/cpu/x86/stubGenerator_x86_64_poly_mont.cpp
+++ b/src/hotspot/cpu/x86/stubGenerator_x86_64_poly_mont.cpp
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, 2025, Intel Corporation. All rights reserved.
+ * Copyright (c) 2024, 2026, Intel Corporation. All rights reserved.
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
@ -676,7 +676,7 @@ address StubGenerator::generate_intpoly_assign() {
  // KNOWN Lengths:
  //   MontgomeryIntPolynP256:  5 = 4 + 1
  //   IntegerPolynomial1305:   5 = 4 + 1
-  //   IntegerPolynomial25519: 10 = 8 + 2
+  //   IntegerPolynomial25519:  5 = 4 + 1
  //   IntegerPolynomialP256:  10 = 8 + 2
  //   Curve25519OrderField:   10 = 8 + 2
  //   Curve25519OrderField:   10 = 8 + 2
--- a/src/hotspot/cpu/x86/vm_version_x86.cpp
+++ b/src/hotspot/cpu/x86/vm_version_x86.cpp
@ -1407,6 +1407,10 @@ void VM_Version::get_processor_features() {
    FLAG_SET_DEFAULT(UseIntPolyIntrinsics, false);
  }

+  if (FLAG_IS_DEFAULT(UseIntPoly25519Intrinsics)) {
+    UseIntPoly25519Intrinsics = true;
+  }
+
  if (FLAG_IS_DEFAULT(UseMultiplyToLenIntrinsic)) {
    UseMultiplyToLenIntrinsic = true;
  }
--- a/src/hotspot/share/classfile/vmIntrinsics.cpp
+++ b/src/hotspot/share/classfile/vmIntrinsics.cpp
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -527,6 +527,10 @@ bool vmIntrinsics::disabled_by_jvm_flags(vmIntrinsics::ID id) {
  case vmIntrinsics::_intpoly_assign:
    if (!UseIntPolyIntrinsics) return true;
    break;
+  case vmIntrinsics::_intpoly_mult_25519:
+  case vmIntrinsics::_intpoly_square_25519:
+    if (!UseIntPoly25519Intrinsics) return true;
+    break;
  case vmIntrinsics::_updateBytesCRC32C:
  case vmIntrinsics::_updateDirectByteBufferCRC32C:
    if (!UseCRC32CIntrinsics) return true;
--- a/src/hotspot/share/classfile/vmIntrinsics.hpp
+++ b/src/hotspot/share/classfile/vmIntrinsics.hpp
@ -549,6 +549,13 @@ class methodHandle;
   do_name(intPolyAssign_name, "conditionalAssign")                                                                     \
   do_signature(intPolyAssign_signature, "(I[J[J)V")                                                                    \
                                                                                                                        \
+  /* support for sun.security.util.math.intpoly.IntegerPolynomial25519 */                                               \
+  do_class(sun_security_util_math_intpoly_IntegerPolynomial25519, "sun/security/util/math/intpoly/IntegerPolynomial25519") \
+  do_intrinsic(_intpoly_mult_25519, sun_security_util_math_intpoly_IntegerPolynomial25519, intPolyMult_name, intPolyMult_signature, F_R) \
+  do_intrinsic(_intpoly_square_25519, sun_security_util_math_intpoly_IntegerPolynomial25519, intPolySquare_name, intPolySquare_signature, F_R) \
+  do_name(intPolySquare_name, "square")                                                                                  \
+  do_signature(intPolySquare_signature, "([J[J)V")                                                                       \
+                                                                                                                         \
  /* support for java.util.Base64.Encoder*/                                                                             \
  do_class(java_util_Base64_Encoder, "java/util/Base64$Encoder")                                                        \
  do_intrinsic(_base64_encodeBlock, java_util_Base64_Encoder, encodeBlock_name, encodeBlock_signature, F_R)             \
--- a/src/hotspot/share/opto/c2compiler.cpp
+++ b/src/hotspot/share/opto/c2compiler.cpp
@ -819,6 +819,8 @@ bool C2Compiler::is_intrinsic_supported(vmIntrinsics::ID id) {
  case vmIntrinsics::_poly1305_processBlocks:
  case vmIntrinsics::_intpoly_montgomeryMult_P256:
  case vmIntrinsics::_intpoly_assign:
+  case vmIntrinsics::_intpoly_mult_25519:
+  case vmIntrinsics::_intpoly_square_25519:
  case vmIntrinsics::_updateCRC32:
  case vmIntrinsics::_updateBytesCRC32:
  case vmIntrinsics::_updateByteBufferCRC32:
--- a/src/hotspot/share/opto/escape.cpp
+++ b/src/hotspot/share/opto/escape.cpp
@ -2272,6 +2272,8 @@ void ConnectionGraph::process_call_arguments(CallNode *call) {
                  strcmp(call->as_CallLeaf()->_name, "poly1305_processBlocks") == 0 ||
                  strcmp(call->as_CallLeaf()->_name, "intpoly_montgomeryMult_P256") == 0 ||
                  strcmp(call->as_CallLeaf()->_name, "intpoly_assign") == 0 ||
+                  strcmp(call->as_CallLeaf()->_name, "intpoly_mult_25519") == 0 ||
+                  strcmp(call->as_CallLeaf()->_name, "intpoly_square_25519") == 0 ||
                  strcmp(call->as_CallLeaf()->_name, "ghash_processBlocks") == 0 ||
                  strcmp(call->as_CallLeaf()->_name, "chacha20Block") == 0 ||
                  strcmp(call->as_CallLeaf()->_name, "kyberNtt") == 0 ||
--- a/src/hotspot/share/opto/library_call.cpp
+++ b/src/hotspot/share/opto/library_call.cpp
@ -666,6 +666,10 @@ bool LibraryCallKit::try_to_inline(int predicate) {
    return inline_intpoly_montgomeryMult_P256();
  case vmIntrinsics::_intpoly_assign:
    return inline_intpoly_assign();
+  case vmIntrinsics::_intpoly_mult_25519:
+    return inline_intpoly_mult_25519();
+  case vmIntrinsics::_intpoly_square_25519:
+    return inline_intpoly_square_25519();
  case vmIntrinsics::_encodeISOArray:
  case vmIntrinsics::_encodeByteISOArray:
    return inline_encodeISOArray(false);
@ -8373,6 +8377,70 @@ bool LibraryCallKit::inline_intpoly_assign() {
  return true;
 }

+// Expands the _intpoly_mult_25519 intrinsic into a leaf call to the
+// intpoly_mult_25519 stub, passing the addresses of element 0 of the three
+// long[] arguments. Returns false when no stub is available, true otherwise.
+bool LibraryCallKit::inline_intpoly_mult_25519() {
+  assert(UseIntPoly25519Intrinsics, "need intpoly25519 intrinsics support");
+  assert(callee()->signature()->size() == 3, "intpoly_mult_25519 has %d parameters", callee()->signature()->size());
+
+  const char* const stub_name = "intpoly_mult_25519";
+  const address stub_addr = StubRoutines::intpoly_mult_25519();
+  if (!stub_addr) {
+    return false;
+  }
+
+  null_check_receiver();  // null-check receiver
+  if (stopped()) {
+    return true;
+  }
+
+  // Argument 0 is the receiver; 1..3 are the operand and result arrays.
+  Node* lhs = argument(1);
+  Node* rhs = argument(2);
+  Node* res = argument(3);
+
+  lhs = must_be_not_null(lhs, true);
+  rhs = must_be_not_null(rhs, true);
+  res = must_be_not_null(res, true);
+
+  Node* lhs_base = array_element_address(lhs, intcon(0), T_LONG);
+  assert(lhs_base, "a array is null");
+  Node* rhs_base = array_element_address(rhs, intcon(0), T_LONG);
+  assert(rhs_base, "b array is null");
+  Node* res_base = array_element_address(res, intcon(0), T_LONG);
+  assert(res_base, "r array is null");
+
+  // The stub returns nothing, so the call node is not kept.
+  make_runtime_call(RC_LEAF | RC_NO_FP,
+                    OptoRuntime::intpoly_mult_25519_Type(),
+                    stub_addr, stub_name, TypePtr::BOTTOM,
+                    lhs_base, rhs_base, res_base);
+  return true;
+}
+
+// Expands the _intpoly_square_25519 intrinsic into a leaf call to the
+// intpoly_square_25519 stub, passing the addresses of element 0 of the two
+// long[] arguments. Returns false when no stub is available, true otherwise.
+bool LibraryCallKit::inline_intpoly_square_25519() {
+  address stubAddr;
+  const char *stubName;
+  assert(UseIntPoly25519Intrinsics, "need intpoly25519 intrinsics support");
+  // The message names this intrinsic (it previously said intpoly_mult_25519).
+  assert(callee()->signature()->size() == 2, "intpoly_square_25519 has %d parameters", callee()->signature()->size());
+  stubAddr = StubRoutines::intpoly_square_25519();
+  stubName = "intpoly_square_25519";
+
+  if (!stubAddr) return false;
+  null_check_receiver();  // null-check receiver
+  if (stopped())  return true;
+
+  // Argument 0 is the receiver; 1 is the operand array, 2 the result array.
+  Node* a = argument(1);
+  Node* r = argument(2);
+
+  a = must_be_not_null(a, true);
+  r = must_be_not_null(r, true);
+
+  Node* a_start = array_element_address(a, intcon(0), T_LONG);
+  assert(a_start, "a array is null");
+  Node* r_start = array_element_address(r, intcon(0), T_LONG);
+  assert(r_start, "r array is null");
+
+  // The stub returns nothing, so the call node is not kept.
+  make_runtime_call(RC_LEAF | RC_NO_FP,
+                    OptoRuntime::intpoly_square_25519_Type(),
+                    stubAddr, stubName, TypePtr::BOTTOM,
+                    a_start, r_start);
+  return true;
+}
+
 //------------------------------inline_digestBase_implCompress-----------------------
 //
 // Calculate MD5 for single-block byte[] array.
--- a/src/hotspot/share/opto/library_call.hpp
+++ b/src/hotspot/share/opto/library_call.hpp
@ -343,6 +343,8 @@ class LibraryCallKit : public GraphKit {
  bool inline_poly1305_processBlocks();
  bool inline_intpoly_montgomeryMult_P256();
  bool inline_intpoly_assign();
+  bool inline_intpoly_mult_25519();
+  bool inline_intpoly_square_25519();
  bool inline_digestBase_implCompress(vmIntrinsics::ID id);
  bool inline_keccak(vmIntrinsics::ID id);
  bool inline_digestBase_implCompressMB(int predicate);
--- a/src/hotspot/share/opto/runtime.cpp
+++ b/src/hotspot/share/opto/runtime.cpp
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998, 2025, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1998, 2026, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -237,6 +237,8 @@ const TypeFunc* OptoRuntime::_string_IndexOf_Type                 = nullptr;
 const TypeFunc* OptoRuntime::_poly1305_processBlocks_Type         = nullptr;
 const TypeFunc* OptoRuntime::_intpoly_montgomeryMult_P256_Type    = nullptr;
 const TypeFunc* OptoRuntime::_intpoly_assign_Type                 = nullptr;
+const TypeFunc* OptoRuntime::_intpoly_mult_25519_Type             = nullptr;
+const TypeFunc* OptoRuntime::_intpoly_square_25519_Type           = nullptr;
 const TypeFunc* OptoRuntime::_updateBytesCRC32_Type               = nullptr;
 const TypeFunc* OptoRuntime::_updateBytesCRC32C_Type              = nullptr;
 const TypeFunc* OptoRuntime::_updateBytesAdler32_Type             = nullptr;
@ -1786,6 +1788,41 @@ static const TypeFunc* make_intpoly_assign_Type() {
  return TypeFunc::make(domain, range);
 }

+// Builds the call signature of the intpoly_mult_25519 stub:
+// (a array, b array, r(esult) array) -> void, all arguments non-null pointers.
+static const TypeFunc* make_intpoly_mult_25519_Type() {
+  const int num_args = 3;
+
+  // Domain: a array, b array, r(esult) array
+  const Type** domain_fields = TypeTuple::fields(num_args);
+  int next = TypeFunc::Parms;
+  while (next < TypeFunc::Parms + num_args) {
+    domain_fields[next++] = TypePtr::NOTNULL;
+  }
+  assert(next == TypeFunc::Parms + num_args, "correct decoding");
+  const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms + num_args, domain_fields);
+
+  // Range: no result
+  const Type** range_fields = TypeTuple::fields(1);
+  range_fields[TypeFunc::Parms + 0] = nullptr; // void
+  const TypeTuple* range = TypeTuple::make(TypeFunc::Parms, range_fields);
+
+  return TypeFunc::make(domain, range);
+}
+
+// Builds the call signature of the intpoly_square_25519 stub:
+// (a array, r(esult) array) -> void, both arguments non-null pointers.
+static const TypeFunc* make_intpoly_square_25519_Type() {
+  const int num_args = 2;
+
+  // Domain: a array, r(esult) array
+  const Type** domain_fields = TypeTuple::fields(num_args);
+  int next = TypeFunc::Parms;
+  while (next < TypeFunc::Parms + num_args) {
+    domain_fields[next++] = TypePtr::NOTNULL;
+  }
+  assert(next == TypeFunc::Parms + num_args, "correct decoding");
+  const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms + num_args, domain_fields);
+
+  // Range: no result
+  const Type** range_fields = TypeTuple::fields(1);
+  range_fields[TypeFunc::Parms + 0] = nullptr; // void
+  const TypeTuple* range = TypeTuple::make(TypeFunc::Parms, range_fields);
+
+  return TypeFunc::make(domain, range);
+}
+
 //------------- Interpreter state for on stack replacement
 static const TypeFunc* make_osr_end_Type() {
  // create input type (domain)
@ -2354,6 +2391,8 @@ void OptoRuntime::initialize_types() {
  _poly1305_processBlocks_Type        = make_poly1305_processBlocks_Type();
  _intpoly_montgomeryMult_P256_Type   = make_intpoly_montgomeryMult_P256_Type();
  _intpoly_assign_Type                = make_intpoly_assign_Type();
+  _intpoly_mult_25519_Type            = make_intpoly_mult_25519_Type();
+  _intpoly_square_25519_Type          = make_intpoly_square_25519_Type();
  _updateBytesCRC32_Type              = make_updateBytesCRC32_Type();
  _updateBytesCRC32C_Type             = make_updateBytesCRC32C_Type();
  _updateBytesAdler32_Type            = make_updateBytesAdler32_Type();
--- a/src/hotspot/share/opto/runtime.hpp
+++ b/src/hotspot/share/opto/runtime.hpp
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998, 2025, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1998, 2026, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -190,6 +190,8 @@ class OptoRuntime : public AllStatic {
  static const TypeFunc* _poly1305_processBlocks_Type;
  static const TypeFunc* _intpoly_montgomeryMult_P256_Type;
  static const TypeFunc* _intpoly_assign_Type;
+  static const TypeFunc* _intpoly_mult_25519_Type;
+  static const TypeFunc* _intpoly_square_25519_Type;
  static const TypeFunc* _updateBytesCRC32_Type;
  static const TypeFunc* _updateBytesCRC32C_Type;
  static const TypeFunc* _updateBytesAdler32_Type;
@ -687,6 +689,18 @@ private:
    return _intpoly_assign_Type;
  }

+  // IntegerPolynomial25519 multiply function
+  // Returns the cached _intpoly_mult_25519_Type; asserts that it has been
+  // initialized (non-null).
+  static inline const TypeFunc* intpoly_mult_25519_Type() {
+    assert(_intpoly_mult_25519_Type != nullptr, "should be initialized");
+    return _intpoly_mult_25519_Type;
+  }
+
+  // IntegerPolynomial25519 square function
+  // Returns the cached _intpoly_square_25519_Type; asserts that it has been
+  // initialized (non-null).
+  static inline const TypeFunc* intpoly_square_25519_Type() {
+    assert(_intpoly_square_25519_Type != nullptr, "should be initialized");
+    return _intpoly_square_25519_Type;
+  }
+
  /**
   * int updateBytesCRC32(int crc, byte* b, int len)
   */
--- a/src/hotspot/share/runtime/globals.hpp
+++ b/src/hotspot/share/runtime/globals.hpp
@ -229,9 +229,13 @@ const int ObjectAlignmentInBytes = 8;
                                                                            \
  product(bool, UsePoly1305Intrinsics, false, DIAGNOSTIC,                   \
          "Use intrinsics for sun.security.util.math.intpoly")              \
-  product(bool, UseIntPolyIntrinsics, false, DIAGNOSTIC,                   \
+                                                                            \
+  product(bool, UseIntPolyIntrinsics, false, DIAGNOSTIC,                    \
          "Use intrinsics for sun.security.util.math.intpoly.MontgomeryIntegerPolynomialP256") \
                                                                            \
+  product(bool, UseIntPoly25519Intrinsics, false, DIAGNOSTIC,               \
+          "Use intrinsics for sun.security.util.math.intpoly.IntegerPolynomial25519") \
+                                                                            \
  product(size_t, LargePageSizeInBytes, 0,                                  \
          "Maximum large page size used (0 will use the default large "     \
          "page size for the environment as the maximum) "                  \
--- a/src/hotspot/share/runtime/stubDeclarations.hpp
+++ b/src/hotspot/share/runtime/stubDeclarations.hpp
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2025, 2026, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2025, Red Hat, Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
@ -801,6 +801,12 @@
           intpoly_montgomeryMult_P256, intpoly_montgomeryMult_P256)    \
  do_stub(compiler, intpoly_assign)                                     \
  do_entry(compiler, intpoly_assign, intpoly_assign, intpoly_assign)    \
+  do_stub(compiler, intpoly_mult_25519)                                 \
+  do_entry(compiler, intpoly_mult_25519,                                \
+           intpoly_mult_25519, intpoly_mult_25519)                      \
+  do_stub(compiler, intpoly_square_25519)                               \
+  do_entry(compiler, intpoly_square_25519,                              \
+           intpoly_square_25519, intpoly_square_25519)                  \
  do_stub(compiler, md5_implCompress)                                   \
  do_entry(compiler, md5_implCompress, md5_implCompress,                \
           md5_implCompress)                                            \
--- a/src/java.base/share/classes/sun/security/util/math/intpoly/IntegerPolynomial25519.java
+++ b/src/java.base/share/classes/sun/security/util/math/intpoly/IntegerPolynomial25519.java
@ -26,6 +26,7 @@
 package sun.security.util.math.intpoly;

 import java.math.BigInteger;
+import jdk.internal.vm.annotation.IntrinsicCandidate;

 public final class IntegerPolynomial25519 extends IntegerPolynomial {
    private static final int BITS_PER_LIMB = 51;
@ -235,6 +236,7 @@ public final class IntegerPolynomial25519 extends IntegerPolynomial {
     * @param b [in] the limb operand to multiply.
     * @param r [out] the product of the limbs operands that is fully reduced.
     */
+    @IntrinsicCandidate
    protected void mult(long[] a, long[] b, long[] r) {
        long aa0 = a[0];
        long aa1 = a[1];
@ -414,6 +416,7 @@ public final class IntegerPolynomial25519 extends IntegerPolynomial {
     * @param a [in] the limb operand to square.
     * @param r [out] the resulting square of the limb which is fully reduced.
     */
+    @IntrinsicCandidate
    protected void square(long[] a, long[] r) {
        long aa0 = a[0];
        long aa1 = a[1];