From 0dd26b312ebe2af1b2d1639fc6355022834896eb Mon Sep 17 00:00:00 2001
From: Shawn Emery <semery@openjdk.org>
Date: Fri, 5 Jun 2026 16:20:42 +0000
Subject: [PATCH] 8371305: X25519 should utilize x86 intrinsics

Reviewed-by: vpaprotski, ascarpino
---
 src/hotspot/cpu/x86/stubGenerator_x86_64.cpp  |   5 +
 src/hotspot/cpu/x86/stubGenerator_x86_64.hpp  |   3 +
 .../x86/stubGenerator_x86_64_poly25519.cpp    | 306 ++++++++++++++++++
 .../x86/stubGenerator_x86_64_poly_mont.cpp    |   4 +-
 src/hotspot/cpu/x86/vm_version_x86.cpp        |   4 +
 src/hotspot/share/classfile/vmIntrinsics.cpp  |   6 +-
 src/hotspot/share/classfile/vmIntrinsics.hpp  |   7 +
 src/hotspot/share/opto/c2compiler.cpp         |   2 +
 src/hotspot/share/opto/escape.cpp             |   2 +
 src/hotspot/share/opto/library_call.cpp       |  68 ++++
 src/hotspot/share/opto/library_call.hpp       |   2 +
 src/hotspot/share/opto/runtime.cpp            |  41 ++-
 src/hotspot/share/opto/runtime.hpp            |  16 +-
 src/hotspot/share/runtime/globals.hpp         |   6 +-
 .../share/runtime/stubDeclarations.hpp        |   8 +-
 .../math/intpoly/IntegerPolynomial25519.java  |   3 +
 16 files changed, 476 insertions(+), 7 deletions(-)
 create mode 100644 src/hotspot/cpu/x86/stubGenerator_x86_64_poly25519.cpp

diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
index b64943fc4de..afd9c126a21 100644
--- a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
+++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
@@ -4904,6 +4904,11 @@ void StubGenerator::generate_compiler_stubs() {
     StubRoutines::_intpoly_assign = generate_intpoly_assign();
   }
 
+  if (UseIntPoly25519Intrinsics) {
+    StubRoutines::_intpoly_mult_25519 = generate_intpoly_mult_25519();
+    StubRoutines::_intpoly_square_25519 = generate_intpoly_square_25519();
+  }
+
   if (UseMD5Intrinsics) {
     StubRoutines::_md5_implCompress = generate_md5_implCompress(StubId::stubgen_md5_implCompress_id);
     StubRoutines::_md5_implCompressMB = generate_md5_implCompress(StubId::stubgen_md5_implCompressMB_id);
diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp b/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp
index 360b0329d95..6e3da334f11 100644
--- a/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp
+++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp
@@ -496,6 +496,9 @@ class StubGenerator: public StubCodeGenerator {
   address generate_intpoly_montgomeryMult_P256();
   address generate_intpoly_assign();
 
+  address generate_intpoly_mult_25519();
+  address generate_intpoly_square_25519();
+
   // SHA3 stubs
   void generate_sha3_stubs();
 
diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64_poly25519.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64_poly25519.cpp
new file mode 100644
index 00000000000..c7395220d49
--- /dev/null
+++ b/src/hotspot/cpu/x86/stubGenerator_x86_64_poly25519.cpp
@@ -0,0 +1,306 @@
+/*
+ * Copyright (c) 2026, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.  Oracle designates this
+ * particular file as subject to the "Classpath" exception as provided
+ * by Oracle in the LICENSE file that accompanied this code.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+#include "macroAssembler_x86.hpp"
+#include "stubGenerator_x86_64.hpp"
+
+#define __ _masm->
+
+const int32_t term = 19;                        // fold multiplier: columns at or above 'limbs' are scaled by this
+const int32_t limbs = 5;                        // number of 64-bit limbs per field element
+const int32_t bpl = 51;                         // bits per limb (shift used to split products and carries)
+const int32_t rem = 64 - bpl;                   // left shift applied to the high product word (rdx)
+const uint64_t MASK = 0x7FFFFFFFFFFFF;          // 2^51 - 1: selects the low 'bpl' bits
+const uint64_t CARRY_ADD = 0x4000000000000;     // 2^50: added before the arithmetic shift in each carry step
+
+// Emits code for r = a * b in the Curve25519 field, operating on 5 limbs of 51 bits (signed).
+//
+// This is the same algorithm as used in Java, except that the pseudo-Mersenne reduction (multiply
+// by 'term') is folded into the accumulation, so 5 column registers suffice instead of the full 10
+// columns in Java. Clobbers rax/rdx; pops rdx at the end, so the caller must have pushed rdx last.
+static void multiply_25519_scalar(const Register aLimbs, const Register bLimbs, const Register rLimbs, Register c[], Register bArg, Register d, Register b, Register mask, MacroAssembler* _masm) {
+  // Clear the column accumulators.
+  for (int i = 0; i < limbs; i++) {
+    __ xorq(c[i], c[i]);
+  }
+  __ mov64(mask, MASK);
+  __ movq(bArg, bLimbs);  // bLimbs may live in rdx, which every one-operand imulq below overwrites
+
+  // Perform high/low multiplication with signed 5x51 bit limbs
+  for (int i = 0; i < limbs; i++) {
+    __ movq(b, Address(bArg, i * 8));
+    for (int j = 0; j < limbs; j++) {
+      __ movq(rax, Address(aLimbs, j * 8));
+      __ imulq(b);  // rdx:rax = a * b
+      __ movq(d, rax);
+      __ andq(d, mask);     // d = low 51 bits of the product
+      __ shrq(rax, bpl);
+      __ shlq(rdx, rem);
+      __ orq(rax, rdx);     // rax = product >> 51, truncated to 64 bits
+      // Fold in pseudo-Mersenne reduction
+      if ((i + j + 1) >= limbs) {
+        __ imulq(rax, rax, term);
+      }
+      if ((i + j) >= limbs) {
+        __ imulq(d, d, term);
+      }
+      __ addq(c[(i + j) % limbs], d);
+      __ addq(c[(i + j + 1) % limbs], rax);
+    }
+  }
+
+  // Carry-add with reduction from high limb
+  Register carry = bArg;  // the b pointer is dead after the loops above
+  __ mov64(mask, CARRY_ADD);  // mask is reused to hold the carry rounding constant
+  __ movq(carry, mask);
+
+  // Limb 3
+  __ addq(carry, c[3]);
+  __ sarq(carry, bpl);
+  __ addq(c[4], carry);
+  __ shlq(carry, bpl);
+  __ subq(c[3], carry);
+
+  // Limb 4
+  __ movq(carry, mask);
+  __ addq(carry, c[4]);
+  __ sarq(carry, bpl);
+
+  // Reduce high order limb and fold back into low order limb
+  __ mov64(rax, term);
+  __ imulq(carry);  // rax = term * carry (rdx is overwritten as well)
+  __ addq(c[0], rax);
+
+  __ shlq(carry, bpl);
+  __ subq(c[4], carry);
+
+  // Limbs 0 - 3
+  for (int i = 0; i < (limbs - 1); i++) {
+    __ movq(carry, mask);
+    __ addq(carry, c[i]);
+    __ sarq(carry, bpl);
+    __ addq(c[i + 1], carry);
+    __ shlq(carry, bpl);
+    __ subq(c[i], carry);
+  }
+
+  __ pop_ppx(rdx);  // recover the rdx pushed by the caller: rLimbs may be rdx and imulq clobbered it
+
+  for (int i = 0; i < limbs; i++) {
+    __ movq(Address(rLimbs, i * 8), c[i]);
+  }
+}
+
+// Emits code for r = a * a in the Curve25519 field, operating on 5 limbs of 51 bits (signed).
+//
+// This is the same algorithm as used in Java, except that the pseudo-Mersenne reduction (multiply
+// by 'term') is folded into the accumulation, so 5 column registers suffice instead of the full 10
+// columns in Java. Clobbers rax/rdx; pops rdx at the end, so the caller must have pushed rdx last.
+static void square_25519_scalar(const Register aLimbs, const Register rLimbs, Register c[], Register aArg, Register d, Register carry, Register mask, MacroAssembler* _masm) {
+  // Clear the column accumulators.
+  for (int i = 0; i < limbs; i++) {
+    __ xorq(c[i], c[i]);
+  }
+  __ mov64(mask, MASK);
+
+  // Perform high/low multiplication with signed 5x51 bit limbs
+  for (int i = 0; i < limbs; i++) {
+    __ movq(aArg, Address(aLimbs, i * 8));
+    __ movq(rax, aArg);
+    __ imulq(aArg);   // rdx:rax = a[i] * a[i] (diagonal term, counted once)
+    __ movq(d, rax);
+    __ andq(d, mask);
+    __ shrq(rax, bpl);
+    __ shlq(rdx, rem);
+    __ orq(rax, rdx); // rax = dd
+    if ((i * 2 + 1) >= limbs) {
+      __ imulq(rax, rax, term);
+    }
+    if ((i * 2) >= limbs) {
+      __ imulq(d, d, term);
+    }
+    __ addq(c[(i * 2) % limbs], d);
+    __ addq(c[(i * 2 + 1) % limbs], rax);
+    for (int j = i + 1; j < limbs; j++) {
+      __ movq(rax, Address(aLimbs, j * 8));
+      __ imulq(aArg);   // rdx:rax = a[j] * a[i] (cross term)
+      __ movq(d, rax);
+      __ andq(d, mask);
+      __ shlq(d, 1);    // doubled: only j > i is visited, so each cross term stands for two products
+      __ shrq(rax, bpl);
+      __ shlq(rdx, rem);
+      __ orq(rax, rdx); // rax = dd
+      __ shlq(rax, 1);
+      if ((j + i + 1) >= limbs) {
+        __ imulq(rax, rax, term);
+      }
+      if ((j + i) >= limbs) {
+        __ imulq(d, d, term);
+      }
+      __ addq(c[(i + j) % limbs], d);
+      __ addq(c[(i + j + 1) % limbs], rax);
+    }
+  }
+
+  // Carry-add with reduction from high limb
+  // Limb 3
+  __ mov64(mask, CARRY_ADD);  // mask is reused to hold the carry rounding constant
+  __ movq(carry, mask);
+  __ addq(carry, c[3]);
+  __ sarq(carry, bpl);
+  __ addq(c[4], carry);
+  __ shlq(carry, bpl);
+  __ subq(c[3], carry);
+
+  // Limb 4
+  __ movq(carry, mask);
+  __ addq(carry, c[4]);
+  __ sarq(carry, bpl);
+
+  // Reduce high order limb and fold back into low order limb
+  __ mov64(rax, term);
+  __ imulq(carry);  // rax = term * carry (rdx is overwritten as well)
+  __ addq(c[0], rax);
+
+  __ shlq(carry, bpl);
+  __ subq(c[4], carry);
+
+  // Limbs 0 - 3
+  for (int i = 0; i < (limbs - 1); i++) {
+    __ movq(carry, mask);
+    __ addq(carry, c[i]);
+    __ sarq(carry, bpl);
+    __ addq(c[i + 1], carry);
+    __ shlq(carry, bpl);
+    __ subq(c[i], carry);
+  }
+
+  __ pop_ppx(rdx);  // recover the rdx pushed by the caller: rLimbs may be rdx and imulq clobbered it
+
+  for (int i = 0; i < limbs; i++) {
+    __ movq(Address(rLimbs, i * 8), c[i]);
+  }
+}
+// Generates the stub behind the intpoly_mult_25519 intrinsic: r = a * b on three limb-array pointers.
+address StubGenerator::generate_intpoly_mult_25519() {
+  StubId stub_id = StubId::stubgen_intpoly_mult_25519_id;
+  int entry_count = StubInfo::entry_count(stub_id);
+  assert(entry_count == 1, "sanity check");
+  address start = load_archive_data(stub_id);
+  if (start != nullptr) {
+    return start;  // load_archive_data() supplied an entry point; nothing is generated
+  }
+  __ align(CodeEntryAlignment);
+  StubCodeMark mark(this, stub_id);
+  start = __ pc();
+  __ enter();
+
+  // Register Map
+  const Register aLimbs  = c_rarg0; // rdi | rcx
+  const Register bLimbs  = c_rarg1; // rsi | rdx
+  const Register rLimbs  = c_rarg2; // rdx | r8
+
+  Register c[]   = {r9, r10, r11, r12, r13};
+  Register bArg  = r14;
+  Register d     = r15;
+  Register b     = rbp;
+  Register mask  = rbx;
+
+  // Save the registers used as scratch below; rdx goes last because the helper pops it itself.
+  __ push_ppx(rbp);
+  __ push_ppx(rbx);
+  __ push_ppx(r12);
+  __ push_ppx(r13);
+  __ push_ppx(r14);
+  __ push_ppx(r15);
+  __ push_ppx(rdx);  // imulq overwrites rdx, which may hold rLimbs (see register map)
+
+  multiply_25519_scalar(aLimbs, bLimbs, rLimbs, c, bArg, d, b, mask, _masm);
+
+  // No pop_ppx(rdx) here: multiply_25519_scalar already popped it before storing the result.
+  __ pop_ppx(r15);
+  __ pop_ppx(r14);
+  __ pop_ppx(r13);
+  __ pop_ppx(r12);
+  __ pop_ppx(rbx);
+  __ pop_ppx(rbp);
+
+  __ leave();
+  __ ret(0);
+
+  // Record the stub entry and end
+  store_archive_data(stub_id, start, __ pc());
+
+  return start;
+}
+// Generates the stub behind the intpoly_square_25519 intrinsic: r = a * a on two limb-array pointers.
+address StubGenerator::generate_intpoly_square_25519() {
+  StubId stub_id = StubId::stubgen_intpoly_square_25519_id;
+  int entry_count = StubInfo::entry_count(stub_id);
+  assert(entry_count == 1, "sanity check");
+  address start = load_archive_data(stub_id);
+  if (start != nullptr) {
+    return start;  // load_archive_data() supplied an entry point; nothing is generated
+  }
+  __ align(CodeEntryAlignment);
+  StubCodeMark mark(this, stub_id);
+  start = __ pc();
+  __ enter();
+
+  // Register Map
+  const Register aLimbs  = c_rarg0; // rdi | rcx
+  const Register rLimbs  = c_rarg1; // rsi | rdx
+  Register c[]   = {r9, r10, r11, r12, r13};
+  Register aArg  = r14;
+  Register d     = r15;
+  Register carry = rbp;
+  Register mask  = rbx;
+
+  // Save the registers used as scratch below; rdx goes last because the helper pops it itself.
+  __ push_ppx(rbp);
+  __ push_ppx(rbx);
+  __ push_ppx(r12);
+  __ push_ppx(r13);
+  __ push_ppx(r14);
+  __ push_ppx(r15);
+  __ push_ppx(rdx);  // imulq overwrites rdx, which may hold rLimbs (see register map)
+
+  square_25519_scalar(aLimbs, rLimbs, c, aArg, d, carry, mask, _masm);
+
+  // No pop_ppx(rdx) here: square_25519_scalar already popped it before storing the result.
+  __ pop_ppx(r15);
+  __ pop_ppx(r14);
+  __ pop_ppx(r13);
+  __ pop_ppx(r12);
+  __ pop_ppx(rbx);
+  __ pop_ppx(rbp);
+
+  __ leave();
+  __ ret(0);
+
+  // Record the stub entry and end
+  store_archive_data(stub_id, start, __ pc());
+
+  return start;
+}
+#undef __
diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64_poly_mont.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64_poly_mont.cpp
index 308a8042993..76b6fa97fa5 100644
--- a/src/hotspot/cpu/x86/stubGenerator_x86_64_poly_mont.cpp
+++ b/src/hotspot/cpu/x86/stubGenerator_x86_64_poly_mont.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, 2025, Intel Corporation. All rights reserved.
+ * Copyright (c) 2024, 2026, Intel Corporation. All rights reserved.
  *
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
@@ -676,7 +676,7 @@ address StubGenerator::generate_intpoly_assign() {
   // KNOWN Lengths:
   //   MontgomeryIntPolynP256:  5 = 4 + 1
   //   IntegerPolynomial1305:   5 = 4 + 1
-  //   IntegerPolynomial25519: 10 = 8 + 2
+  //   IntegerPolynomial25519:  5 = 4 + 1
   //   IntegerPolynomialP256:  10 = 8 + 2
   //   Curve25519OrderField:   10 = 8 + 2
   //   Curve25519OrderField:   10 = 8 + 2
diff --git a/src/hotspot/cpu/x86/vm_version_x86.cpp b/src/hotspot/cpu/x86/vm_version_x86.cpp
index 4cdcb1770bb..2ca1c172542 100644
--- a/src/hotspot/cpu/x86/vm_version_x86.cpp
+++ b/src/hotspot/cpu/x86/vm_version_x86.cpp
@@ -1407,6 +1407,10 @@ void VM_Version::get_processor_features() {
     FLAG_SET_DEFAULT(UseIntPolyIntrinsics, false);
   }
 
+  if (FLAG_IS_DEFAULT(UseIntPoly25519Intrinsics)) {
+    UseIntPoly25519Intrinsics = true;
+  }
+
   if (FLAG_IS_DEFAULT(UseMultiplyToLenIntrinsic)) {
     UseMultiplyToLenIntrinsic = true;
   }
diff --git a/src/hotspot/share/classfile/vmIntrinsics.cpp b/src/hotspot/share/classfile/vmIntrinsics.cpp
index cec3586a50b..4a1b9ead116 100644
--- a/src/hotspot/share/classfile/vmIntrinsics.cpp
+++ b/src/hotspot/share/classfile/vmIntrinsics.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -527,6 +527,10 @@ bool vmIntrinsics::disabled_by_jvm_flags(vmIntrinsics::ID id) {
   case vmIntrinsics::_intpoly_assign:
     if (!UseIntPolyIntrinsics) return true;
     break;
+  case vmIntrinsics::_intpoly_mult_25519:
+  case vmIntrinsics::_intpoly_square_25519:
+    if (!UseIntPoly25519Intrinsics) return true;
+    break;
   case vmIntrinsics::_updateBytesCRC32C:
   case vmIntrinsics::_updateDirectByteBufferCRC32C:
     if (!UseCRC32CIntrinsics) return true;
diff --git a/src/hotspot/share/classfile/vmIntrinsics.hpp b/src/hotspot/share/classfile/vmIntrinsics.hpp
index de4eea669a1..8833e4167f6 100644
--- a/src/hotspot/share/classfile/vmIntrinsics.hpp
+++ b/src/hotspot/share/classfile/vmIntrinsics.hpp
@@ -549,6 +549,13 @@ class methodHandle;
    do_name(intPolyAssign_name, "conditionalAssign")                                                                     \
    do_signature(intPolyAssign_signature, "(I[J[J)V")                                                                    \
                                                                                                                         \
+  /* support for sun.security.util.math.intpoly.IntegerPolynomial25519 */                                               \
+  do_class(sun_security_util_math_intpoly_IntegerPolynomial25519, "sun/security/util/math/intpoly/IntegerPolynomial25519") \
+  do_intrinsic(_intpoly_mult_25519, sun_security_util_math_intpoly_IntegerPolynomial25519, intPolyMult_name, intPolyMult_signature, F_R) \
+  do_intrinsic(_intpoly_square_25519, sun_security_util_math_intpoly_IntegerPolynomial25519, intPolySquare_name, intPolySquare_signature, F_R) \
+  do_name(intPolySquare_name, "square")                                                                                  \
+  do_signature(intPolySquare_signature, "([J[J)V")                                                                       \
+                                                                                                                         \
   /* support for java.util.Base64.Encoder*/                                                                             \
   do_class(java_util_Base64_Encoder, "java/util/Base64$Encoder")                                                        \
   do_intrinsic(_base64_encodeBlock, java_util_Base64_Encoder, encodeBlock_name, encodeBlock_signature, F_R)             \
diff --git a/src/hotspot/share/opto/c2compiler.cpp b/src/hotspot/share/opto/c2compiler.cpp
index 2f48fffcaa2..bf434030499 100644
--- a/src/hotspot/share/opto/c2compiler.cpp
+++ b/src/hotspot/share/opto/c2compiler.cpp
@@ -819,6 +819,8 @@ bool C2Compiler::is_intrinsic_supported(vmIntrinsics::ID id) {
   case vmIntrinsics::_poly1305_processBlocks:
   case vmIntrinsics::_intpoly_montgomeryMult_P256:
   case vmIntrinsics::_intpoly_assign:
+  case vmIntrinsics::_intpoly_mult_25519:
+  case vmIntrinsics::_intpoly_square_25519:
   case vmIntrinsics::_updateCRC32:
   case vmIntrinsics::_updateBytesCRC32:
   case vmIntrinsics::_updateByteBufferCRC32:
diff --git a/src/hotspot/share/opto/escape.cpp b/src/hotspot/share/opto/escape.cpp
index 49e59c70c47..f561818a99b 100644
--- a/src/hotspot/share/opto/escape.cpp
+++ b/src/hotspot/share/opto/escape.cpp
@@ -2272,6 +2272,8 @@ void ConnectionGraph::process_call_arguments(CallNode *call) {
                   strcmp(call->as_CallLeaf()->_name, "poly1305_processBlocks") == 0 ||
                   strcmp(call->as_CallLeaf()->_name, "intpoly_montgomeryMult_P256") == 0 ||
                   strcmp(call->as_CallLeaf()->_name, "intpoly_assign") == 0 ||
+                  strcmp(call->as_CallLeaf()->_name, "intpoly_mult_25519") == 0 ||
+                  strcmp(call->as_CallLeaf()->_name, "intpoly_square_25519") == 0 ||
                   strcmp(call->as_CallLeaf()->_name, "ghash_processBlocks") == 0 ||
                   strcmp(call->as_CallLeaf()->_name, "chacha20Block") == 0 ||
                   strcmp(call->as_CallLeaf()->_name, "kyberNtt") == 0 ||
diff --git a/src/hotspot/share/opto/library_call.cpp b/src/hotspot/share/opto/library_call.cpp
index 7251783d771..adb8ff2dedb 100644
--- a/src/hotspot/share/opto/library_call.cpp
+++ b/src/hotspot/share/opto/library_call.cpp
@@ -666,6 +666,10 @@ bool LibraryCallKit::try_to_inline(int predicate) {
     return inline_intpoly_montgomeryMult_P256();
   case vmIntrinsics::_intpoly_assign:
     return inline_intpoly_assign();
+  case vmIntrinsics::_intpoly_mult_25519:
+    return inline_intpoly_mult_25519();
+  case vmIntrinsics::_intpoly_square_25519:
+    return inline_intpoly_square_25519();
   case vmIntrinsics::_encodeISOArray:
   case vmIntrinsics::_encodeByteISOArray:
     return inline_encodeISOArray(false);
@@ -8373,6 +8377,70 @@ bool LibraryCallKit::inline_intpoly_assign() {
   return true;
 }
 
+bool LibraryCallKit::inline_intpoly_mult_25519() {  // replaces mult(a, b, r) with a call to the generated stub
+  address stubAddr;
+  const char *stubName;
+  assert(UseIntPoly25519Intrinsics, "need intpoly25519 intrinsics support");
+  assert(callee()->signature()->size() == 3, "intpoly_mult_25519 has %d parameters", callee()->signature()->size());
+  stubAddr = StubRoutines::intpoly_mult_25519();
+  stubName = "intpoly_mult_25519";
+  // Without a stub address there is nothing to call, so report that nothing was inlined.
+  if (!stubAddr) return false;
+  null_check_receiver();  // null-check receiver
+  if (stopped())  return true;
+  // argument(0) is the receiver; the three array operands follow it.
+  Node* a = argument(1);
+  Node* b = argument(2);
+  Node* r = argument(3);
+
+  a = must_be_not_null(a, true);
+  b = must_be_not_null(b, true);
+  r = must_be_not_null(r, true);
+  // The stub is handed the address of element 0 of each long array.
+  Node* a_start = array_element_address(a, intcon(0), T_LONG);
+  assert(a_start, "a array is null");
+  Node* b_start = array_element_address(b, intcon(0), T_LONG);
+  assert(b_start, "b array is null");
+  Node* r_start = array_element_address(r, intcon(0), T_LONG);
+  assert(r_start, "r array is null");
+  // Leaf runtime call (RC_LEAF | RC_NO_FP); the returned node is not used further.
+  Node* call = make_runtime_call(RC_LEAF | RC_NO_FP,
+                                 OptoRuntime::intpoly_mult_25519_Type(),
+                                 stubAddr, stubName, TypePtr::BOTTOM,
+                                 a_start, b_start, r_start);
+  return true;
+}
+
+bool LibraryCallKit::inline_intpoly_square_25519() {  // replaces square(a, r) with a call to the generated stub
+  address stubAddr;
+  const char *stubName;
+  assert(UseIntPoly25519Intrinsics, "need intpoly25519 intrinsics support");
+  assert(callee()->signature()->size() == 2, "intpoly_square_25519 has %d parameters", callee()->signature()->size());
+  stubAddr = StubRoutines::intpoly_square_25519();
+  stubName = "intpoly_square_25519";
+  // Without a stub address there is nothing to call, so report that nothing was inlined.
+  if (!stubAddr) return false;
+  null_check_receiver();  // null-check receiver
+  if (stopped())  return true;
+  // argument(0) is the receiver; the two array operands follow it.
+  Node* a = argument(1);
+  Node* r = argument(2);
+
+  a = must_be_not_null(a, true);
+  r = must_be_not_null(r, true);
+  // The stub is handed the address of element 0 of each long array.
+  Node* a_start = array_element_address(a, intcon(0), T_LONG);
+  assert(a_start, "a array is null");
+  Node* r_start = array_element_address(r, intcon(0), T_LONG);
+  assert(r_start, "r array is null");
+  // Leaf runtime call (RC_LEAF | RC_NO_FP); the returned node is not used further.
+  Node* call = make_runtime_call(RC_LEAF | RC_NO_FP,
+                                 OptoRuntime::intpoly_square_25519_Type(),
+                                 stubAddr, stubName, TypePtr::BOTTOM,
+                                 a_start, r_start);
+  return true;
+}
+
 //------------------------------inline_digestBase_implCompress-----------------------
 //
 // Calculate MD5 for single-block byte[] array.
diff --git a/src/hotspot/share/opto/library_call.hpp b/src/hotspot/share/opto/library_call.hpp
index 5b46ae832a4..871a6b0d072 100644
--- a/src/hotspot/share/opto/library_call.hpp
+++ b/src/hotspot/share/opto/library_call.hpp
@@ -343,6 +343,8 @@ class LibraryCallKit : public GraphKit {
   bool inline_poly1305_processBlocks();
   bool inline_intpoly_montgomeryMult_P256();
   bool inline_intpoly_assign();
+  bool inline_intpoly_mult_25519();
+  bool inline_intpoly_square_25519();
   bool inline_digestBase_implCompress(vmIntrinsics::ID id);
   bool inline_keccak(vmIntrinsics::ID id);
   bool inline_digestBase_implCompressMB(int predicate);
diff --git a/src/hotspot/share/opto/runtime.cpp b/src/hotspot/share/opto/runtime.cpp
index 1afffcadd6e..7f791082b65 100644
--- a/src/hotspot/share/opto/runtime.cpp
+++ b/src/hotspot/share/opto/runtime.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998, 2025, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1998, 2026, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -237,6 +237,8 @@ const TypeFunc* OptoRuntime::_string_IndexOf_Type                 = nullptr;
 const TypeFunc* OptoRuntime::_poly1305_processBlocks_Type         = nullptr;
 const TypeFunc* OptoRuntime::_intpoly_montgomeryMult_P256_Type    = nullptr;
 const TypeFunc* OptoRuntime::_intpoly_assign_Type                 = nullptr;
+const TypeFunc* OptoRuntime::_intpoly_mult_25519_Type             = nullptr;
+const TypeFunc* OptoRuntime::_intpoly_square_25519_Type           = nullptr;
 const TypeFunc* OptoRuntime::_updateBytesCRC32_Type               = nullptr;
 const TypeFunc* OptoRuntime::_updateBytesCRC32C_Type              = nullptr;
 const TypeFunc* OptoRuntime::_updateBytesAdler32_Type             = nullptr;
@@ -1786,6 +1788,41 @@ static const TypeFunc* make_intpoly_assign_Type() {
   return TypeFunc::make(domain, range);
 }
 
+static const TypeFunc* make_intpoly_mult_25519_Type() {  // (a, b, r) -> void, all three non-null pointers
+  int argcnt = 3;
+  // Domain: one NOTNULL pointer slot per array argument, starting at TypeFunc::Parms.
+  const Type** fields = TypeTuple::fields(argcnt);
+  int argp = TypeFunc::Parms;
+  fields[argp++] = TypePtr::NOTNULL;    // a array
+  fields[argp++] = TypePtr::NOTNULL;    // b array
+  fields[argp++] = TypePtr::NOTNULL;    // r(esult) array
+  assert(argp == TypeFunc::Parms + argcnt, "correct decoding");
+  const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms+argcnt, fields);
+
+  // result type needed
+  fields = TypeTuple::fields(1);
+  fields[TypeFunc::Parms + 0] = nullptr; // void
+  const TypeTuple* range = TypeTuple::make(TypeFunc::Parms, fields);
+  return TypeFunc::make(domain, range);
+}
+// Builds the call signature for the intpoly_square_25519 stub: (a, r) -> void, both non-null pointers.
+static const TypeFunc* make_intpoly_square_25519_Type() {
+  int argcnt = 2;
+  // Domain: one NOTNULL pointer slot per array argument, starting at TypeFunc::Parms.
+  const Type** fields = TypeTuple::fields(argcnt);
+  int argp = TypeFunc::Parms;
+  fields[argp++] = TypePtr::NOTNULL;    // a array
+  fields[argp++] = TypePtr::NOTNULL;    // r(esult) array
+  assert(argp == TypeFunc::Parms + argcnt, "correct decoding");
+  const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms+argcnt, fields);
+
+  // result type needed
+  fields = TypeTuple::fields(1);
+  fields[TypeFunc::Parms + 0] = nullptr; // void
+  const TypeTuple* range = TypeTuple::make(TypeFunc::Parms, fields);
+  return TypeFunc::make(domain, range);
+}
+
 //------------- Interpreter state for on stack replacement
 static const TypeFunc* make_osr_end_Type() {
   // create input type (domain)
@@ -2354,6 +2391,8 @@ void OptoRuntime::initialize_types() {
   _poly1305_processBlocks_Type        = make_poly1305_processBlocks_Type();
   _intpoly_montgomeryMult_P256_Type   = make_intpoly_montgomeryMult_P256_Type();
   _intpoly_assign_Type                = make_intpoly_assign_Type();
+  _intpoly_mult_25519_Type            = make_intpoly_mult_25519_Type();
+  _intpoly_square_25519_Type          = make_intpoly_square_25519_Type();
   _updateBytesCRC32_Type              = make_updateBytesCRC32_Type();
   _updateBytesCRC32C_Type             = make_updateBytesCRC32C_Type();
   _updateBytesAdler32_Type            = make_updateBytesAdler32_Type();
diff --git a/src/hotspot/share/opto/runtime.hpp b/src/hotspot/share/opto/runtime.hpp
index af8a206e10c..5802bf59ae5 100644
--- a/src/hotspot/share/opto/runtime.hpp
+++ b/src/hotspot/share/opto/runtime.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998, 2025, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1998, 2026, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -190,6 +190,8 @@ class OptoRuntime : public AllStatic {
   static const TypeFunc* _poly1305_processBlocks_Type;
   static const TypeFunc* _intpoly_montgomeryMult_P256_Type;
   static const TypeFunc* _intpoly_assign_Type;
+  static const TypeFunc* _intpoly_mult_25519_Type;
+  static const TypeFunc* _intpoly_square_25519_Type;
   static const TypeFunc* _updateBytesCRC32_Type;
   static const TypeFunc* _updateBytesCRC32C_Type;
   static const TypeFunc* _updateBytesAdler32_Type;
@@ -687,6 +689,18 @@ private:
     return _intpoly_assign_Type;
   }
 
+  // Call signature of the IntegerPolynomial25519 multiply stub (a, b, r); asserts it has been initialized
+  static inline const TypeFunc* intpoly_mult_25519_Type() {
+    assert(_intpoly_mult_25519_Type != nullptr, "should be initialized");
+    return _intpoly_mult_25519_Type;
+  }
+
+  // Call signature of the IntegerPolynomial25519 square stub (a, r); asserts it has been initialized
+  static inline const TypeFunc* intpoly_square_25519_Type() {
+    assert(_intpoly_square_25519_Type != nullptr, "should be initialized");
+    return _intpoly_square_25519_Type;
+  }
+
   /**
    * int updateBytesCRC32(int crc, byte* b, int len)
    */
diff --git a/src/hotspot/share/runtime/globals.hpp b/src/hotspot/share/runtime/globals.hpp
index f90a644eaa4..ec34305f837 100644
--- a/src/hotspot/share/runtime/globals.hpp
+++ b/src/hotspot/share/runtime/globals.hpp
@@ -229,9 +229,13 @@ const int ObjectAlignmentInBytes = 8;
                                                                             \
   product(bool, UsePoly1305Intrinsics, false, DIAGNOSTIC,                   \
           "Use intrinsics for sun.security.util.math.intpoly")              \
-  product(bool, UseIntPolyIntrinsics, false, DIAGNOSTIC,                   \
+                                                                            \
+  product(bool, UseIntPolyIntrinsics, false, DIAGNOSTIC,                    \
           "Use intrinsics for sun.security.util.math.intpoly.MontgomeryIntegerPolynomialP256") \
                                                                             \
+  product(bool, UseIntPoly25519Intrinsics, false, DIAGNOSTIC,               \
+          "Use intrinsics for sun.security.util.math.intpoly.IntegerPolynomial25519") \
+                                                                            \
   product(size_t, LargePageSizeInBytes, 0,                                  \
           "Maximum large page size used (0 will use the default large "     \
           "page size for the environment as the maximum) "                  \
diff --git a/src/hotspot/share/runtime/stubDeclarations.hpp b/src/hotspot/share/runtime/stubDeclarations.hpp
index bef6a0c27f0..5c3567eb0c0 100644
--- a/src/hotspot/share/runtime/stubDeclarations.hpp
+++ b/src/hotspot/share/runtime/stubDeclarations.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2025, 2026, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2025, Red Hat, Inc. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
@@ -801,6 +801,12 @@
            intpoly_montgomeryMult_P256, intpoly_montgomeryMult_P256)    \
   do_stub(compiler, intpoly_assign)                                     \
   do_entry(compiler, intpoly_assign, intpoly_assign, intpoly_assign)    \
+  do_stub(compiler, intpoly_mult_25519)                                 \
+  do_entry(compiler, intpoly_mult_25519,                                \
+           intpoly_mult_25519, intpoly_mult_25519)                      \
+  do_stub(compiler, intpoly_square_25519)                               \
+  do_entry(compiler, intpoly_square_25519,                              \
+           intpoly_square_25519, intpoly_square_25519)                  \
   do_stub(compiler, md5_implCompress)                                   \
   do_entry(compiler, md5_implCompress, md5_implCompress,                \
            md5_implCompress)                                            \
diff --git a/src/java.base/share/classes/sun/security/util/math/intpoly/IntegerPolynomial25519.java b/src/java.base/share/classes/sun/security/util/math/intpoly/IntegerPolynomial25519.java
index c8f23da417e..b7b1ddae0e0 100644
--- a/src/java.base/share/classes/sun/security/util/math/intpoly/IntegerPolynomial25519.java
+++ b/src/java.base/share/classes/sun/security/util/math/intpoly/IntegerPolynomial25519.java
@@ -26,6 +26,7 @@
 package sun.security.util.math.intpoly;
 
 import java.math.BigInteger;
+import jdk.internal.vm.annotation.IntrinsicCandidate;
 
 public final class IntegerPolynomial25519 extends IntegerPolynomial {
     private static final int BITS_PER_LIMB = 51;
@@ -235,6 +236,7 @@ public final class IntegerPolynomial25519 extends IntegerPolynomial {
      * @param b [in] the limb operand to multiply.
      * @param r [out] the product of the limbs operands that is fully reduced.
      */
+    @IntrinsicCandidate
     protected void mult(long[] a, long[] b, long[] r) {
         long aa0 = a[0];
         long aa1 = a[1];
@@ -414,6 +416,7 @@ public final class IntegerPolynomial25519 extends IntegerPolynomial {
      * @param a [in] the limb operand to square.
      * @param r [out] the resulting square of the limb which is fully reduced.
      */
+    @IntrinsicCandidate
     protected void square(long[] a, long[] r) {
         long aa0 = a[0];
         long aa1 = a[1];