From e70dc14e2643bd0ef706da5eacf98c83aaa0d860 Mon Sep 17 00:00:00 2001 From: Ben Perez Date: Tue, 10 Feb 2026 22:57:50 -0500 Subject: [PATCH] added comments to p256 intrinsics, fixed error message in umullv instruction --- src/hotspot/cpu/aarch64/assembler_aarch64.hpp | 8 ++++---- src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp | 15 +++++++++++++-- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/src/hotspot/cpu/aarch64/assembler_aarch64.hpp b/src/hotspot/cpu/aarch64/assembler_aarch64.hpp index 036dc053291..ec0b1f4fef2 100644 --- a/src/hotspot/cpu/aarch64/assembler_aarch64.hpp +++ b/src/hotspot/cpu/aarch64/assembler_aarch64.hpp @@ -3158,8 +3158,8 @@ private: int q = (Tb == T4H || Tb == T2S) ? 0 : 1; int h = (size == 0b01) ? ((lane >> 2) & 1) : ((lane >> 1) & 1); int l = (size == 0b01) ? ((lane >> 1) & 1) : (lane & 1); - assert(size == 0b10 ? lane < 4 : lane < 8, "umullv assumes lane < 4 when using half-words and lane < 8 otherwise"); - assert(Ts == H ? Vm->encoding() < 16 : Vm->encoding() < 32, "umullv requires Vm to be in range V0..V15 when Ts is H"); + assert(size == 0b10 ? lane < 4 : lane < 8, "umull{2}v assumes lane < 4 when using half-words and lane < 8 otherwise"); + assert(Ts == H ? Vm->encoding() < 16 : Vm->encoding() < 32, "umull{2}v requires Vm to be in range V0..V15 when Ts is H"); f(0, 31), f(q, 30), f(0b101111, 29, 24), f(size, 23, 22), f(l, 21); //f(m, 20); rf(Vm, 16), f(0b1010, 15, 12), f(h, 11), f(0, 10), rf(Vn, 5), rf(Vd, 0); } @@ -3185,8 +3185,8 @@ public: void umull2v(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn, SIMD_Arrangement Tb, FloatRegister Vm, SIMD_RegVariant Ts, int lane) { - assert(Ta == T4S || Ta == T2D, "umullv destination register must have arrangement T4S or T2D"); - assert(Ta == T4S ? (Tb == T8H && Ts == H) : (Tb == T4S && Ts == S), "umullv register arrangements must adhere to spec"); + assert(Ta == T4S || Ta == T2D, "umull2v destination register must have arrangement T4S or T2D"); + assert(Ta == T4S ? 
(Tb == T8H && Ts == H) : (Tb == T4S && Ts == S), "umull2v register arrangements must adhere to spec"); _umullv(Vd, Ta, Vn, Tb, Vm, Ts, lane); } diff --git a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp index 70f2dee54e0..84bc630c150 100644 --- a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp @@ -7189,8 +7189,13 @@ class StubGenerator: public StubCodeGenerator { return start; } - // Multiply each 32-bit value in bs by the 32-bit values in as[lane_lo] and as[lane_lo + 2] - // and store in vs. + // Subroutine used by the 64-bit multiplication algorithm in generate_intpoly_montgomeryMult_P256(). + // This function computes partial results of eight 52 x 52 bit multiplications where the + // multiplicands are stored as 64-bit values, specifically (b_0, b_1, b_2, b_3) * (a_3, a_4). + // In a call to this function, either the high or low 32 bits of the b_i values are multiplied + // by either the high or low 32 bits of the a_j values, so four calls with the appropriate + // parameters will produce the 64-bit low32 * low32, low32 * high32, high32 * low32, high32 * high32 + // values in the output register sequences. void neon_partial_mult_64(const VSeq<4>& vs, FloatRegister bs, FloatRegister as, int lane_lo) { __ umullv(vs[0], __ T2D, bs, __ T2S, as, __ S, lane_lo); @@ -7200,6 +7205,12 @@ class StubGenerator: public StubCodeGenerator { } + // This assembly follows the Java code in MontgomeryIntegerPolynomial256.mult() quite closely. + // The main difference is that the computations done with the last two limbs of `a` are + // done using Neon registers. This allows us to take advantage of both the Neon registers and + // GPRs simultaneously. It is also worth noting that since Neon does not support 64-bit multiplication, + // we split each 64-bit value into lower and upper halves and use the "schoolbook" multiplication + // algorithm. 
address generate_intpoly_montgomeryMult_P256() { __ align(CodeEntryAlignment);