added comments to p256 intrinsics, fixed error message in umullv instruction

2026-04-05 04:31:36 +00:00 · 2026-02-10 22:57:50 -05:00 · 2026-02-10 22:57:50 -05:00 · e70dc14e26
commit e70dc14e26
parent 05925eaaa4
2 changed files with 17 additions and 6 deletions
--- a/src/hotspot/cpu/aarch64/assembler_aarch64.hpp
+++ b/src/hotspot/cpu/aarch64/assembler_aarch64.hpp
@ -3158,8 +3158,8 @@ private:
    int q = (Tb == T4H || Tb == T2S) ? 0 : 1;
    int h = (size == 0b01) ? ((lane >> 2) & 1) : ((lane >> 1) & 1);
    int l = (size == 0b01) ? ((lane >> 1) & 1) : (lane & 1);
-    assert(size == 0b10 ? lane < 4 : lane < 8, "umullv assumes lane < 4 when using half-words and lane < 8 otherwise");
-    assert(Ts == H ? Vm->encoding() < 16 : Vm->encoding() < 32, "umullv requires Vm to be in range V0..V15 when Ts is H");
+    assert(size == 0b10 ? lane < 4 : lane < 8, "umull{2}v assumes lane < 8 when using half-words and lane < 4 otherwise");
+    assert(Ts == H ? Vm->encoding() < 16 : Vm->encoding() < 32, "umull{2}v requires Vm to be in range V0..V15 when Ts is H");
    f(0, 31), f(q, 30), f(0b101111, 29, 24), f(size, 23, 22), f(l, 21); //f(m, 20);
    rf(Vm, 16), f(0b1010, 15, 12), f(h, 11), f(0, 10), rf(Vn, 5), rf(Vd, 0);
  }
@ -3185,8 +3185,8 @@ public:

  void umull2v(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn,
               SIMD_Arrangement Tb, FloatRegister Vm, SIMD_RegVariant Ts, int lane) {
-    assert(Ta == T4S || Ta == T2D, "umullv destination register must have arrangement T4S or T2D");
-    assert(Ta == T4S ? (Tb == T8H && Ts == H) : (Tb == T4S && Ts == S), "umullv register arrangements must adhere to spec");
+    assert(Ta == T4S || Ta == T2D, "umull2v destination register must have arrangement T4S or T2D");
+    assert(Ta == T4S ? (Tb == T8H && Ts == H) : (Tb == T4S && Ts == S), "umull2v register arrangements must adhere to spec");
    _umullv(Vd, Ta, Vn, Tb, Vm, Ts, lane);
  }

--- a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp
+++ b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp
@ -7189,8 +7189,13 @@ class StubGenerator: public StubCodeGenerator {
    return start;
  }

-  // Multiply each 32-bit value in bs by the 32-bit values in as[lane_lo] and as[lane_lo + 2]
-  // and store in vs.
+  // Subroutine used by the 64-bit multiplication algorithm in generate_intpoly_montgomeryMult_P256().
+  // This function computes partial results of eight 52 x 52-bit multiplications where the
+  // multiplicands are stored as 64-bit values, specifically (b_0, b_1, b_2, b_3) * (a_3, a_4).
+  // In a call to this function, either the high or low 32 bits of the b_i values are multiplied
+  // by either the high or low 32 bits of the a_j values, so four calls with the appropriate
+  // parameters will produce the 64-bit low32 * low32, low32 * high32, high32 * low32, and
+  // high32 * high32 values in the output register sequences.
  void neon_partial_mult_64(const VSeq<4>& vs, FloatRegister bs, FloatRegister as, int lane_lo) {

    __ umullv(vs[0], __ T2D, bs, __ T2S, as, __ S, lane_lo);
@ -7200,6 +7205,12 @@ class StubGenerator: public StubCodeGenerator {

  }

+  // This assembly follows the Java code in MontgomeryIntegerPolynomial256.mult() quite closely.
+  // The main difference is that the computations done with the last two limbs of `a` are
+  // done using Neon registers. This allows us to take advantage of both the Neon registers and
+  // GPRs simultaneously. It is also worth noting that since Neon does not support 64-bit
+  // multiplication, we split each 64-bit value into lower and upper halves and use the
+  // "schoolbook" multiplication algorithm.
  address generate_intpoly_montgomeryMult_P256() {

    __ align(CodeEntryAlignment);