From e70dc14e2643bd0ef706da5eacf98c83aaa0d860 Mon Sep 17 00:00:00 2001 From: Ben Perez Date: Tue, 10 Feb 2026 22:57:50 -0500 Subject: [PATCH] added comments to p256 intrinsics, fixed error message in umullv instruction --- src/hotspot/cpu/aarch64/assembler_aarch64.hpp | 8 ++++---- src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp | 15 +++++++++++++-- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/src/hotspot/cpu/aarch64/assembler_aarch64.hpp b/src/hotspot/cpu/aarch64/assembler_aarch64.hpp index 036dc053291..ec0b1f4fef2 100644 --- a/src/hotspot/cpu/aarch64/assembler_aarch64.hpp +++ b/src/hotspot/cpu/aarch64/assembler_aarch64.hpp @@ -3158,8 +3158,8 @@ private: int q = (Tb == T4H || Tb == T2S) ? 0 : 1; int h = (size == 0b01) ? ((lane >> 2) & 1) : ((lane >> 1) & 1); int l = (size == 0b01) ? ((lane >> 1) & 1) : (lane & 1); - assert(size == 0b10 ? lane < 4 : lane < 8, "umullv assumes lane < 4 when using half-words and lane < 8 otherwise"); - assert(Ts == H ? Vm->encoding() < 16 : Vm->encoding() < 32, "umullv requires Vm to be in range V0..V15 when Ts is H"); + assert(size == 0b10 ? lane < 4 : lane < 8, "umull{2}v assumes lane < 4 when using half-words and lane < 8 otherwise"); + assert(Ts == H ? Vm->encoding() < 16 : Vm->encoding() < 32, "umull{2}v requires Vm to be in range V0..V15 when Ts is H"); f(0, 31), f(q, 30), f(0b101111, 29, 24), f(size, 23, 22), f(l, 21); //f(m, 20); rf(Vm, 16), f(0b1010, 15, 12), f(h, 11), f(0, 10), rf(Vn, 5), rf(Vd, 0); } @@ -3185,8 +3185,8 @@ public: void umull2v(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn, SIMD_Arrangement Tb, FloatRegister Vm, SIMD_RegVariant Ts, int lane) { - assert(Ta == T4S || Ta == T2D, "umullv destination register must have arrangement T4S or T2D"); - assert(Ta == T4S ? (Tb == T8H && Ts == H) : (Tb == T4S && Ts == S), "umullv register arrangements must adhere to spec"); + assert(Ta == T4S || Ta == T2D, "umull2v destination register must have arrangement T4S or T2D"); + assert(Ta == T4S ? 
(Tb == T8H && Ts == H) : (Tb == T4S && Ts == S), "umull2v register arrangements must adhere to spec"); _umullv(Vd, Ta, Vn, Tb, Vm, Ts, lane); } diff --git a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp index 70f2dee54e0..84bc630c150 100644 --- a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp @@ -7189,8 +7189,13 @@ class StubGenerator: public StubCodeGenerator { return start; } - // Multiply each 32-bit value in bs by the 32-bit values in as[lane_lo] and as[lane_lo + 2] - // and store in vs. + // Subroutine used by the 64-bit multiplication algorithm in generate_intpoly_montgomeryMult_P256(). + // This function computes partial results of eight 52 x 52 bit multiplications where the + // multiplicands are stored as 64-bit values, specifically (b_0, b_1, b_2, b_3) * (a_3, a_4). + // In a call to this function, either the high or low 32 bits of the b_i values are multiplied + // by either the high or low 32 bits of the a_j values, so four calls with the appropriate + // parameters will produce the 64-bit low32 * low32, low32 * high32, high32 * low32, high32 * high32 + // values in the output register sequences. void neon_partial_mult_64(const VSeq<4>& vs, FloatRegister bs, FloatRegister as, int lane_lo) { __ umullv(vs[0], __ T2D, bs, __ T2S, as, __ S, lane_lo); @@ -7200,6 +7205,12 @@ class StubGenerator: public StubCodeGenerator { } + // This assembly follows the Java code in MontgomeryIntegerPolynomial256.mult() quite closely. + // The main difference is that the computations done with the last two limbs of `a` are + // done using Neon registers. This allows us to take advantage of both the Neon registers and + // GPRs simultaneously. It is also worth noting that since Neon does not support 64-bit multiplication, + // we split each 64-bit value into lower and upper halves and use the "schoolbook" multiplication + // algorithm. 
address generate_intpoly_montgomeryMult_P256() { __ align(CodeEntryAlignment);