Added comments to P256 intrinsics; fixed assertion messages in the umullv/umull2v instructions

This commit is contained in:
Ben Perez 2026-02-10 22:57:50 -05:00
parent 05925eaaa4
commit e70dc14e26
2 changed files with 17 additions and 6 deletions

View File

@ -3158,8 +3158,8 @@ private:
int q = (Tb == T4H || Tb == T2S) ? 0 : 1;
int h = (size == 0b01) ? ((lane >> 2) & 1) : ((lane >> 1) & 1);
int l = (size == 0b01) ? ((lane >> 1) & 1) : (lane & 1);
assert(size == 0b10 ? lane < 4 : lane < 8, "umullv assumes lane < 4 when using half-words and lane < 8 otherwise");
assert(Ts == H ? Vm->encoding() < 16 : Vm->encoding() < 32, "umullv requires Vm to be in range V0..V15 when Ts is H");
assert(size == 0b10 ? lane < 4 : lane < 8, "umull{2}v assumes lane < 4 when using half-words and lane < 8 otherwise");
assert(Ts == H ? Vm->encoding() < 16 : Vm->encoding() < 32, "umull{2}v requires Vm to be in range V0..V15 when Ts is H");
f(0, 31), f(q, 30), f(0b101111, 29, 24), f(size, 23, 22), f(l, 21); //f(m, 20);
rf(Vm, 16), f(0b1010, 15, 12), f(h, 11), f(0, 10), rf(Vn, 5), rf(Vd, 0);
}
@ -3185,8 +3185,8 @@ public:
void umull2v(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn,
SIMD_Arrangement Tb, FloatRegister Vm, SIMD_RegVariant Ts, int lane) {
assert(Ta == T4S || Ta == T2D, "umullv destination register must have arrangement T4S or T2D");
assert(Ta == T4S ? (Tb == T8H && Ts == H) : (Tb == T4S && Ts == S), "umullv register arrangements must adhere to spec");
assert(Ta == T4S || Ta == T2D, "umull2v destination register must have arrangement T4S or T2D");
assert(Ta == T4S ? (Tb == T8H && Ts == H) : (Tb == T4S && Ts == S), "umull2v register arrangements must adhere to spec");
_umullv(Vd, Ta, Vn, Tb, Vm, Ts, lane);
}

View File

@ -7189,8 +7189,13 @@ class StubGenerator: public StubCodeGenerator {
return start;
}
// Multiply each 32-bit value in bs by the 32-bit values in as[lane_lo] and as[lane_lo + 2]
// and store in vs.
// Subroutine used by the 64-bit multiplication algorithm in generate_intpoly_montgomeryMult_P256().
// This function computes partial results of eight 52 x 52 bit multiplications whose
// multiplicands are stored as 64-bit values, specifically (b_0, b_1, b_2, b_3) * (a_3, a_4).
// Each call multiplies either the high or the low 32 bits of the b_i values by either the
// high or the low 32 bits of the a_j values, so four calls with the appropriate parameters
// produce the 64-bit low32 * low32, low32 * high32, high32 * low32, and high32 * high32
// partial products in the output register sequences.
void neon_partial_mult_64(const VSeq<4>& vs, FloatRegister bs, FloatRegister as, int lane_lo) {
__ umullv(vs[0], __ T2D, bs, __ T2S, as, __ S, lane_lo);
@ -7200,6 +7205,12 @@ class StubGenerator: public StubCodeGenerator {
}
// This assembly follows the Java code in MontgomeryIntegerPolynomial256.mult() quite closely.
// The main difference is that the computations involving the last two limbs of `a` are
// done using Neon registers, which lets us use the Neon registers and the GPRs
// simultaneously. Note also that, since Neon does not support 64-bit multiplication,
// we split each 64-bit value into lower and upper halves and use the "schoolbook"
// multiplication algorithm.
address generate_intpoly_montgomeryMult_P256() {
__ align(CodeEntryAlignment);