mirror of
https://github.com/openjdk/jdk.git
synced 2026-07-02 15:20:27 +00:00
8384847: Fix documentation typos around ML-KEM and ML-DSA intrinsic code for aarch64
Reviewed-by: adinn, aph
This commit is contained in:
parent
acbedab719
commit
e356cbb395
@ -4977,7 +4977,7 @@ class StubGenerator: public StubCodeGenerator {
|
||||
return start;
|
||||
}
|
||||
// Implements the double_keccak() method of the
|
||||
// sun.secyrity.provider.SHA3Parallel class
|
||||
// sun.security.provider.SHA3Parallel class
|
||||
__ align(CodeEntryAlignment);
|
||||
StubCodeMark mark(this, stub_id);
|
||||
start = __ pc();
|
||||
@ -5045,7 +5045,8 @@ class StubGenerator: public StubCodeGenerator {
|
||||
__ ldpd(v8, v9, __ post(sp, 64));
|
||||
|
||||
__ leave(); // required for proper stackwalking of RuntimeStub frame
|
||||
__ mov(r0, zr); // return 0
|
||||
|
||||
__ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
|
||||
__ ret(lr);
|
||||
|
||||
// record the stub entry and end
|
||||
@ -5458,7 +5459,7 @@ class StubGenerator: public StubCodeGenerator {
|
||||
|
||||
// load N/2 pairs of quadword values from memory into N vector
|
||||
// registers via the address supplied in base with each pair indexed
|
||||
// using the the start offset plus the corresponding entry in the
|
||||
// using the start offset plus the corresponding entry in the
|
||||
// offsets array
|
||||
template<int N>
|
||||
void vs_ldpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
|
||||
@ -5469,7 +5470,7 @@ class StubGenerator: public StubCodeGenerator {
|
||||
|
||||
// store N vector registers into N/2 pairs of quadword memory
|
||||
// locations via the address supplied in base with each pair indexed
|
||||
// using the the start offset plus the corresponding entry in the
|
||||
// using the start offset plus the corresponding entry in the
|
||||
// offsets array
|
||||
template<int N>
|
||||
void vs_stpq_indexed(const VSeq<N>& v, Register base, int start, int offsets[N/2]) {
|
||||
@ -5480,7 +5481,7 @@ class StubGenerator: public StubCodeGenerator {
|
||||
|
||||
// load N single quadword values from memory into N vector registers
|
||||
// via the address supplied in base with each value indexed using
|
||||
// the the start offset plus the corresponding entry in the offsets
|
||||
// the start offset plus the corresponding entry in the offsets
|
||||
// array
|
||||
template<int N>
|
||||
void vs_ldr_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
|
||||
@ -5492,7 +5493,7 @@ class StubGenerator: public StubCodeGenerator {
|
||||
|
||||
// store N vector registers into N single quadword memory locations
|
||||
// via the address supplied in base with each value indexed using
|
||||
// the the start offset plus the corresponding entry in the offsets
|
||||
// the start offset plus the corresponding entry in the offsets
|
||||
// array
|
||||
template<int N>
|
||||
void vs_str_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
|
||||
@ -5504,7 +5505,7 @@ class StubGenerator: public StubCodeGenerator {
|
||||
|
||||
// load N/2 pairs of quadword values from memory de-interleaved into
|
||||
// N vector registers 2 at a time via the address supplied in base
|
||||
// with each pair indexed using the the start offset plus the
|
||||
// with each pair indexed using the start offset plus the
|
||||
// corresponding entry in the offsets array
|
||||
template<int N>
|
||||
void vs_ld2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
|
||||
@ -5517,7 +5518,7 @@ class StubGenerator: public StubCodeGenerator {
|
||||
|
||||
// store N vector registers 2 at a time interleaved into N/2 pairs
|
||||
// of quadword memory locations via the address supplied in base
|
||||
// with each pair indexed using the the start offset plus the
|
||||
// with each pair indexed using the start offset plus the
|
||||
// corresponding entry in the offsets array
|
||||
template<int N>
|
||||
void vs_st2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
|
||||
@ -5776,7 +5777,7 @@ class StubGenerator: public StubCodeGenerator {
|
||||
// registers.
|
||||
// 3. In the seilerNTT() method we use R = 2^20 for the Montgomery
|
||||
// multiplications (this is because that way there should not be any
|
||||
// overflow during the inverse NTT computation), here we usr R = 2^16 so
|
||||
// overflow during the inverse NTT computation), here we use R = 2^16 so
|
||||
// that we can use the 16-bit arithmetic in the vector unit.
|
||||
//
|
||||
// On each level, we fill up the vector registers in such a way that the
|
||||
@ -5898,7 +5899,7 @@ class StubGenerator: public StubCodeGenerator {
|
||||
|
||||
// level 4
|
||||
// At level 4 coefficients occur in 8 discrete blocks of size 16
|
||||
// so they are loaded using employing an ldr at 8 distinct offsets.
|
||||
// so they are loaded by employing an ldr at 8 distinct offsets.
|
||||
|
||||
vs_ldpq(vq, kyberConsts);
|
||||
int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
|
||||
@ -5954,7 +5955,6 @@ class StubGenerator: public StubCodeGenerator {
|
||||
kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
|
||||
vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
|
||||
vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
|
||||
// __ ldpq(v18, v19, __ post(zetas, 32));
|
||||
load32shorts(vs_front(vs2), zetas);
|
||||
kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
|
||||
vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
|
||||
@ -5970,7 +5970,7 @@ class StubGenerator: public StubCodeGenerator {
|
||||
vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
|
||||
|
||||
__ leave(); // required for proper stackwalking of RuntimeStub frame
|
||||
__ mov(r0, zr); // return 0
|
||||
__ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
|
||||
__ ret(lr);
|
||||
|
||||
// record the stub entry and end
|
||||
@ -6067,7 +6067,7 @@ class StubGenerator: public StubCodeGenerator {
|
||||
|
||||
// level 2
|
||||
// At level 2 coefficients occur in 8 discrete blocks of size 16
|
||||
// so they are loaded using employing an ldr at 8 distinct offsets.
|
||||
// so they are loaded by employing an ldr at 8 distinct offsets.
|
||||
|
||||
int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
|
||||
vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
|
||||
@ -6262,7 +6262,7 @@ class StubGenerator: public StubCodeGenerator {
|
||||
store64shorts(vs2, tmpAddr);
|
||||
|
||||
__ leave(); // required for proper stackwalking of RuntimeStub frame
|
||||
__ mov(r0, zr); // return 0
|
||||
__ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
|
||||
__ ret(lr);
|
||||
|
||||
// record the stub entry and end
|
||||
@ -6407,7 +6407,7 @@ class StubGenerator: public StubCodeGenerator {
|
||||
__ br(Assembler::NE, kyberNttMult_loop);
|
||||
|
||||
__ leave(); // required for proper stackwalking of RuntimeStub frame
|
||||
__ mov(r0, zr); // return 0
|
||||
__ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
|
||||
__ ret(lr);
|
||||
|
||||
// record the stub entry and end
|
||||
@ -6499,7 +6499,7 @@ class StubGenerator: public StubCodeGenerator {
|
||||
}
|
||||
|
||||
__ leave(); // required for proper stackwalking of RuntimeStub frame
|
||||
__ mov(r0, zr); // return 0
|
||||
__ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
|
||||
__ ret(lr);
|
||||
|
||||
// record the stub entry and end
|
||||
@ -6606,7 +6606,7 @@ class StubGenerator: public StubCodeGenerator {
|
||||
}
|
||||
|
||||
__ leave(); // required for proper stackwalking of RuntimeStub frame
|
||||
__ mov(r0, zr); // return 0
|
||||
__ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
|
||||
__ ret(lr);
|
||||
|
||||
// record the stub entry and end
|
||||
@ -6692,8 +6692,8 @@ class StubGenerator: public StubCodeGenerator {
|
||||
// twice, one copy manipulated to provide the lower 4 bits
|
||||
// belonging to the first short in a pair and another copy
|
||||
// manipulated to provide the higher 4 bits belonging to the
|
||||
// second short in a pair. This is why the the vector sequences va
|
||||
// and vb used to hold the expanded 8H elements are of length 8.
|
||||
// second short in a pair. This is why the vector sequences va
|
||||
// and vb used to hold the expanded 8H elements are of length 8.
|
||||
|
||||
// Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
|
||||
// n.b. target elements 2 and 3 duplicate elements 4 and 5
|
||||
@ -6763,7 +6763,7 @@ class StubGenerator: public StubCodeGenerator {
|
||||
__ br(Assembler::GT, L_loop);
|
||||
|
||||
__ leave(); // required for proper stackwalking of RuntimeStub frame
|
||||
__ mov(r0, zr); // return 0
|
||||
__ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
|
||||
__ ret(lr);
|
||||
|
||||
// bind label and generate constant data used by this stub
|
||||
@ -6869,7 +6869,7 @@ class StubGenerator: public StubCodeGenerator {
|
||||
}
|
||||
|
||||
__ leave(); // required for proper stackwalking of RuntimeStub frame
|
||||
__ mov(r0, zr); // return 0
|
||||
__ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
|
||||
__ ret(lr);
|
||||
|
||||
// record the stub entry and end
|
||||
@ -6943,7 +6943,7 @@ class StubGenerator: public StubCodeGenerator {
|
||||
vs_addv(va0, __ T4S, va0, vc);
|
||||
}
|
||||
|
||||
// Perform combined add/sub then montul on 4x4S vectors.
|
||||
// Perform combined add/sub then montmul on 4x4S vectors.
|
||||
void dilithium_sub_add_montmul16(
|
||||
const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vb,
|
||||
const VSeq<4>& vtmp1, const VSeq<4>& vtmp2, const VSeq<2>& vq) {
|
||||
@ -7079,7 +7079,7 @@ class StubGenerator: public StubCodeGenerator {
|
||||
// coefficients we load 4 adjacent values at 8 different offsets
|
||||
// using an indexed ldr with register variant Q and multiply them
|
||||
// in sequence order by the next set of inputs. Likewise we store
|
||||
// the resuls using an indexed str with register variant Q.
|
||||
// the results using an indexed str with register variant Q.
|
||||
for (int i = 0; i < 1024; i += 256) {
|
||||
// reload constants q, qinv each iteration as they get clobbered later
|
||||
vs_ldpq(vq, dilithiumConsts); // qInv, q
|
||||
@ -7129,11 +7129,11 @@ class StubGenerator: public StubCodeGenerator {
|
||||
|
||||
// level 7
|
||||
// At level 7 the coefficients we need to combine with the zetas
|
||||
// occur singly with montmul inputs alterating with add/sub
|
||||
// occur singly with montmul inputs alternating with add/sub
|
||||
// inputs. Once again we can use 4-way parallelism to combine 16
|
||||
// zetas at a time. However, we have to load 8 adjacent values at
|
||||
// 4 different offsets using an ld2 load with arrangement 4S. That
|
||||
// interleaves the the odd words of each pair into one
|
||||
// interleaves the odd words of each pair into one
|
||||
// coefficients vector register and the even words of the pair
|
||||
// into the next register. We then need to montmul the 4 even
|
||||
// elements of the coefficients register sequence by the zetas in
|
||||
@ -7155,7 +7155,7 @@ class StubGenerator: public StubCodeGenerator {
|
||||
vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
|
||||
}
|
||||
__ leave(); // required for proper stackwalking of RuntimeStub frame
|
||||
__ mov(r0, zr); // return 0
|
||||
__ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
|
||||
__ ret(lr);
|
||||
|
||||
// record the stub entry and end
|
||||
@ -7334,7 +7334,7 @@ class StubGenerator: public StubCodeGenerator {
|
||||
// c0 load 32 (8x4S) coefficients via first offsets
|
||||
vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
|
||||
// c1 load 32 (8x4S) coefficients via second offsets
|
||||
vs_ldr_indexed(vs2, __ Q,coeffs, i, offsets2);
|
||||
vs_ldr_indexed(vs2, __ Q, coeffs, i, offsets2);
|
||||
// a0 = c0 + c1 n.b. clobbers vq which overlaps vs3
|
||||
vs_addv(vs3, __ T4S, vs1, vs2);
|
||||
// c = c0 - c1
|
||||
@ -7355,7 +7355,7 @@ class StubGenerator: public StubCodeGenerator {
|
||||
dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas);
|
||||
|
||||
__ leave(); // required for proper stackwalking of RuntimeStub frame
|
||||
__ mov(r0, zr); // return 0
|
||||
__ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
|
||||
__ ret(lr);
|
||||
|
||||
// record the stub entry and end
|
||||
@ -7367,8 +7367,8 @@ class StubGenerator: public StubCodeGenerator {
|
||||
// Dilithium multiply polynomials in the NTT domain.
|
||||
// Straightforward implementation of the method
|
||||
// static int implDilithiumNttMult(
|
||||
// int[] result, int[] ntta, int[] nttb {} of
|
||||
// the sun.security.provider.ML_DSA class.
|
||||
// int[] product, int[] coeffs1, int[] coeffs2) {}
|
||||
// of the sun.security.provider.ML_DSA class.
|
||||
//
|
||||
// result (int[256]) = c_rarg0
|
||||
// poly1 (int[256]) = c_rarg1
|
||||
@ -7429,7 +7429,7 @@ class StubGenerator: public StubCodeGenerator {
|
||||
__ br(Assembler::GE, L_loop);
|
||||
|
||||
__ leave(); // required for proper stackwalking of RuntimeStub frame
|
||||
__ mov(r0, zr); // return 0
|
||||
__ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
|
||||
__ ret(lr);
|
||||
|
||||
// record the stub entry and end
|
||||
@ -7438,10 +7438,10 @@ class StubGenerator: public StubCodeGenerator {
|
||||
return start;
|
||||
}
|
||||
|
||||
// Dilithium Motgomery multiply an array by a constant.
|
||||
// Dilithium Montgomery multiply an array by a constant.
|
||||
// A straightforward implementation of the method
|
||||
// static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {}
|
||||
// of the sun.security.provider.MLDSA class
|
||||
// of the sun.security.provider.ML_DSA class
|
||||
//
|
||||
// coeffs (int[256]) = c_rarg0
|
||||
// constant (int) = c_rarg1
|
||||
@ -7498,7 +7498,7 @@ class StubGenerator: public StubCodeGenerator {
|
||||
__ br(Assembler::GE, L_loop);
|
||||
|
||||
__ leave(); // required for proper stackwalking of RuntimeStub frame
|
||||
__ mov(r0, zr); // return 0
|
||||
__ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
|
||||
__ ret(lr);
|
||||
|
||||
// record the stub entry and end
|
||||
@ -7509,7 +7509,8 @@ class StubGenerator: public StubCodeGenerator {
|
||||
|
||||
// Dilithium decompose poly.
|
||||
// Implements the method
|
||||
// static int implDilithiumDecomposePoly(int[] coeffs, int constant) {}
|
||||
// static int implDilithiumDecomposePoly(int[] input, int[] lowPart, int[] highPart,
|
||||
// int twoGamma2, int multiplier) {}
|
||||
// of the sun.security.provider.ML_DSA class
|
||||
//
|
||||
// input (int[256]) = c_rarg0
|
||||
@ -7613,7 +7614,7 @@ class StubGenerator: public StubCodeGenerator {
|
||||
vs_andr(vtmp, vs4, twog2);
|
||||
vs_subv(vs3, __ T4S, vs3, vtmp);
|
||||
|
||||
// quotient += (mask & 1);
|
||||
// quotient += (mask & 1);
|
||||
vs_andr(vtmp, vs4, one);
|
||||
vs_addv(vs2, __ T4S, vs2, vtmp);
|
||||
|
||||
@ -7647,7 +7648,7 @@ class StubGenerator: public StubCodeGenerator {
|
||||
// r1 = r1 & quotient;
|
||||
vs_andr(vs1, vs2, vs1);
|
||||
|
||||
// store results inteleaved
|
||||
// store results interleaved
|
||||
// lowPart[m] = r0;
|
||||
// highPart[m] = r1;
|
||||
__ st4(vs3[0], vs3[1], vs3[2], vs3[3], __ T4S, __ post(lowPart, 64));
|
||||
@ -7664,7 +7665,7 @@ class StubGenerator: public StubCodeGenerator {
|
||||
__ ldpd(v8, v9, __ post(sp, 64));
|
||||
|
||||
__ leave(); // required for proper stackwalking of RuntimeStub frame
|
||||
__ mov(r0, zr); // return 0
|
||||
__ mov(r0, zr); // return 0 (Java callees return 1. Caller ignores the return value)
|
||||
__ ret(lr);
|
||||
|
||||
// record the stub entry and end
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user